{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.35832011699676514, "epoch": 0.002, "frac_reward_zero_std": 0.0, "grad_norm": 1.264101505279541, "kl": 0.0, "learning_rate": 0.0, "loss": 0.2758, "num_tokens": 5417.0, "reward": 0.4775000214576721, "reward_std": 0.5056283473968506, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5403900742530823, "sampling/importance_sampling_ratio/max": 2.4071154594421387, "sampling/importance_sampling_ratio/mean": 1.1429595947265625, "sampling/importance_sampling_ratio/min": 0.5015585422515869, "sampling/sampling_logp_difference/max": 0.5305562019348145, "sampling/sampling_logp_difference/mean": 0.024324804544448853, "step": 1, "step_time": 30.05394913199416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3473261594772339, "epoch": 0.004, "frac_reward_zero_std": 0.0, "grad_norm": 3.3787319660186768, "kl": 0.0, "learning_rate": 1.6666666666666668e-07, "loss": 0.2918, "num_tokens": 11253.0, "reward": 0.581250011920929, "reward_std": 0.5712425708770752, "rewards/reward_func/mean": 0.581250011920929, "rewards/reward_func/std": 0.5513473749160767, "sampling/importance_sampling_ratio/max": 2.3380353450775146, "sampling/importance_sampling_ratio/mean": 1.2109484672546387, "sampling/importance_sampling_ratio/min": 0.4137703776359558, "sampling/sampling_logp_difference/max": 0.6683757305145264, "sampling/sampling_logp_difference/mean": 0.024658963084220886, "step": 2, "step_time": 40.791004868005984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.360579252243042, "epoch": 0.006, "frac_reward_zero_std": 0.0, "grad_norm": 1.022918939590454, "kl": 0.0012595340376719832, "learning_rate": 3.3333333333333335e-07, "loss": -0.118, "num_tokens": 16681.0, "reward": 0.48250001668930054, "reward_std": 0.5084458589553833, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5402578711509705, "sampling/importance_sampling_ratio/max": 1.7170885801315308, "sampling/importance_sampling_ratio/mean": 0.9650065898895264, "sampling/importance_sampling_ratio/min": 0.30409955978393555, "sampling/sampling_logp_difference/max": 0.5745421648025513, "sampling/sampling_logp_difference/mean": 0.02655378170311451, "step": 3, "step_time": 32.76647898698866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3717118501663208, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 1.620926856994629, "kl": 0.0016124368412420154, "learning_rate": 5.000000000000001e-07, "loss": -0.3319, "num_tokens": 22512.0, "reward": 0.3187499940395355, "reward_std": 0.576077938079834, "rewards/reward_func/mean": 0.3187499940395355, "rewards/reward_func/std": 0.5602917075157166, "sampling/importance_sampling_ratio/max": 2.1794252395629883, "sampling/importance_sampling_ratio/mean": 1.2134031057357788, "sampling/importance_sampling_ratio/min": 0.6474471092224121, "sampling/sampling_logp_difference/max": 0.5795614719390869, "sampling/sampling_logp_difference/mean": 0.025603361427783966, "step": 4, "step_time": 37.759238163998816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3655037581920624, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 1.429198980331421, "kl": 0.0023652869276702404, "learning_rate": 6.666666666666667e-07, "loss": 0.1995, "num_tokens": 28617.0, "reward": 0.17875000834465027, "reward_std": 0.5367715358734131, "rewards/reward_func/mean": 0.17875000834465027, "rewards/reward_func/std": 0.4974919259548187, "sampling/importance_sampling_ratio/max": 2.4826161861419678, "sampling/importance_sampling_ratio/mean": 1.161120057106018, "sampling/importance_sampling_ratio/min": 0.5131281018257141, "sampling/sampling_logp_difference/max": 0.5112643241882324, "sampling/sampling_logp_difference/mean": 0.024323755875229836, "step": 5, "step_time": 41.24092036399816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3876664638519287, "epoch": 0.012, "frac_reward_zero_std": 0.0, "grad_norm": 1.3607584238052368, "kl": 0.001689540920779109, "learning_rate": 8.333333333333333e-07, "loss": 0.0571, "num_tokens": 33384.0, "reward": 0.5862500071525574, "reward_std": 0.5741689205169678, "rewards/reward_func/mean": 0.5862500071525574, "rewards/reward_func/std": 0.5580050945281982, "sampling/importance_sampling_ratio/max": 1.6102370023727417, "sampling/importance_sampling_ratio/mean": 0.898230254650116, "sampling/importance_sampling_ratio/min": 0.34930747747421265, "sampling/sampling_logp_difference/max": 0.7642672061920166, "sampling/sampling_logp_difference/mean": 0.027652274817228317, "step": 6, "step_time": 23.575158892999752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3462250828742981, "epoch": 0.014, "frac_reward_zero_std": 0.0, "grad_norm": 1.7692639827728271, "kl": 0.0016474956646561623, "learning_rate": 1.0000000000000002e-06, "loss": -0.1133, "num_tokens": 39130.0, "reward": 0.16500000655651093, "reward_std": 0.5287132263183594, "rewards/reward_func/mean": 0.16500000655651093, "rewards/reward_func/std": 0.4902477562427521, "sampling/importance_sampling_ratio/max": 1.69410240650177, "sampling/importance_sampling_ratio/mean": 1.0078184604644775, "sampling/importance_sampling_ratio/min": 0.47111791372299194, "sampling/sampling_logp_difference/max": 0.5242133140563965, "sampling/sampling_logp_difference/mean": 0.01969078555703163, "step": 7, "step_time": 42.84424216199841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3769228458404541, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 1.4515846967697144, "kl": 0.0016455797012895346, "learning_rate": 1.1666666666666668e-06, "loss": -0.0421, "num_tokens": 45078.0, "reward": 0.09000000357627869, "reward_std": 0.27620843052864075, "rewards/reward_func/mean": 0.09000000357627869, "rewards/reward_func/std": 0.36974895000457764, "sampling/importance_sampling_ratio/max": 1.7868865728378296, "sampling/importance_sampling_ratio/mean": 1.0823638439178467, "sampling/importance_sampling_ratio/min": 0.3364071249961853, "sampling/sampling_logp_difference/max": 0.5305700898170471, "sampling/sampling_logp_difference/mean": 0.02302919700741768, "step": 8, "step_time": 35.18679962800525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.29516422748565674, "epoch": 0.018, "frac_reward_zero_std": 0.0, "grad_norm": 1.408183217048645, "kl": 0.0017756135202944279, "learning_rate": 1.3333333333333334e-06, "loss": -0.4256, "num_tokens": 50752.0, "reward": 0.30375000834465027, "reward_std": 0.5890097618103027, "rewards/reward_func/mean": 0.30375000834465027, "rewards/reward_func/std": 0.5595645308494568, "sampling/importance_sampling_ratio/max": 2.3854382038116455, "sampling/importance_sampling_ratio/mean": 0.949848473072052, "sampling/importance_sampling_ratio/min": 0.43223410844802856, "sampling/sampling_logp_difference/max": 0.3985975682735443, "sampling/sampling_logp_difference/mean": 0.024290772154927254, "step": 9, "step_time": 36.36361191300966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.625, "completions/mean_terminated_length": 51.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3654099106788635, "epoch": 0.02, "frac_reward_zero_std": 0.0, "grad_norm": 2.196986675262451, "kl": 0.0032762186601758003, "learning_rate": 1.5e-06, "loss": -0.2107, "num_tokens": 56409.0, "reward": 0.09375, "reward_std": 0.2810794711112976, "rewards/reward_func/mean": 0.09375, "rewards/reward_func/std": 0.36730435490608215, "sampling/importance_sampling_ratio/max": 2.5383903980255127, "sampling/importance_sampling_ratio/mean": 1.4091622829437256, "sampling/importance_sampling_ratio/min": 0.6962175369262695, "sampling/sampling_logp_difference/max": 1.2695891857147217, "sampling/sampling_logp_difference/mean": 0.030169054865837097, "step": 10, "step_time": 38.47457867600315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.4065442681312561, "epoch": 0.022, "frac_reward_zero_std": 0.0, "grad_norm": 1.2501963376998901, "kl": 0.0023037088103592396, "learning_rate": 1.6666666666666667e-06, "loss": -0.0843, "num_tokens": 62619.0, "reward": 0.07124999910593033, "reward_std": 0.29781630635261536, "rewards/reward_func/mean": 0.07124999910593033, "rewards/reward_func/std": 0.37745150923728943, "sampling/importance_sampling_ratio/max": 1.785474419593811, "sampling/importance_sampling_ratio/mean": 0.9539740085601807, "sampling/importance_sampling_ratio/min": 0.431792289018631, "sampling/sampling_logp_difference/max": 0.6463680267333984, "sampling/sampling_logp_difference/mean": 0.023907780647277832, "step": 11, "step_time": 36.200093806008226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 50.625, "completions/mean_terminated_length": 50.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.35928064584732056, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 1.0825868844985962, "kl": 0.0015529417432844639, "learning_rate": 1.8333333333333333e-06, "loss": 0.1996, "num_tokens": 68221.0, "reward": 0.08125000447034836, "reward_std": 0.2829767167568207, "rewards/reward_func/mean": 0.08125000447034836, "rewards/reward_func/std": 0.3724604547023773, "sampling/importance_sampling_ratio/max": 1.8231761455535889, "sampling/importance_sampling_ratio/mean": 1.0552520751953125, "sampling/importance_sampling_ratio/min": 0.6413195133209229, "sampling/sampling_logp_difference/max": 0.5809029340744019, "sampling/sampling_logp_difference/mean": 0.02717738226056099, "step": 12, "step_time": 36.93798936299572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3927002549171448, "epoch": 0.026, "frac_reward_zero_std": 0.0, "grad_norm": 1.864451289176941, "kl": 0.0021730177104473114, "learning_rate": 2.0000000000000003e-06, "loss": -0.2762, "num_tokens": 74060.0, "reward": 0.08249999582767487, "reward_std": 0.2659415602684021, "rewards/reward_func/mean": 0.08249999582767487, "rewards/reward_func/std": 0.35664108395576477, "sampling/importance_sampling_ratio/max": 2.3068079948425293, "sampling/importance_sampling_ratio/mean": 1.3477238416671753, "sampling/importance_sampling_ratio/min": 0.39379340410232544, "sampling/sampling_logp_difference/max": 0.6892986297607422, "sampling/sampling_logp_difference/mean": 0.028836481273174286, "step": 13, "step_time": 39.853118643004564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.40448594093322754, "epoch": 0.028, "frac_reward_zero_std": 0.0, "grad_norm": 2.6799466609954834, "kl": 0.001972722355276346, "learning_rate": 2.166666666666667e-06, "loss": -0.0182, "num_tokens": 78889.0, "reward": 0.3199999928474426, "reward_std": 0.5358837246894836, "rewards/reward_func/mean": 0.3199999928474426, "rewards/reward_func/std": 0.5208784341812134, "sampling/importance_sampling_ratio/max": 1.972301721572876, "sampling/importance_sampling_ratio/mean": 1.1966495513916016, "sampling/importance_sampling_ratio/min": 0.5895167589187622, "sampling/sampling_logp_difference/max": 0.33179569244384766, "sampling/sampling_logp_difference/mean": 0.02415962889790535, "step": 14, "step_time": 30.755012333000195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3787211775779724, "epoch": 0.03, "frac_reward_zero_std": 0.0, "grad_norm": 1.1619508266448975, "kl": 0.0027514053508639336, "learning_rate": 2.3333333333333336e-06, "loss": 0.1111, "num_tokens": 84325.0, "reward": 0.48250001668930054, "reward_std": 0.5952338576316833, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5513554811477661, "sampling/importance_sampling_ratio/max": 1.4301223754882812, "sampling/importance_sampling_ratio/mean": 0.8711071610450745, "sampling/importance_sampling_ratio/min": 0.36811545491218567, "sampling/sampling_logp_difference/max": 0.8878096342086792, "sampling/sampling_logp_difference/mean": 0.026348719373345375, "step": 15, "step_time": 28.58861476900347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3328179121017456, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.6959343552589417, "kl": 0.001039550406858325, "learning_rate": 2.5e-06, "loss": -0.0017, "num_tokens": 89747.0, "reward": 0.0912499949336052, "reward_std": 0.2814132273197174, "rewards/reward_func/mean": 0.0912499949336052, "rewards/reward_func/std": 0.3689536452293396, "sampling/importance_sampling_ratio/max": 1.175565481185913, "sampling/importance_sampling_ratio/mean": 0.8306176066398621, "sampling/importance_sampling_ratio/min": 0.3712834417819977, "sampling/sampling_logp_difference/max": 0.5475239753723145, "sampling/sampling_logp_difference/mean": 0.024504121392965317, "step": 16, "step_time": 36.591242304988555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3186699450016022, "epoch": 0.034, "frac_reward_zero_std": 0.0, "grad_norm": 1.1346180438995361, "kl": 0.0015763898845762014, "learning_rate": 2.666666666666667e-06, "loss": -0.1755, "num_tokens": 95132.0, "reward": 0.4337500035762787, "reward_std": 0.08031108975410461, "rewards/reward_func/mean": 0.4337500035762787, "rewards/reward_func/std": 0.548945426940918, "sampling/importance_sampling_ratio/max": 1.9401581287384033, "sampling/importance_sampling_ratio/mean": 0.9301601648330688, "sampling/importance_sampling_ratio/min": 0.5294641852378845, "sampling/sampling_logp_difference/max": 0.3347742557525635, "sampling/sampling_logp_difference/mean": 0.018799789249897003, "step": 17, "step_time": 44.30997213399678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 58.125, "completions/mean_terminated_length": 58.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.41471022367477417, "epoch": 0.036, "frac_reward_zero_std": 0.0, "grad_norm": 1.5832308530807495, "kl": 0.0018035045359283686, "learning_rate": 2.8333333333333335e-06, "loss": 0.1381, "num_tokens": 101131.0, "reward": 0.32875001430511475, "reward_std": 0.5735915303230286, "rewards/reward_func/mean": 0.32875001430511475, "rewards/reward_func/std": 0.548646092414856, "sampling/importance_sampling_ratio/max": 2.193835496902466, "sampling/importance_sampling_ratio/mean": 1.2385116815567017, "sampling/importance_sampling_ratio/min": 0.7738122344017029, "sampling/sampling_logp_difference/max": 0.41960763931274414, "sampling/sampling_logp_difference/mean": 0.025856416672468185, "step": 18, "step_time": 39.32068163600343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 50.625, "completions/mean_terminated_length": 50.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3718729615211487, "epoch": 0.038, "frac_reward_zero_std": 0.0, "grad_norm": 0.9075581431388855, "kl": 0.0021873265504837036, "learning_rate": 3e-06, "loss": -0.0808, "num_tokens": 106359.0, "reward": -0.04749999940395355, "reward_std": 0.04638735204935074, "rewards/reward_func/mean": -0.04749999940395355, "rewards/reward_func/std": 0.04527692496776581, "sampling/importance_sampling_ratio/max": 1.4301691055297852, "sampling/importance_sampling_ratio/mean": 0.8160060048103333, "sampling/importance_sampling_ratio/min": 0.3894173800945282, "sampling/sampling_logp_difference/max": 0.7142742872238159, "sampling/sampling_logp_difference/mean": 0.026370640844106674, "step": 19, "step_time": 38.445566442009294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.340387225151062, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 1.247363805770874, "kl": 0.001818232238292694, "learning_rate": 3.1666666666666667e-06, "loss": 0.0987, "num_tokens": 112295.0, "reward": 0.07375000417232513, "reward_std": 0.288117915391922, "rewards/reward_func/mean": 0.07375000417232513, "rewards/reward_func/std": 0.3656671941280365, "sampling/importance_sampling_ratio/max": 2.058504581451416, "sampling/importance_sampling_ratio/mean": 1.1552469730377197, "sampling/importance_sampling_ratio/min": 0.7517771124839783, "sampling/sampling_logp_difference/max": 0.4139009714126587, "sampling/sampling_logp_difference/mean": 0.02037879265844822, "step": 20, "step_time": 37.69729056301003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.37244129180908203, "epoch": 0.042, "frac_reward_zero_std": 0.0, "grad_norm": 1.0636333227157593, "kl": 0.0019398150034248829, "learning_rate": 3.3333333333333333e-06, "loss": 0.1205, "num_tokens": 118239.0, "reward": 0.20250000059604645, "reward_std": 0.3253607749938965, "rewards/reward_func/mean": 0.20250000059604645, "rewards/reward_func/std": 0.49331969022750854, "sampling/importance_sampling_ratio/max": 1.381617546081543, "sampling/importance_sampling_ratio/mean": 0.9092652201652527, "sampling/importance_sampling_ratio/min": 0.5254658460617065, "sampling/sampling_logp_difference/max": 0.706791877746582, "sampling/sampling_logp_difference/mean": 0.025023311376571655, "step": 21, "step_time": 42.413751760002924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.32273274660110474, "epoch": 0.044, "frac_reward_zero_std": 0.0, "grad_norm": 0.8321790099143982, "kl": 0.0011215595295652747, "learning_rate": 3.5e-06, "loss": 0.0122, "num_tokens": 123987.0, "reward": 0.06625001132488251, "reward_std": 0.292948454618454, "rewards/reward_func/mean": 0.06625001132488251, "rewards/reward_func/std": 0.3691278398036957, "sampling/importance_sampling_ratio/max": 1.8723653554916382, "sampling/importance_sampling_ratio/mean": 0.8478108644485474, "sampling/importance_sampling_ratio/min": 0.38013601303100586, "sampling/sampling_logp_difference/max": 0.4778859615325928, "sampling/sampling_logp_difference/mean": 0.022008519619703293, "step": 22, "step_time": 36.69069067799137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.36345064640045166, "epoch": 0.046, "frac_reward_zero_std": 0.0, "grad_norm": 0.9762702584266663, "kl": 0.0013981228694319725, "learning_rate": 3.6666666666666666e-06, "loss": 0.2781, "num_tokens": 129451.0, "reward": 0.2224999964237213, "reward_std": 0.5127884149551392, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.4748157858848572, "sampling/importance_sampling_ratio/max": 1.6413739919662476, "sampling/importance_sampling_ratio/mean": 0.99114990234375, "sampling/importance_sampling_ratio/min": 0.44171378016471863, "sampling/sampling_logp_difference/max": 0.47839784622192383, "sampling/sampling_logp_difference/mean": 0.02052409201860428, "step": 23, "step_time": 32.5673299110058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.36843228340148926, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 1.1462467908859253, "kl": 0.0015321827959269285, "learning_rate": 3.833333333333334e-06, "loss": 0.0698, "num_tokens": 135132.0, "reward": 0.23000001907348633, "reward_std": 0.5106528997421265, "rewards/reward_func/mean": 0.23000001907348633, "rewards/reward_func/std": 0.4728032052516937, "sampling/importance_sampling_ratio/max": 1.2816814184188843, "sampling/importance_sampling_ratio/mean": 0.9215267300605774, "sampling/importance_sampling_ratio/min": 0.5011075139045715, "sampling/sampling_logp_difference/max": 0.5086667537689209, "sampling/sampling_logp_difference/mean": 0.0266867745667696, "step": 24, "step_time": 32.060357182999724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3656223714351654, "epoch": 0.05, "frac_reward_zero_std": 0.0, "grad_norm": 1.6805341243743896, "kl": 0.00146653619594872, "learning_rate": 4.000000000000001e-06, "loss": -0.3181, "num_tokens": 140635.0, "reward": 0.3475000262260437, "reward_std": 0.5446269512176514, "rewards/reward_func/mean": 0.3475000262260437, "rewards/reward_func/std": 0.5270063877105713, "sampling/importance_sampling_ratio/max": 1.6357054710388184, "sampling/importance_sampling_ratio/mean": 1.019447684288025, "sampling/importance_sampling_ratio/min": 0.44408416748046875, "sampling/sampling_logp_difference/max": 0.3505585193634033, "sampling/sampling_logp_difference/mean": 0.022699594497680664, "step": 25, "step_time": 27.742008060988155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3471141457557678, "epoch": 0.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.8676110506057739, "kl": 0.0014656296698376536, "learning_rate": 4.166666666666667e-06, "loss": -0.0186, "num_tokens": 146738.0, "reward": -0.058750003576278687, "reward_std": 0.0412052683532238, "rewards/reward_func/mean": -0.058750003576278687, "rewards/reward_func/std": 0.041554611176252365, "sampling/importance_sampling_ratio/max": 1.3209142684936523, "sampling/importance_sampling_ratio/mean": 0.8133621215820312, "sampling/importance_sampling_ratio/min": 0.41941919922828674, "sampling/sampling_logp_difference/max": 0.35713934898376465, "sampling/sampling_logp_difference/mean": 0.02249450981616974, "step": 26, "step_time": 44.71595883600821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.32017290592193604, "epoch": 0.054, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609636664390564, "kl": 0.0014645641203969717, "learning_rate": 4.333333333333334e-06, "loss": 0.0422, "num_tokens": 151823.0, "reward": 0.09000000357627869, "reward_std": 0.28282541036605835, "rewards/reward_func/mean": 0.09000000357627869, "rewards/reward_func/std": 0.37028947472572327, "sampling/importance_sampling_ratio/max": 1.3593155145645142, "sampling/importance_sampling_ratio/mean": 0.9048177003860474, "sampling/importance_sampling_ratio/min": 0.6389055848121643, "sampling/sampling_logp_difference/max": 0.3579772114753723, "sampling/sampling_logp_difference/mean": 0.021750561892986298, "step": 27, "step_time": 33.81092618100229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.437652051448822, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 1.9713760614395142, "kl": 0.0016673305071890354, "learning_rate": 4.5e-06, "loss": -0.4088, "num_tokens": 157338.0, "reward": 0.3199999928474426, "reward_std": 0.5766737461090088, "rewards/reward_func/mean": 0.3199999928474426, "rewards/reward_func/std": 0.558569610118866, "sampling/importance_sampling_ratio/max": 2.007301092147827, "sampling/importance_sampling_ratio/mean": 1.1249253749847412, "sampling/importance_sampling_ratio/min": 0.32354435324668884, "sampling/sampling_logp_difference/max": 0.32819199562072754, "sampling/sampling_logp_difference/mean": 0.02238292247056961, "step": 28, "step_time": 37.15755737799918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.4016689658164978, "epoch": 0.058, "frac_reward_zero_std": 0.0, "grad_norm": 1.5781503915786743, "kl": 0.0016747142653912306, "learning_rate": 4.666666666666667e-06, "loss": 0.0816, "num_tokens": 163567.0, "reward": 0.33500000834465027, "reward_std": 0.5604823231697083, "rewards/reward_func/mean": 0.33500000834465027, "rewards/reward_func/std": 0.5377200245857239, "sampling/importance_sampling_ratio/max": 1.7749114036560059, "sampling/importance_sampling_ratio/mean": 1.0603770017623901, "sampling/importance_sampling_ratio/min": 0.6003548502922058, "sampling/sampling_logp_difference/max": 0.447523832321167, "sampling/sampling_logp_difference/mean": 0.02700149640440941, "step": 29, "step_time": 38.102109844010556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3838121294975281, "epoch": 0.06, "frac_reward_zero_std": 0.0, "grad_norm": 1.1578418016433716, "kl": 0.002017122693359852, "learning_rate": 4.833333333333333e-06, "loss": 0.0726, "num_tokens": 169243.0, "reward": 0.19749999046325684, "reward_std": 0.5363912582397461, "rewards/reward_func/mean": 0.19749999046325684, "rewards/reward_func/std": 0.49660995602607727, "sampling/importance_sampling_ratio/max": 2.339462995529175, "sampling/importance_sampling_ratio/mean": 1.108468770980835, "sampling/importance_sampling_ratio/min": 0.5691421031951904, "sampling/sampling_logp_difference/max": 0.38687825202941895, "sampling/sampling_logp_difference/mean": 0.027435339987277985, "step": 30, "step_time": 34.95135859200673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 40.625, "completions/mean_terminated_length": 40.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.33706551790237427, "epoch": 0.062, "frac_reward_zero_std": 0.0, "grad_norm": 0.9564450979232788, "kl": 0.0014677501749247313, "learning_rate": 5e-06, "loss": 0.084, "num_tokens": 175424.0, "reward": 0.32374998927116394, "reward_std": 0.5670105218887329, "rewards/reward_func/mean": 0.32374998927116394, "rewards/reward_func/std": 0.5370538234710693, "sampling/importance_sampling_ratio/max": 1.2009906768798828, "sampling/importance_sampling_ratio/mean": 0.8830825090408325, "sampling/importance_sampling_ratio/min": 0.47186899185180664, "sampling/sampling_logp_difference/max": 0.47310686111450195, "sampling/sampling_logp_difference/mean": 0.02114824578166008, "step": 31, "step_time": 39.84195031199488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 57.625, "completions/mean_terminated_length": 57.625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.35063254833221436, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.9092409610748291, "kl": 0.0016583865508437157, "learning_rate": 4.99998688809149e-06, "loss": -0.0826, "num_tokens": 180456.0, "reward": 0.08749999850988388, "reward_std": 0.2758382558822632, "rewards/reward_func/mean": 0.08749999850988388, "rewards/reward_func/std": 0.369314044713974, "sampling/importance_sampling_ratio/max": 1.4586107730865479, "sampling/importance_sampling_ratio/mean": 0.7794057130813599, "sampling/importance_sampling_ratio/min": 0.26318106055259705, "sampling/sampling_logp_difference/max": 0.4512190818786621, "sampling/sampling_logp_difference/mean": 0.02120751515030861, "step": 32, "step_time": 35.140949385997374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.37451615929603577, "epoch": 0.066, "frac_reward_zero_std": 0.0, "grad_norm": 1.266938328742981, "kl": 0.0013357822317630053, "learning_rate": 4.9999475525034974e-06, "loss": 0.2632, "num_tokens": 186220.0, "reward": -0.05249999836087227, "reward_std": 0.060625821352005005, "rewards/reward_func/mean": -0.05249999836087227, "rewards/reward_func/std": 0.06363961100578308, "sampling/importance_sampling_ratio/max": 1.8263978958129883, "sampling/importance_sampling_ratio/mean": 1.2238435745239258, "sampling/importance_sampling_ratio/min": 0.7157539129257202, "sampling/sampling_logp_difference/max": 0.33373260498046875, "sampling/sampling_logp_difference/mean": 0.021757911890745163, "step": 33, "step_time": 43.75499219499761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.35137397050857544, "epoch": 0.068, "frac_reward_zero_std": 0.0, "grad_norm": 1.322182536125183, "kl": 0.0025231819599866867, "learning_rate": 4.999881993648633e-06, "loss": -0.1529, "num_tokens": 191647.0, "reward": 0.1850000023841858, "reward_std": 0.5347095727920532, "rewards/reward_func/mean": 0.1850000023841858, "rewards/reward_func/std": 0.4952344596385956, "sampling/importance_sampling_ratio/max": 1.5585519075393677, "sampling/importance_sampling_ratio/mean": 0.9386357665061951, "sampling/importance_sampling_ratio/min": 0.322815865278244, "sampling/sampling_logp_difference/max": 0.49080967903137207, "sampling/sampling_logp_difference/mean": 0.024871371686458588, "step": 34, "step_time": 40.144638138997834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.38170647621154785, "epoch": 0.07, "frac_reward_zero_std": 0.0, "grad_norm": 1.953609585762024, "kl": 0.001635606400668621, "learning_rate": 4.99979021221458e-06, "loss": -0.0461, "num_tokens": 197510.0, "reward": 0.2150000035762787, "reward_std": 0.5176790952682495, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.47931498289108276, "sampling/importance_sampling_ratio/max": 2.3140695095062256, "sampling/importance_sampling_ratio/mean": 1.449246883392334, "sampling/importance_sampling_ratio/min": 0.5865305066108704, "sampling/sampling_logp_difference/max": 0.3477973937988281, "sampling/sampling_logp_difference/mean": 0.025315163657069206, "step": 35, "step_time": 33.668802553002024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 63.375, "completions/mean_terminated_length": 63.375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.3781251311302185, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 1.1683272123336792, "kl": 0.0017781654605641961, "learning_rate": 4.9996722091640805e-06, "loss": -0.0293, "num_tokens": 202499.0, "reward": 0.3174999952316284, "reward_std": 0.5784124135971069, "rewards/reward_func/mean": 0.3174999952316284, "rewards/reward_func/std": 0.5600191354751587, "sampling/importance_sampling_ratio/max": 1.6650283336639404, "sampling/importance_sampling_ratio/mean": 1.2166297435760498, "sampling/importance_sampling_ratio/min": 0.7639206647872925, "sampling/sampling_logp_difference/max": 0.44861912727355957, "sampling/sampling_logp_difference/mean": 0.019233262166380882, "step": 36, "step_time": 31.37140485800046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.375, "completions/mean_terminated_length": 49.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.42192620038986206, "epoch": 0.074, "frac_reward_zero_std": 0.0, "grad_norm": 0.7755328416824341, "kl": 0.00290558859705925, "learning_rate": 4.999527985734932e-06, "loss": 0.1481, "num_tokens": 208194.0, "reward": 0.16625000536441803, "reward_std": 0.3537360727787018, "rewards/reward_func/mean": 0.16625000536441803, "rewards/reward_func/std": 0.49612608551979065, "sampling/importance_sampling_ratio/max": 1.2127708196640015, "sampling/importance_sampling_ratio/mean": 0.6914026737213135, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.3314962387084961, "sampling/sampling_logp_difference/mean": 0.026732761412858963, "step": 37, "step_time": 41.92197997200128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3740369379520416, "epoch": 0.076, "frac_reward_zero_std": 0.0, "grad_norm": 1.343247652053833, "kl": 0.0024613114073872566, "learning_rate": 4.999357543439969e-06, "loss": 0.0229, "num_tokens": 213926.0, "reward": 0.23625001311302185, "reward_std": 0.49587899446487427, "rewards/reward_func/mean": 0.23625001311302185, "rewards/reward_func/std": 0.4592210054397583, "sampling/importance_sampling_ratio/max": 1.9151374101638794, "sampling/importance_sampling_ratio/mean": 1.0875842571258545, "sampling/importance_sampling_ratio/min": 0.6512177586555481, "sampling/sampling_logp_difference/max": 0.4941213130950928, "sampling/sampling_logp_difference/mean": 0.02705160342156887, "step": 38, "step_time": 35.657528919997276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.38872674107551575, "epoch": 0.078, "frac_reward_zero_std": 0.0, "grad_norm": 1.444061517715454, "kl": 0.00284218811430037, "learning_rate": 4.999160884067051e-06, "loss": 0.1495, "num_tokens": 219542.0, "reward": 0.4675000011920929, "reward_std": 0.5747828483581543, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.5322123765945435, "sampling/importance_sampling_ratio/max": 2.0046651363372803, "sampling/importance_sampling_ratio/mean": 0.9659349918365479, "sampling/importance_sampling_ratio/min": 0.2745288014411926, "sampling/sampling_logp_difference/max": 0.6689493656158447, "sampling/sampling_logp_difference/mean": 0.03054666332900524, "step": 39, "step_time": 41.85108021501219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.34974485635757446, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 1.2580375671386719, "kl": 0.0013389361556619406, "learning_rate": 4.9989380096790416e-06, "loss": -0.1842, "num_tokens": 225548.0, "reward": 0.07874999940395355, "reward_std": 0.2814640700817108, "rewards/reward_func/mean": 0.07874999940395355, "rewards/reward_func/std": 0.37380045652389526, "sampling/importance_sampling_ratio/max": 1.6368787288665771, "sampling/importance_sampling_ratio/mean": 1.1340277194976807, "sampling/importance_sampling_ratio/min": 0.5276371836662292, "sampling/sampling_logp_difference/max": 0.4883451461791992, "sampling/sampling_logp_difference/mean": 0.019177088513970375, "step": 40, "step_time": 49.17670016500051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3754725456237793, "epoch": 0.082, "frac_reward_zero_std": 0.0, "grad_norm": 1.3454259634017944, "kl": 0.002098641125485301, "learning_rate": 4.998688922613788e-06, "loss": -0.2422, "num_tokens": 231041.0, "reward": 0.08375000208616257, "reward_std": 0.2680739760398865, "rewards/reward_func/mean": 0.08375000208616257, "rewards/reward_func/std": 0.3638656735420227, "sampling/importance_sampling_ratio/max": 1.3377426862716675, "sampling/importance_sampling_ratio/mean": 0.807715892791748, "sampling/importance_sampling_ratio/min": 0.4852953851222992, "sampling/sampling_logp_difference/max": 0.6739339828491211, "sampling/sampling_logp_difference/mean": 0.02791445143520832, "step": 41, "step_time": 35.51692575198831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3402999937534332, "epoch": 0.084, "frac_reward_zero_std": 0.0, "grad_norm": 1.2757831811904907, "kl": 0.0019170227460563183, "learning_rate": 4.998413625484095e-06, "loss": -0.1765, "num_tokens": 236135.0, "reward": 0.4612500071525574, "reward_std": 0.6052623987197876, "rewards/reward_func/mean": 0.4612500071525574, "rewards/reward_func/std": 0.56057208776474, "sampling/importance_sampling_ratio/max": 1.7278234958648682, "sampling/importance_sampling_ratio/mean": 0.9955820441246033, "sampling/importance_sampling_ratio/min": 0.2917014956474304, "sampling/sampling_logp_difference/max": 0.5930330753326416, "sampling/sampling_logp_difference/mean": 0.02681322768330574, "step": 42, "step_time": 28.90468980500009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 55.375, "completions/mean_terminated_length": 55.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.43568652868270874, "epoch": 0.086, "frac_reward_zero_std": 0.0, "grad_norm": 1.1961538791656494, "kl": 0.0031014331616461277, "learning_rate": 4.9981121211777e-06, "loss": 0.132, "num_tokens": 242383.0, "reward": 0.07874999195337296, "reward_std": 0.273262619972229, "rewards/reward_func/mean": 0.07874999195337296, "rewards/reward_func/std": 0.3736475706100464, "sampling/importance_sampling_ratio/max": 1.8331316709518433, "sampling/importance_sampling_ratio/mean": 0.9131045341491699, "sampling/importance_sampling_ratio/min": 0.3854982852935791, "sampling/sampling_logp_difference/max": 0.4006004333496094, "sampling/sampling_logp_difference/mean": 0.02947426773607731, "step": 43, "step_time": 40.384530305993394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3090088665485382, "epoch": 0.088, "frac_reward_zero_std": 0.0, "grad_norm": 1.4481481313705444, "kl": 0.0016173927579075098, "learning_rate": 4.997784412857239e-06, "loss": 0.1052, "num_tokens": 248660.0, "reward": -0.06874999403953552, "reward_std": 0.04914231598377228, "rewards/reward_func/mean": -0.06874999403953552, "rewards/reward_func/std": 0.04764077067375183, "sampling/importance_sampling_ratio/max": 1.2356544733047485, "sampling/importance_sampling_ratio/mean": 0.9581372737884521, "sampling/importance_sampling_ratio/min": 0.5436961054801941, "sampling/sampling_logp_difference/max": 0.29942846298217773, "sampling/sampling_logp_difference/mean": 0.015336403623223305, "step": 44, "step_time": 53.181460595995304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.37320029735565186, "epoch": 0.09, "frac_reward_zero_std": 0.0, "grad_norm": 1.1145901679992676, "kl": 0.003977172076702118, "learning_rate": 4.99743050396022e-06, "loss": 0.1775, "num_tokens": 254142.0, "reward": 0.45625001192092896, "reward_std": 0.6143215894699097, "rewards/reward_func/mean": 0.45625001192092896, "rewards/reward_func/std": 0.5695847868919373, "sampling/importance_sampling_ratio/max": 1.8426916599273682, "sampling/importance_sampling_ratio/mean": 0.8322780728340149, "sampling/importance_sampling_ratio/min": 0.19014649093151093, "sampling/sampling_logp_difference/max": 0.40699613094329834, "sampling/sampling_logp_difference/mean": 0.027948087081313133, "step": 45, "step_time": 30.825133632999496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3248752951622009, "epoch": 0.092, "frac_reward_zero_std": 0.0, "grad_norm": 1.1806740760803223, "kl": 0.0028996632900089025, "learning_rate": 4.997050398198977e-06, "loss": 0.1501, "num_tokens": 259227.0, "reward": 0.2162500023841858, "reward_std": 0.3277096152305603, "rewards/reward_func/mean": 0.2162500023841858, "rewards/reward_func/std": 0.48576709628105164, "sampling/importance_sampling_ratio/max": 1.6642227172851562, "sampling/importance_sampling_ratio/mean": 1.01229727268219, "sampling/importance_sampling_ratio/min": 0.4485871493816376, "sampling/sampling_logp_difference/max": 0.46175622940063477, "sampling/sampling_logp_difference/mean": 0.022898774594068527, "step": 46, "step_time": 35.48211781200371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.4646483063697815, "epoch": 0.094, "frac_reward_zero_std": 0.0, "grad_norm": 1.5030003786087036, "kl": 0.0027977940626442432, "learning_rate": 4.9966440995606415e-06, "loss": 0.1655, "num_tokens": 264627.0, "reward": 0.1887499988079071, "reward_std": 0.5297601222991943, "rewards/reward_func/mean": 0.1887499988079071, "rewards/reward_func/std": 0.4906100332736969, "sampling/importance_sampling_ratio/max": 2.4210548400878906, "sampling/importance_sampling_ratio/mean": 1.32478928565979, "sampling/importance_sampling_ratio/min": 0.4550531506538391, "sampling/sampling_logp_difference/max": 0.4784013032913208, "sampling/sampling_logp_difference/mean": 0.031322650611400604, "step": 47, "step_time": 30.883059543004492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3293306231498718, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.8927807807922363, "kl": 0.002377166645601392, "learning_rate": 4.9962116123070925e-06, "loss": 0.042, "num_tokens": 270378.0, "reward": 0.2162500023841858, "reward_std": 0.4940055012702942, "rewards/reward_func/mean": 0.2162500023841858, "rewards/reward_func/std": 0.45831796526908875, "sampling/importance_sampling_ratio/max": 1.1039625406265259, "sampling/importance_sampling_ratio/mean": 0.8569784164428711, "sampling/importance_sampling_ratio/min": 0.536712646484375, "sampling/sampling_logp_difference/max": 0.34067440032958984, "sampling/sampling_logp_difference/mean": 0.018954172730445862, "step": 48, "step_time": 38.16978211799869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.377954363822937, "epoch": 0.098, "frac_reward_zero_std": 0.0, "grad_norm": 1.487809658050537, "kl": 0.0016349733341485262, "learning_rate": 4.9957529409749185e-06, "loss": 0.3189, "num_tokens": 275988.0, "reward": 0.07874999940395355, "reward_std": 0.2923800051212311, "rewards/reward_func/mean": 0.07874999940395355, "rewards/reward_func/std": 0.3753260672092438, "sampling/importance_sampling_ratio/max": 2.0353479385375977, "sampling/importance_sampling_ratio/mean": 1.0705337524414062, "sampling/importance_sampling_ratio/min": 0.35857802629470825, "sampling/sampling_logp_difference/max": 0.5681980848312378, "sampling/sampling_logp_difference/mean": 0.0232619009912014, "step": 49, "step_time": 35.3692782720027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.36007416248321533, "epoch": 0.1, "frac_reward_zero_std": 0.0, "grad_norm": 1.4686626195907593, "kl": 0.0032578343525528908, "learning_rate": 4.995268090375362e-06, "loss": -0.0069, "num_tokens": 281915.0, "reward": 0.1899999976158142, "reward_std": 0.3334062099456787, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.4919930398464203, "sampling/importance_sampling_ratio/max": 1.8520060777664185, "sampling/importance_sampling_ratio/mean": 1.2530664205551147, "sampling/importance_sampling_ratio/min": 0.6933834552764893, "sampling/sampling_logp_difference/max": 0.3359670639038086, "sampling/sampling_logp_difference/mean": 0.02223808318376541, "step": 50, "step_time": 39.47781550500076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3799910545349121, "epoch": 0.102, "frac_reward_zero_std": 0.0, "grad_norm": 1.1234054565429688, "kl": 0.003303245175629854, "learning_rate": 4.99475706559428e-06, "loss": -0.1382, "num_tokens": 287424.0, "reward": 0.3125, "reward_std": 0.2701009511947632, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.5506294965744019, "sampling/importance_sampling_ratio/max": 1.6724814176559448, "sampling/importance_sampling_ratio/mean": 0.9746721386909485, "sampling/importance_sampling_ratio/min": 0.4102705419063568, "sampling/sampling_logp_difference/max": 0.48910510540008545, "sampling/sampling_logp_difference/mean": 0.02539738267660141, "step": 51, "step_time": 37.61800301600306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.31345584988594055, "epoch": 0.104, "frac_reward_zero_std": 0.0, "grad_norm": 1.0368928909301758, "kl": 0.00263200793415308, "learning_rate": 4.994219871992077e-06, "loss": -0.0105, "num_tokens": 292745.0, "reward": 0.45499998331069946, "reward_std": 0.5025076270103455, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.5353503227233887, "sampling/importance_sampling_ratio/max": 1.1843898296356201, "sampling/importance_sampling_ratio/mean": 0.8895880579948425, "sampling/importance_sampling_ratio/min": 0.5178665518760681, "sampling/sampling_logp_difference/max": 0.47790735960006714, "sampling/sampling_logp_difference/mean": 0.026872076094150543, "step": 52, "step_time": 39.26654844600125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3391358554363251, "epoch": 0.106, "frac_reward_zero_std": 0.0, "grad_norm": 1.0148347616195679, "kl": 0.028528861701488495, "learning_rate": 4.993656515203662e-06, "loss": -0.0863, "num_tokens": 298787.0, "reward": 0.33249998092651367, "reward_std": 0.5578641891479492, "rewards/reward_func/mean": 0.33249998092651367, "rewards/reward_func/std": 0.5344356298446655, "sampling/importance_sampling_ratio/max": 1.4639109373092651, "sampling/importance_sampling_ratio/mean": 0.7722151279449463, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7002308368682861, "sampling/sampling_logp_difference/mean": 0.02879432588815689, "step": 53, "step_time": 37.527451172994915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3680269420146942, "epoch": 0.108, "frac_reward_zero_std": 0.0, "grad_norm": 1.1720988750457764, "kl": 0.005885637830942869, "learning_rate": 4.99306700113838e-06, "loss": -0.0454, "num_tokens": 304206.0, "reward": 0.3412500023841858, "reward_std": 0.5495070815086365, "rewards/reward_func/mean": 0.3412500023841858, "rewards/reward_func/std": 0.5323650240898132, "sampling/importance_sampling_ratio/max": 1.4783494472503662, "sampling/importance_sampling_ratio/mean": 0.9287598729133606, "sampling/importance_sampling_ratio/min": 0.6589941382408142, "sampling/sampling_logp_difference/max": 0.5849330425262451, "sampling/sampling_logp_difference/mean": 0.02534063160419464, "step": 54, "step_time": 37.79061970599287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 55.125, "completions/mean_terminated_length": 55.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.42716383934020996, "epoch": 0.11, "frac_reward_zero_std": 0.0, "grad_norm": 1.1009771823883057, "kl": 0.002711753360927105, "learning_rate": 4.9924513359799555e-06, "loss": -0.1146, "num_tokens": 309707.0, "reward": 0.20000000298023224, "reward_std": 0.5274509191513062, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.4887009263038635, "sampling/importance_sampling_ratio/max": 2.2008090019226074, "sampling/importance_sampling_ratio/mean": 1.0757707357406616, "sampling/importance_sampling_ratio/min": 0.38931164145469666, "sampling/sampling_logp_difference/max": 1.0060789585113525, "sampling/sampling_logp_difference/mean": 0.027470823377370834, "step": 55, "step_time": 34.70951578000677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3485015034675598, "epoch": 0.112, "frac_reward_zero_std": 0.0, "grad_norm": 1.0752439498901367, "kl": 0.0026783556677401066, "learning_rate": 4.991809526186424e-06, "loss": -0.0144, "num_tokens": 314773.0, "reward": 0.3149999976158142, "reward_std": 0.5622599720954895, "rewards/reward_func/mean": 0.3149999976158142, "rewards/reward_func/std": 0.5434545874595642, "sampling/importance_sampling_ratio/max": 1.2996971607208252, "sampling/importance_sampling_ratio/mean": 0.9518929719924927, "sampling/importance_sampling_ratio/min": 0.5955594778060913, "sampling/sampling_logp_difference/max": 0.30002713203430176, "sampling/sampling_logp_difference/mean": 0.020609542727470398, "step": 56, "step_time": 33.184696926007746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3204508423805237, "epoch": 0.114, "frac_reward_zero_std": 0.0, "grad_norm": 1.1941601037979126, "kl": 0.003310043830424547, "learning_rate": 4.991141578490066e-06, "loss": 0.2686, "num_tokens": 320715.0, "reward": 0.3387500047683716, "reward_std": 0.5372268557548523, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.5153483748435974, "sampling/importance_sampling_ratio/max": 2.146245002746582, "sampling/importance_sampling_ratio/mean": 1.03570556640625, "sampling/importance_sampling_ratio/min": 0.36691558361053467, "sampling/sampling_logp_difference/max": 0.3675193786621094, "sampling/sampling_logp_difference/mean": 0.023061014711856842, "step": 57, "step_time": 38.93158349399164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.33570247888565063, "epoch": 0.116, "frac_reward_zero_std": 0.0, "grad_norm": 0.9658681750297546, "kl": 0.0020056082867085934, "learning_rate": 4.990447499897339e-06, "loss": -0.0436, "num_tokens": 326183.0, "reward": 0.3375000059604645, "reward_std": 0.5548287630081177, "rewards/reward_func/mean": 0.3375000059604645, "rewards/reward_func/std": 0.5281707048416138, "sampling/importance_sampling_ratio/max": 1.2019702196121216, "sampling/importance_sampling_ratio/mean": 0.9152437448501587, "sampling/importance_sampling_ratio/min": 0.4425993859767914, "sampling/sampling_logp_difference/max": 0.31412577629089355, "sampling/sampling_logp_difference/mean": 0.01732739806175232, "step": 58, "step_time": 31.71721384600096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3592735528945923, "epoch": 0.118, "frac_reward_zero_std": 0.0, "grad_norm": 0.9438511729240417, "kl": 0.002568789292126894, "learning_rate": 4.989727297688797e-06, "loss": -0.0309, "num_tokens": 331769.0, "reward": 0.1887499988079071, "reward_std": 0.34991249442100525, "rewards/reward_func/mean": 0.1887499988079071, "rewards/reward_func/std": 0.5049027800559998, "sampling/importance_sampling_ratio/max": 1.0347727537155151, "sampling/importance_sampling_ratio/mean": 0.7055701017379761, "sampling/importance_sampling_ratio/min": 0.3710009455680847, "sampling/sampling_logp_difference/max": 0.6251668930053711, "sampling/sampling_logp_difference/mean": 0.022848688066005707, "step": 59, "step_time": 40.923916693005594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.2875489294528961, "epoch": 0.12, "frac_reward_zero_std": 0.0, "grad_norm": 2.1665284633636475, "kl": 0.0035660723224282265, "learning_rate": 4.98898097941902e-06, "loss": -0.0455, "num_tokens": 336898.0, "reward": -0.05624999850988388, "reward_std": 0.04053955525159836, "rewards/reward_func/mean": -0.05624999850988388, "rewards/reward_func/std": 0.04206712171435356, "sampling/importance_sampling_ratio/max": 2.0452516078948975, "sampling/importance_sampling_ratio/mean": 1.0319852828979492, "sampling/importance_sampling_ratio/min": 0.5795266032218933, "sampling/sampling_logp_difference/max": 0.531651496887207, "sampling/sampling_logp_difference/mean": 0.02353527769446373, "step": 60, "step_time": 33.48683463499765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3609686493873596, "epoch": 0.122, "frac_reward_zero_std": 0.0, "grad_norm": 0.9293365478515625, "kl": 0.002589891664683819, "learning_rate": 4.988208552916535e-06, "loss": -0.0442, "num_tokens": 342410.0, "reward": 0.20625001192092896, "reward_std": 0.5265212059020996, "rewards/reward_func/mean": 0.20625001192092896, "rewards/reward_func/std": 0.48746979236602783, "sampling/importance_sampling_ratio/max": 1.2214772701263428, "sampling/importance_sampling_ratio/mean": 0.8363133668899536, "sampling/importance_sampling_ratio/min": 0.4508911073207855, "sampling/sampling_logp_difference/max": 0.46391355991363525, "sampling/sampling_logp_difference/mean": 0.02813461422920227, "step": 61, "step_time": 35.721023104997585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.38784918189048767, "epoch": 0.124, "frac_reward_zero_std": 0.0, "grad_norm": 1.075391173362732, "kl": 0.0035081543028354645, "learning_rate": 4.98741002628373e-06, "loss": -0.0935, "num_tokens": 347782.0, "reward": 0.612500011920929, "reward_std": 0.5446658134460449, "rewards/reward_func/mean": 0.612500011920929, "rewards/reward_func/std": 0.5239888429641724, "sampling/importance_sampling_ratio/max": 1.6058270931243896, "sampling/importance_sampling_ratio/mean": 0.8856536149978638, "sampling/importance_sampling_ratio/min": 0.5055686831474304, "sampling/sampling_logp_difference/max": 0.3406977653503418, "sampling/sampling_logp_difference/mean": 0.025373805314302444, "step": 62, "step_time": 25.64868492000096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 49.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3399428427219391, "epoch": 0.126, "frac_reward_zero_std": 0.0, "grad_norm": 1.1036839485168457, "kl": 0.003299012314528227, "learning_rate": 4.9865854078967715e-06, "loss": -0.2298, "num_tokens": 353226.0, "reward": 0.6012499928474426, "reward_std": 0.5651106834411621, "rewards/reward_func/mean": 0.6012499928474426, "rewards/reward_func/std": 0.5377848148345947, "sampling/importance_sampling_ratio/max": 1.3698598146438599, "sampling/importance_sampling_ratio/mean": 0.8536076545715332, "sampling/importance_sampling_ratio/min": 0.3125140964984894, "sampling/sampling_logp_difference/max": 1.194218635559082, "sampling/sampling_logp_difference/mean": 0.023982733488082886, "step": 63, "step_time": 29.669544973992743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3663536310195923, "epoch": 0.128, "frac_reward_zero_std": 0.0, "grad_norm": 1.173079252243042, "kl": 0.0031317025423049927, "learning_rate": 4.985734706405516e-06, "loss": 0.2187, "num_tokens": 359212.0, "reward": -0.03999999910593033, "reward_std": 0.04652039706707001, "rewards/reward_func/mean": -0.03999999910593033, "rewards/reward_func/std": 0.04956958070397377, "sampling/importance_sampling_ratio/max": 1.2950257062911987, "sampling/importance_sampling_ratio/mean": 0.8518853187561035, "sampling/importance_sampling_ratio/min": 0.6529499888420105, "sampling/sampling_logp_difference/max": 0.4296393394470215, "sampling/sampling_logp_difference/mean": 0.027849294245243073, "step": 64, "step_time": 40.9083917550015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.39625003933906555, "epoch": 0.13, "frac_reward_zero_std": 0.0, "grad_norm": 1.3868184089660645, "kl": 0.003464975394308567, "learning_rate": 4.9848579307334195e-06, "loss": 0.1145, "num_tokens": 365700.0, "reward": 0.10374999791383743, "reward_std": 0.2647004723548889, "rewards/reward_func/mean": 0.10374999791383743, "rewards/reward_func/std": 0.3624495267868042, "sampling/importance_sampling_ratio/max": 1.4604157209396362, "sampling/importance_sampling_ratio/mean": 1.0366566181182861, "sampling/importance_sampling_ratio/min": 0.6759946346282959, "sampling/sampling_logp_difference/max": 0.6729015111923218, "sampling/sampling_logp_difference/mean": 0.02300150692462921, "step": 65, "step_time": 39.30383253400214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 57.125, "completions/mean_terminated_length": 57.125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.4078044891357422, "epoch": 0.132, "frac_reward_zero_std": 0.0, "grad_norm": 1.922768473625183, "kl": 0.004820004105567932, "learning_rate": 4.983955090077445e-06, "loss": -0.3134, "num_tokens": 370599.0, "reward": 0.06875000149011612, "reward_std": 0.2901986241340637, "rewards/reward_func/mean": 0.06875000149011612, "rewards/reward_func/std": 0.379301518201828, "sampling/importance_sampling_ratio/max": 2.629911184310913, "sampling/importance_sampling_ratio/mean": 1.3608753681182861, "sampling/importance_sampling_ratio/min": 0.3729094862937927, "sampling/sampling_logp_difference/max": 0.6287485361099243, "sampling/sampling_logp_difference/mean": 0.025368856266140938, "step": 66, "step_time": 37.84037004499987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.307640016078949, "epoch": 0.134, "frac_reward_zero_std": 0.0, "grad_norm": 0.7610776424407959, "kl": 0.0031319891568273306, "learning_rate": 4.983026193907962e-06, "loss": 0.0798, "num_tokens": 375798.0, "reward": 0.46000000834465027, "reward_std": 0.5894244313240051, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.5463646054267883, "sampling/importance_sampling_ratio/max": 1.1422264575958252, "sampling/importance_sampling_ratio/mean": 0.8120383024215698, "sampling/importance_sampling_ratio/min": 0.4197941720485687, "sampling/sampling_logp_difference/max": 0.45195698738098145, "sampling/sampling_logp_difference/mean": 0.023039013147354126, "step": 67, "step_time": 30.603469858993776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.36541643738746643, "epoch": 0.136, "frac_reward_zero_std": 0.0, "grad_norm": 1.7111493349075317, "kl": 0.007591226138174534, "learning_rate": 4.982071251968653e-06, "loss": 0.1488, "num_tokens": 381215.0, "reward": 0.18000000715255737, "reward_std": 0.33846431970596313, "rewards/reward_func/mean": 0.18000000715255737, "rewards/reward_func/std": 0.4910629987716675, "sampling/importance_sampling_ratio/max": 1.9301010370254517, "sampling/importance_sampling_ratio/mean": 1.1310442686080933, "sampling/importance_sampling_ratio/min": 0.2971019148826599, "sampling/sampling_logp_difference/max": 0.3519878387451172, "sampling/sampling_logp_difference/mean": 0.025238394737243652, "step": 68, "step_time": 36.97006548898935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.875, "completions/mean_terminated_length": 50.875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3671402633190155, "epoch": 0.138, "frac_reward_zero_std": 0.0, "grad_norm": 1.2564339637756348, "kl": 0.004587341565638781, "learning_rate": 4.981090274276406e-06, "loss": 0.0895, "num_tokens": 387032.0, "reward": 0.3462499976158142, "reward_std": 0.2717037796974182, "rewards/reward_func/mean": 0.3462499976158142, "rewards/reward_func/std": 0.5316265821456909, "sampling/importance_sampling_ratio/max": 1.0020262002944946, "sampling/importance_sampling_ratio/mean": 0.821398913860321, "sampling/importance_sampling_ratio/min": 0.551517903804779, "sampling/sampling_logp_difference/max": 0.5747532844543457, "sampling/sampling_logp_difference/mean": 0.02456159144639969, "step": 69, "step_time": 37.778685440003756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.2950865924358368, "epoch": 0.14, "frac_reward_zero_std": 0.0, "grad_norm": 0.7871184349060059, "kl": 0.00491219712421298, "learning_rate": 4.980083271121215e-06, "loss": -0.0193, "num_tokens": 392609.0, "reward": 0.45500004291534424, "reward_std": 0.5168381333351135, "rewards/reward_func/mean": 0.45500004291534424, "rewards/reward_func/std": 0.550713837146759, "sampling/importance_sampling_ratio/max": 1.3376414775848389, "sampling/importance_sampling_ratio/mean": 0.9009042978286743, "sampling/importance_sampling_ratio/min": 0.5153571963310242, "sampling/sampling_logp_difference/max": 0.4256160259246826, "sampling/sampling_logp_difference/mean": 0.021552588790655136, "step": 70, "step_time": 33.37109695599065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.36967194080352783, "epoch": 0.142, "frac_reward_zero_std": 0.0, "grad_norm": 1.4737831354141235, "kl": 0.01288725808262825, "learning_rate": 4.979050253066064e-06, "loss": -0.0912, "num_tokens": 398608.0, "reward": 0.20875000953674316, "reward_std": 0.2906484007835388, "rewards/reward_func/mean": 0.20875000953674316, "rewards/reward_func/std": 0.47351083159446716, "sampling/importance_sampling_ratio/max": 1.5353319644927979, "sampling/importance_sampling_ratio/mean": 1.0639885663986206, "sampling/importance_sampling_ratio/min": 0.7201797962188721, "sampling/sampling_logp_difference/max": 0.34710121154785156, "sampling/sampling_logp_difference/mean": 0.025040332227945328, "step": 71, "step_time": 39.80532811000012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3445550203323364, "epoch": 0.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.8660765290260315, "kl": 0.005788044538348913, "learning_rate": 4.977991230946824e-06, "loss": 0.0375, "num_tokens": 403626.0, "reward": 0.18250001966953278, "reward_std": 0.3195389211177826, "rewards/reward_func/mean": 0.18250001966953278, "rewards/reward_func/std": 0.4660395383834839, "sampling/importance_sampling_ratio/max": 1.5179165601730347, "sampling/importance_sampling_ratio/mean": 1.0042223930358887, "sampling/importance_sampling_ratio/min": 0.5119208693504333, "sampling/sampling_logp_difference/max": 0.47091197967529297, "sampling/sampling_logp_difference/mean": 0.02298400178551674, "step": 72, "step_time": 36.35284370899899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3828085660934448, "epoch": 0.146, "frac_reward_zero_std": 0.0, "grad_norm": 1.972751498222351, "kl": 0.015708088874816895, "learning_rate": 4.976906215872137e-06, "loss": -0.1855, "num_tokens": 409741.0, "reward": 0.581250011920929, "reward_std": 0.5518749952316284, "rewards/reward_func/mean": 0.581250011920929, "rewards/reward_func/std": 0.531801164150238, "sampling/importance_sampling_ratio/max": 2.6650352478027344, "sampling/importance_sampling_ratio/mean": 1.5247585773468018, "sampling/importance_sampling_ratio/min": 0.9828287363052368, "sampling/sampling_logp_difference/max": 0.5306482315063477, "sampling/sampling_logp_difference/mean": 0.028494730591773987, "step": 73, "step_time": 36.31543107799371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3141552209854126, "epoch": 0.148, "frac_reward_zero_std": 0.0, "grad_norm": 1.3385131359100342, "kl": 0.008036404848098755, "learning_rate": 4.975795219223299e-06, "loss": 0.0508, "num_tokens": 415116.0, "reward": 0.07625000178813934, "reward_std": 0.27722302079200745, "rewards/reward_func/mean": 0.07625000178813934, "rewards/reward_func/std": 0.37408700585365295, "sampling/importance_sampling_ratio/max": 1.733062505722046, "sampling/importance_sampling_ratio/mean": 1.0138859748840332, "sampling/importance_sampling_ratio/min": 0.45135366916656494, "sampling/sampling_logp_difference/max": 0.582321286201477, "sampling/sampling_logp_difference/mean": 0.020871151238679886, "step": 74, "step_time": 43.80073598799936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.875, "completions/mean_terminated_length": 54.875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3232402801513672, "epoch": 0.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.7976144552230835, "kl": 0.006514251232147217, "learning_rate": 4.974658252654135e-06, "loss": 0.0469, "num_tokens": 420518.0, "reward": 0.20875000953674316, "reward_std": 0.5280059576034546, "rewards/reward_func/mean": 0.20875000953674316, "rewards/reward_func/std": 0.4891519546508789, "sampling/importance_sampling_ratio/max": 0.8984686136245728, "sampling/importance_sampling_ratio/mean": 0.6726990938186646, "sampling/importance_sampling_ratio/min": 0.3876783549785614, "sampling/sampling_logp_difference/max": 0.6330904960632324, "sampling/sampling_logp_difference/mean": 0.026919633150100708, "step": 75, "step_time": 35.85063866600103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3804740905761719, "epoch": 0.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.9046527743339539, "kl": 0.0029437714256346226, "learning_rate": 4.973495328090891e-06, "loss": -0.0522, "num_tokens": 425455.0, "reward": 0.5950000286102295, "reward_std": 0.5526574850082397, "rewards/reward_func/mean": 0.5950000286102295, "rewards/reward_func/std": 0.5316550731658936, "sampling/importance_sampling_ratio/max": 1.0966424942016602, "sampling/importance_sampling_ratio/mean": 0.7680986523628235, "sampling/importance_sampling_ratio/min": 0.4954715371131897, "sampling/sampling_logp_difference/max": 0.29123687744140625, "sampling/sampling_logp_difference/mean": 0.023649394512176514, "step": 76, "step_time": 30.126132238001446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.335227370262146, "epoch": 0.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.8402533531188965, "kl": 0.012151396833360195, "learning_rate": 4.972306457732091e-06, "loss": -0.1083, "num_tokens": 430739.0, "reward": 0.32999998331069946, "reward_std": 0.30979883670806885, "rewards/reward_func/mean": 0.32999998331069946, "rewards/reward_func/std": 0.5463384985923767, "sampling/importance_sampling_ratio/max": 1.4036647081375122, "sampling/importance_sampling_ratio/mean": 0.8151348829269409, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9675121307373047, "sampling/sampling_logp_difference/mean": 0.02233710139989853, "step": 77, "step_time": 33.453950964001706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.34430697560310364, "epoch": 0.156, "frac_reward_zero_std": 0.0, "grad_norm": 0.8867908716201782, "kl": 0.0038880002684891224, "learning_rate": 4.971091654048427e-06, "loss": -0.1323, "num_tokens": 437126.0, "reward": 0.21124999225139618, "reward_std": 0.5004571676254272, "rewards/reward_func/mean": 0.21124999225139618, "rewards/reward_func/std": 0.46366357803344727, "sampling/importance_sampling_ratio/max": 1.108424186706543, "sampling/importance_sampling_ratio/mean": 0.829704999923706, "sampling/importance_sampling_ratio/min": 0.47837772965431213, "sampling/sampling_logp_difference/max": 0.3985975682735443, "sampling/sampling_logp_difference/mean": 0.020493805408477783, "step": 78, "step_time": 40.64764082799957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3588639497756958, "epoch": 0.158, "frac_reward_zero_std": 0.0, "grad_norm": 1.0783358812332153, "kl": 0.005843465216457844, "learning_rate": 4.96985092978261e-06, "loss": -0.0939, "num_tokens": 442120.0, "reward": 0.32875001430511475, "reward_std": 0.5790164470672607, "rewards/reward_func/mean": 0.32875001430511475, "rewards/reward_func/std": 0.5529256463050842, "sampling/importance_sampling_ratio/max": 1.6267454624176025, "sampling/importance_sampling_ratio/mean": 0.9230892658233643, "sampling/importance_sampling_ratio/min": 0.31369680166244507, "sampling/sampling_logp_difference/max": 0.3530135154724121, "sampling/sampling_logp_difference/mean": 0.025078624486923218, "step": 79, "step_time": 32.59568661899539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 45.625, "completions/mean_terminated_length": 45.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.35594335198402405, "epoch": 0.16, "frac_reward_zero_std": 0.0, "grad_norm": 1.645215630531311, "kl": 0.004302887246012688, "learning_rate": 4.968584297949255e-06, "loss": -0.2434, "num_tokens": 447668.0, "reward": 0.29624998569488525, "reward_std": 0.5543216466903687, "rewards/reward_func/mean": 0.29624998569488525, "rewards/reward_func/std": 0.5293645262718201, "sampling/importance_sampling_ratio/max": 1.6743134260177612, "sampling/importance_sampling_ratio/mean": 1.0188958644866943, "sampling/importance_sampling_ratio/min": 0.6892949938774109, "sampling/sampling_logp_difference/max": 0.3438667058944702, "sampling/sampling_logp_difference/mean": 0.02349107712507248, "step": 80, "step_time": 41.75027698998747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 44.625, "completions/mean_terminated_length": 44.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.40325072407722473, "epoch": 0.162, "frac_reward_zero_std": 0.0, "grad_norm": 0.9571352601051331, "kl": 0.004008749034255743, "learning_rate": 4.967291771834727e-06, "loss": 0.0373, "num_tokens": 453098.0, "reward": 0.17500001192092896, "reward_std": 0.34363842010498047, "rewards/reward_func/mean": 0.17500001192092896, "rewards/reward_func/std": 0.5028774738311768, "sampling/importance_sampling_ratio/max": 1.3233540058135986, "sampling/importance_sampling_ratio/mean": 0.8455219268798828, "sampling/importance_sampling_ratio/min": 0.36903640627861023, "sampling/sampling_logp_difference/max": 0.3818695545196533, "sampling/sampling_logp_difference/mean": 0.03359740972518921, "step": 81, "step_time": 40.28149596700678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.32033318281173706, "epoch": 0.164, "frac_reward_zero_std": 0.0, "grad_norm": 0.9337257146835327, "kl": 0.011265389621257782, "learning_rate": 4.965973364997015e-06, "loss": 0.0583, "num_tokens": 459142.0, "reward": 0.20500001311302185, "reward_std": 0.3244031071662903, "rewards/reward_func/mean": 0.20500001311302185, "rewards/reward_func/std": 0.49210917949676514, "sampling/importance_sampling_ratio/max": 1.5231561660766602, "sampling/importance_sampling_ratio/mean": 0.8062511682510376, "sampling/importance_sampling_ratio/min": 0.09447702020406723, "sampling/sampling_logp_difference/max": 1.0004373788833618, "sampling/sampling_logp_difference/mean": 0.027456309646368027, "step": 82, "step_time": 36.980090021010255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3576924800872803, "epoch": 0.166, "frac_reward_zero_std": 0.0, "grad_norm": 1.1017056703567505, "kl": 0.009185336530208588, "learning_rate": 4.964629091265583e-06, "loss": 0.0681, "num_tokens": 464285.0, "reward": 0.3387500047683716, "reward_std": 0.5671484470367432, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.54511958360672, "sampling/importance_sampling_ratio/max": 1.5919902324676514, "sampling/importance_sampling_ratio/mean": 0.9405478835105896, "sampling/importance_sampling_ratio/min": 0.2894143760204315, "sampling/sampling_logp_difference/max": 0.8768386840820312, "sampling/sampling_logp_difference/mean": 0.028965311124920845, "step": 83, "step_time": 28.138020016995142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.875, "completions/mean_terminated_length": 54.875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3319105803966522, "epoch": 0.168, "frac_reward_zero_std": 0.0, "grad_norm": 1.053510069847107, "kl": 0.0065056635066866875, "learning_rate": 4.963258964741227e-06, "loss": -0.1728, "num_tokens": 469551.0, "reward": 0.08000000566244125, "reward_std": 0.2886171042919159, "rewards/reward_func/mean": 0.08000000566244125, "rewards/reward_func/std": 0.3737073242664337, "sampling/importance_sampling_ratio/max": 1.9180289506912231, "sampling/importance_sampling_ratio/mean": 1.1588903665542603, "sampling/importance_sampling_ratio/min": 0.5762325525283813, "sampling/sampling_logp_difference/max": 0.4678354263305664, "sampling/sampling_logp_difference/mean": 0.024262480437755585, "step": 84, "step_time": 37.53115506299946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3720715641975403, "epoch": 0.17, "frac_reward_zero_std": 0.0, "grad_norm": 1.3156875371932983, "kl": 0.08773906528949738, "learning_rate": 4.961862999795923e-06, "loss": 0.0025, "num_tokens": 475505.0, "reward": 0.4750000238418579, "reward_std": 0.5239638686180115, "rewards/reward_func/mean": 0.4750000238418579, "rewards/reward_func/std": 0.5539726614952087, "sampling/importance_sampling_ratio/max": 1.7880008220672607, "sampling/importance_sampling_ratio/mean": 0.9351547956466675, "sampling/importance_sampling_ratio/min": 0.23688003420829773, "sampling/sampling_logp_difference/max": 1.3453662395477295, "sampling/sampling_logp_difference/mean": 0.02895110286772251, "step": 85, "step_time": 35.246074732000125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.32649070024490356, "epoch": 0.172, "frac_reward_zero_std": 0.0, "grad_norm": 1.0877724885940552, "kl": 0.009383026510477066, "learning_rate": 4.960441211072686e-06, "loss": 0.1414, "num_tokens": 480647.0, "reward": 0.07375000417232513, "reward_std": 0.27671927213668823, "rewards/reward_func/mean": 0.07375000417232513, "rewards/reward_func/std": 0.3709038197994232, "sampling/importance_sampling_ratio/max": 1.5382874011993408, "sampling/importance_sampling_ratio/mean": 1.010871171951294, "sampling/importance_sampling_ratio/min": 0.32187968492507935, "sampling/sampling_logp_difference/max": 0.4144246578216553, "sampling/sampling_logp_difference/mean": 0.01961221918463707, "step": 86, "step_time": 33.18983396900876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.375, "completions/mean_terminated_length": 48.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.33819085359573364, "epoch": 0.174, "frac_reward_zero_std": 0.0, "grad_norm": 1.5221152305603027, "kl": 0.03121965564787388, "learning_rate": 4.958993613485406e-06, "loss": 0.2319, "num_tokens": 485763.0, "reward": 0.17625001072883606, "reward_std": 0.5154582262039185, "rewards/reward_func/mean": 0.17625001072883606, "rewards/reward_func/std": 0.47874653339385986, "sampling/importance_sampling_ratio/max": 1.8117644786834717, "sampling/importance_sampling_ratio/mean": 1.0830358266830444, "sampling/importance_sampling_ratio/min": 0.344952791929245, "sampling/sampling_logp_difference/max": 0.6365394592285156, "sampling/sampling_logp_difference/mean": 0.02507655695080757, "step": 87, "step_time": 38.59099734299525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 57.75, "completions/mean_terminated_length": 57.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.3603730797767639, "epoch": 0.176, "frac_reward_zero_std": 0.0, "grad_norm": 1.142964482307434, "kl": 0.007601139135658741, "learning_rate": 4.957520222218695e-06, "loss": 0.1548, "num_tokens": 491817.0, "reward": 0.08624999970197678, "reward_std": 0.27129849791526794, "rewards/reward_func/mean": 0.08624999970197678, "rewards/reward_func/std": 0.3660186529159546, "sampling/importance_sampling_ratio/max": 2.3958003520965576, "sampling/importance_sampling_ratio/mean": 1.1398122310638428, "sampling/importance_sampling_ratio/min": 0.34555134177207947, "sampling/sampling_logp_difference/max": 0.3420066833496094, "sampling/sampling_logp_difference/mean": 0.01923590898513794, "step": 88, "step_time": 43.65914840900223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.4104352593421936, "epoch": 0.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.8413081169128418, "kl": 0.010053085163235664, "learning_rate": 4.956021052727731e-06, "loss": -0.0215, "num_tokens": 497655.0, "reward": 0.3449999988079071, "reward_std": 0.5602920651435852, "rewards/reward_func/mean": 0.3449999988079071, "rewards/reward_func/std": 0.5405552983283997, "sampling/importance_sampling_ratio/max": 1.5695143938064575, "sampling/importance_sampling_ratio/mean": 0.9689695835113525, "sampling/importance_sampling_ratio/min": 0.6118794083595276, "sampling/sampling_logp_difference/max": 0.7474043369293213, "sampling/sampling_logp_difference/mean": 0.024773046374320984, "step": 89, "step_time": 32.42648905600072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 59.5, "completions/mean_terminated_length": 59.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.33168306946754456, "epoch": 0.18, "frac_reward_zero_std": 0.0, "grad_norm": 1.541452169418335, "kl": 0.0170269925147295, "learning_rate": 4.954496120738094e-06, "loss": 0.1195, "num_tokens": 503146.0, "reward": 0.4775000214576721, "reward_std": 0.512241542339325, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5457040071487427, "sampling/importance_sampling_ratio/max": 1.8365960121154785, "sampling/importance_sampling_ratio/mean": 1.0645668506622314, "sampling/importance_sampling_ratio/min": 0.3866049349308014, "sampling/sampling_logp_difference/max": 0.6471219062805176, "sampling/sampling_logp_difference/mean": 0.021687893196940422, "step": 90, "step_time": 25.5371619570069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.34987396001815796, "epoch": 0.182, "frac_reward_zero_std": 0.0, "grad_norm": 0.8267898559570312, "kl": 0.009192817844450474, "learning_rate": 4.952945442245598e-06, "loss": -0.1841, "num_tokens": 509047.0, "reward": 0.21250000596046448, "reward_std": 0.5125744342803955, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.47475558519363403, "sampling/importance_sampling_ratio/max": 1.1211237907409668, "sampling/importance_sampling_ratio/mean": 0.6856480836868286, "sampling/importance_sampling_ratio/min": 0.16603736579418182, "sampling/sampling_logp_difference/max": 0.8373830318450928, "sampling/sampling_logp_difference/mean": 0.02722262404859066, "step": 91, "step_time": 32.49494073499227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3588416576385498, "epoch": 0.184, "frac_reward_zero_std": 0.0, "grad_norm": 1.3908851146697998, "kl": 0.04168040677905083, "learning_rate": 4.951369033516127e-06, "loss": -0.0976, "num_tokens": 514627.0, "reward": 0.45875000953674316, "reward_std": 0.5896173715591431, "rewards/reward_func/mean": 0.45875000953674316, "rewards/reward_func/std": 0.54619300365448, "sampling/importance_sampling_ratio/max": 1.545922040939331, "sampling/importance_sampling_ratio/mean": 1.153027057647705, "sampling/importance_sampling_ratio/min": 0.5263999104499817, "sampling/sampling_logp_difference/max": 0.7717450857162476, "sampling/sampling_logp_difference/mean": 0.02575552463531494, "step": 92, "step_time": 39.09205864999967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3676631450653076, "epoch": 0.186, "frac_reward_zero_std": 0.0, "grad_norm": 1.2240070104599, "kl": 0.02092805504798889, "learning_rate": 4.949766911085461e-06, "loss": 0.0766, "num_tokens": 520393.0, "reward": 0.48000001907348633, "reward_std": 0.5069187879562378, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.5400793552398682, "sampling/importance_sampling_ratio/max": 2.340593099594116, "sampling/importance_sampling_ratio/mean": 1.5034531354904175, "sampling/importance_sampling_ratio/min": 0.6435762643814087, "sampling/sampling_logp_difference/max": 0.8327808380126953, "sampling/sampling_logp_difference/mean": 0.023994414135813713, "step": 93, "step_time": 37.435129256002256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 59.375, "completions/mean_terminated_length": 59.375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.4088976979255676, "epoch": 0.188, "frac_reward_zero_std": 0.0, "grad_norm": 1.024932622909546, "kl": 0.004795517306774855, "learning_rate": 4.948139091759108e-06, "loss": 0.2021, "num_tokens": 526559.0, "reward": 0.48625001311302185, "reward_std": 0.5142859816551208, "rewards/reward_func/mean": 0.48625001311302185, "rewards/reward_func/std": 0.5441097021102905, "sampling/importance_sampling_ratio/max": 1.42252779006958, "sampling/importance_sampling_ratio/mean": 0.6734194159507751, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.3571433424949646, "sampling/sampling_logp_difference/mean": 0.02588842436671257, "step": 94, "step_time": 38.451151984001626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.34375160932540894, "epoch": 0.19, "frac_reward_zero_std": 0.0, "grad_norm": 0.9852098226547241, "kl": 0.00799483060836792, "learning_rate": 4.946485592612122e-06, "loss": 0.0547, "num_tokens": 532321.0, "reward": 0.22374999523162842, "reward_std": 0.5145085453987122, "rewards/reward_func/mean": 0.22374999523162842, "rewards/reward_func/std": 0.4764433205127716, "sampling/importance_sampling_ratio/max": 0.9923078417778015, "sampling/importance_sampling_ratio/mean": 0.7997302412986755, "sampling/importance_sampling_ratio/min": 0.577215850353241, "sampling/sampling_logp_difference/max": 0.3057703971862793, "sampling/sampling_logp_difference/mean": 0.019131341949105263, "step": 95, "step_time": 34.67093198900693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.33774328231811523, "epoch": 0.192, "frac_reward_zero_std": 0.0, "grad_norm": 1.1137478351593018, "kl": 0.009615856222808361, "learning_rate": 4.944806430988927e-06, "loss": -0.1692, "num_tokens": 537647.0, "reward": 0.048750005662441254, "reward_std": 0.286370187997818, "rewards/reward_func/mean": 0.048750005662441254, "rewards/reward_func/std": 0.378132164478302, "sampling/importance_sampling_ratio/max": 2.006246328353882, "sampling/importance_sampling_ratio/mean": 1.1477537155151367, "sampling/importance_sampling_ratio/min": 0.2995043992996216, "sampling/sampling_logp_difference/max": 1.0059900283813477, "sampling/sampling_logp_difference/mean": 0.025709955021739006, "step": 96, "step_time": 36.05396345700137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.3597293794155121, "epoch": 0.194, "frac_reward_zero_std": 0.0, "grad_norm": 1.5075836181640625, "kl": 0.010133556090295315, "learning_rate": 4.943101624503133e-06, "loss": -0.2914, "num_tokens": 543234.0, "reward": 0.21250000596046448, "reward_std": 0.5191043615341187, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.4812706708908081, "sampling/importance_sampling_ratio/max": 1.8020318746566772, "sampling/importance_sampling_ratio/mean": 1.1152775287628174, "sampling/importance_sampling_ratio/min": 0.6069132685661316, "sampling/sampling_logp_difference/max": 0.5707888603210449, "sampling/sampling_logp_difference/mean": 0.026453383266925812, "step": 97, "step_time": 35.73249283900077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3398410677909851, "epoch": 0.196, "frac_reward_zero_std": 0.0, "grad_norm": 1.3996599912643433, "kl": 0.008223360404372215, "learning_rate": 4.941371191037353e-06, "loss": -0.1045, "num_tokens": 548982.0, "reward": 0.45249998569488525, "reward_std": 0.5502059459686279, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.5803878307342529, "sampling/importance_sampling_ratio/max": 1.8260504007339478, "sampling/importance_sampling_ratio/mean": 1.0000381469726562, "sampling/importance_sampling_ratio/min": 0.422150194644928, "sampling/sampling_logp_difference/max": 0.4241912364959717, "sampling/sampling_logp_difference/mean": 0.023312591016292572, "step": 98, "step_time": 38.02977259900945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3524587154388428, "epoch": 0.198, "frac_reward_zero_std": 0.0, "grad_norm": 1.7168464660644531, "kl": 0.0064958324655890465, "learning_rate": 4.939615148743017e-06, "loss": 0.2992, "num_tokens": 554371.0, "reward": 0.5700000524520874, "reward_std": 0.5693867802619934, "rewards/reward_func/mean": 0.5700000524520874, "rewards/reward_func/std": 0.5539984703063965, "sampling/importance_sampling_ratio/max": 1.9318469762802124, "sampling/importance_sampling_ratio/mean": 1.0261414051055908, "sampling/importance_sampling_ratio/min": 0.5000623464584351, "sampling/sampling_logp_difference/max": 0.5306458473205566, "sampling/sampling_logp_difference/mean": 0.02562127634882927, "step": 99, "step_time": 28.632846849999623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.343850314617157, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 1.1735574007034302, "kl": 0.012371711432933807, "learning_rate": 4.937833516040177e-06, "loss": 0.1179, "num_tokens": 560801.0, "reward": 0.32749998569488525, "reward_std": 0.5631698369979858, "rewards/reward_func/mean": 0.32749998569488525, "rewards/reward_func/std": 0.5443934798240662, "sampling/importance_sampling_ratio/max": 1.9959181547164917, "sampling/importance_sampling_ratio/mean": 1.3071322441101074, "sampling/importance_sampling_ratio/min": 0.6965984106063843, "sampling/sampling_logp_difference/max": 0.5585286617279053, "sampling/sampling_logp_difference/mean": 0.018983395770192146, "step": 100, "step_time": 39.44753839400073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 49.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.35799241065979004, "epoch": 0.202, "frac_reward_zero_std": 0.0, "grad_norm": 1.0344713926315308, "kl": 0.014730488881468773, "learning_rate": 4.936026311617316e-06, "loss": 0.2868, "num_tokens": 566165.0, "reward": 0.07999999821186066, "reward_std": 0.2884673476219177, "rewards/reward_func/mean": 0.07999999821186066, "rewards/reward_func/std": 0.37599390745162964, "sampling/importance_sampling_ratio/max": 1.5769941806793213, "sampling/importance_sampling_ratio/mean": 0.9793978333473206, "sampling/importance_sampling_ratio/min": 0.3244423568248749, "sampling/sampling_logp_difference/max": 0.5661906003952026, "sampling/sampling_logp_difference/mean": 0.027164215222001076, "step": 101, "step_time": 36.72873655399599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.34581878781318665, "epoch": 0.204, "frac_reward_zero_std": 0.0, "grad_norm": 1.3470356464385986, "kl": 0.0077892690896987915, "learning_rate": 4.9341935544311536e-06, "loss": -0.1361, "num_tokens": 570990.0, "reward": 0.3500000238418579, "reward_std": 0.5515884160995483, "rewards/reward_func/mean": 0.3500000238418579, "rewards/reward_func/std": 0.5307945609092712, "sampling/importance_sampling_ratio/max": 1.5632634162902832, "sampling/importance_sampling_ratio/mean": 0.9828509092330933, "sampling/importance_sampling_ratio/min": 0.5150687098503113, "sampling/sampling_logp_difference/max": 0.46799755096435547, "sampling/sampling_logp_difference/mean": 0.0202273391187191, "step": 102, "step_time": 24.58553309200215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.33340394496917725, "epoch": 0.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.9854996800422668, "kl": 0.007550486363470554, "learning_rate": 4.932335263706446e-06, "loss": -0.1328, "num_tokens": 576939.0, "reward": 0.19999998807907104, "reward_std": 0.5018428564071655, "rewards/reward_func/mean": 0.19999998807907104, "rewards/reward_func/std": 0.4653416574001312, "sampling/importance_sampling_ratio/max": 2.2604258060455322, "sampling/importance_sampling_ratio/mean": 0.915320098400116, "sampling/importance_sampling_ratio/min": 0.19731059670448303, "sampling/sampling_logp_difference/max": 1.3153386116027832, "sampling/sampling_logp_difference/mean": 0.024587368592619896, "step": 103, "step_time": 37.690257228998234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3149360716342926, "epoch": 0.208, "frac_reward_zero_std": 0.0, "grad_norm": 0.9567559361457825, "kl": 0.017881600186228752, "learning_rate": 4.930451458935783e-06, "loss": -0.2071, "num_tokens": 581824.0, "reward": 0.20374999940395355, "reward_std": 0.5112870335578918, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.4740384817123413, "sampling/importance_sampling_ratio/max": 1.1789283752441406, "sampling/importance_sampling_ratio/mean": 0.6693528890609741, "sampling/importance_sampling_ratio/min": 0.29736894369125366, "sampling/sampling_logp_difference/max": 0.6565618515014648, "sampling/sampling_logp_difference/mean": 0.027554277330636978, "step": 104, "step_time": 30.336770003996207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 48.125, "completions/mean_terminated_length": 48.125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3541415333747864, "epoch": 0.21, "frac_reward_zero_std": 0.0, "grad_norm": 1.6499571800231934, "kl": 0.019035879522562027, "learning_rate": 4.928542159879386e-06, "loss": -0.2454, "num_tokens": 586973.0, "reward": 0.29624998569488525, "reward_std": 0.5775634050369263, "rewards/reward_func/mean": 0.29624998569488525, "rewards/reward_func/std": 0.5488412976264954, "sampling/importance_sampling_ratio/max": 1.7103009223937988, "sampling/importance_sampling_ratio/mean": 1.2459993362426758, "sampling/importance_sampling_ratio/min": 0.36490964889526367, "sampling/sampling_logp_difference/max": 0.5930310487747192, "sampling/sampling_logp_difference/mean": 0.02690104767680168, "step": 105, "step_time": 37.13503508499707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 46.375, "completions/mean_terminated_length": 46.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.38179636001586914, "epoch": 0.212, "frac_reward_zero_std": 0.0, "grad_norm": 1.3703417778015137, "kl": 0.07741641998291016, "learning_rate": 4.926607386564898e-06, "loss": -0.1311, "num_tokens": 592270.0, "reward": 0.21875, "reward_std": 0.296247273683548, "rewards/reward_func/mean": 0.21875, "rewards/reward_func/std": 0.4547664523124695, "sampling/importance_sampling_ratio/max": 2.1035547256469727, "sampling/importance_sampling_ratio/mean": 0.9159399271011353, "sampling/importance_sampling_ratio/min": 0.3674011826515198, "sampling/sampling_logp_difference/max": 0.7901006937026978, "sampling/sampling_logp_difference/mean": 0.030587412416934967, "step": 106, "step_time": 35.471169879994704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3480619192123413, "epoch": 0.214, "frac_reward_zero_std": 0.0, "grad_norm": 1.0856449604034424, "kl": 0.01090352050960064, "learning_rate": 4.924647159287176e-06, "loss": -0.1514, "num_tokens": 597816.0, "reward": 0.19499999284744263, "reward_std": 0.5200541615486145, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.48373547196388245, "sampling/importance_sampling_ratio/max": 1.8715648651123047, "sampling/importance_sampling_ratio/mean": 0.986532986164093, "sampling/importance_sampling_ratio/min": 0.3344648778438568, "sampling/sampling_logp_difference/max": 0.6495161056518555, "sampling/sampling_logp_difference/mean": 0.025133948773145676, "step": 107, "step_time": 37.98941664501035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 49.875, "completions/mean_terminated_length": 49.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.33404314517974854, "epoch": 0.216, "frac_reward_zero_std": 0.0, "grad_norm": 1.1951714754104614, "kl": 0.009612706489861012, "learning_rate": 4.922661498608077e-06, "loss": -0.0089, "num_tokens": 602998.0, "reward": 0.20750001072883606, "reward_std": 0.3204679489135742, "rewards/reward_func/mean": 0.20750001072883606, "rewards/reward_func/std": 0.4872298240661621, "sampling/importance_sampling_ratio/max": 1.3054614067077637, "sampling/importance_sampling_ratio/mean": 0.8988068103790283, "sampling/importance_sampling_ratio/min": 0.4151885509490967, "sampling/sampling_logp_difference/max": 0.5403620004653931, "sampling/sampling_logp_difference/mean": 0.023432891815900803, "step": 108, "step_time": 40.85303926198685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.30731695890426636, "epoch": 0.218, "frac_reward_zero_std": 0.0, "grad_norm": 1.472243070602417, "kl": 0.015070393681526184, "learning_rate": 4.920650425356239e-06, "loss": -0.2328, "num_tokens": 608265.0, "reward": 0.4625000059604645, "reward_std": 0.5839909315109253, "rewards/reward_func/mean": 0.4625000059604645, "rewards/reward_func/std": 0.5412881970405579, "sampling/importance_sampling_ratio/max": 2.572606086730957, "sampling/importance_sampling_ratio/mean": 1.3286433219909668, "sampling/importance_sampling_ratio/min": 0.5425832271575928, "sampling/sampling_logp_difference/max": 0.5253163576126099, "sampling/sampling_logp_difference/mean": 0.023545201867818832, "step": 109, "step_time": 35.24712469300721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.32885903120040894, "epoch": 0.22, "frac_reward_zero_std": 0.0, "grad_norm": 1.0502865314483643, "kl": 0.033357176929712296, "learning_rate": 4.9186139606268735e-06, "loss": 0.1122, "num_tokens": 613625.0, "reward": 0.4650000035762787, "reward_std": 0.5683454871177673, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.5274466872215271, "sampling/importance_sampling_ratio/max": 1.862295150756836, "sampling/importance_sampling_ratio/mean": 0.8837443590164185, "sampling/importance_sampling_ratio/min": 0.2522660791873932, "sampling/sampling_logp_difference/max": 0.8349018096923828, "sampling/sampling_logp_difference/mean": 0.02544923685491085, "step": 110, "step_time": 31.10387792700203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.36214321851730347, "epoch": 0.222, "frac_reward_zero_std": 0.0, "grad_norm": 1.5155655145645142, "kl": 0.006974513176828623, "learning_rate": 4.916552125781529e-06, "loss": 0.0182, "num_tokens": 619329.0, "reward": 0.44875001907348633, "reward_std": 0.5807082653045654, "rewards/reward_func/mean": 0.44875001907348633, "rewards/reward_func/std": 0.5378113985061646, "sampling/importance_sampling_ratio/max": 2.2787792682647705, "sampling/importance_sampling_ratio/mean": 1.3124430179595947, "sampling/importance_sampling_ratio/min": 0.5105971097946167, "sampling/sampling_logp_difference/max": 0.5723431706428528, "sampling/sampling_logp_difference/mean": 0.02273915708065033, "step": 111, "step_time": 39.27943245699862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.35876524448394775, "epoch": 0.224, "frac_reward_zero_std": 0.0, "grad_norm": 0.9791905879974365, "kl": 0.02388225495815277, "learning_rate": 4.9144649424478765e-06, "loss": -0.2625, "num_tokens": 624851.0, "reward": 0.07499999552965164, "reward_std": 0.2920842170715332, "rewards/reward_func/mean": 0.07499999552965164, "rewards/reward_func/std": 0.37305688858032227, "sampling/importance_sampling_ratio/max": 1.6145038604736328, "sampling/importance_sampling_ratio/mean": 0.7806305885314941, "sampling/importance_sampling_ratio/min": 0.31658169627189636, "sampling/sampling_logp_difference/max": 0.9168522357940674, "sampling/sampling_logp_difference/mean": 0.031929485499858856, "step": 112, "step_time": 40.077556558011565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.39105433225631714, "epoch": 0.226, "frac_reward_zero_std": 0.0, "grad_norm": 1.3499541282653809, "kl": 0.03188120573759079, "learning_rate": 4.912352432519484e-06, "loss": -0.2043, "num_tokens": 630276.0, "reward": 0.2224999964237213, "reward_std": 0.5185043811798096, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.48052799701690674, "sampling/importance_sampling_ratio/max": 1.5414526462554932, "sampling/importance_sampling_ratio/mean": 1.035444736480713, "sampling/importance_sampling_ratio/min": 0.3080333471298218, "sampling/sampling_logp_difference/max": 0.841944694519043, "sampling/sampling_logp_difference/mean": 0.025317739695310593, "step": 113, "step_time": 33.08625747299811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.35971030592918396, "epoch": 0.228, "frac_reward_zero_std": 0.0, "grad_norm": 1.4594817161560059, "kl": 0.017640622332692146, "learning_rate": 4.910214618155579e-06, "loss": 0.1665, "num_tokens": 636074.0, "reward": 0.3162499964237213, "reward_std": 0.5943635106086731, "rewards/reward_func/mean": 0.3162499964237213, "rewards/reward_func/std": 0.5652796626091003, "sampling/importance_sampling_ratio/max": 2.2878103256225586, "sampling/importance_sampling_ratio/mean": 1.1907858848571777, "sampling/importance_sampling_ratio/min": 0.431149423122406, "sampling/sampling_logp_difference/max": 0.8824386596679688, "sampling/sampling_logp_difference/mean": 0.023232053965330124, "step": 114, "step_time": 34.20750616800797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.35053420066833496, "epoch": 0.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.8730654120445251, "kl": 0.10532344877719879, "learning_rate": 4.908051521780824e-06, "loss": -0.0439, "num_tokens": 641976.0, "reward": 0.3449999988079071, "reward_std": 0.5371800661087036, "rewards/reward_func/mean": 0.3449999988079071, "rewards/reward_func/std": 0.5183214545249939, "sampling/importance_sampling_ratio/max": 1.1692423820495605, "sampling/importance_sampling_ratio/mean": 0.8554747700691223, "sampling/importance_sampling_ratio/min": 0.27807939052581787, "sampling/sampling_logp_difference/max": 1.0537323951721191, "sampling/sampling_logp_difference/mean": 0.028338592499494553, "step": 115, "step_time": 39.78751064000244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 56.125, "completions/mean_terminated_length": 56.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.34610265493392944, "epoch": 0.232, "frac_reward_zero_std": 0.0, "grad_norm": 0.7445147633552551, "kl": 0.012357879430055618, "learning_rate": 4.905863166085076e-06, "loss": -0.1522, "num_tokens": 647397.0, "reward": -0.04374999925494194, "reward_std": 0.044737864285707474, "rewards/reward_func/mean": -0.04374999925494194, "rewards/reward_func/std": 0.04340424761176109, "sampling/importance_sampling_ratio/max": 1.5308575630187988, "sampling/importance_sampling_ratio/mean": 0.9485741853713989, "sampling/importance_sampling_ratio/min": 0.4332679510116577, "sampling/sampling_logp_difference/max": 0.3149690628051758, "sampling/sampling_logp_difference/mean": 0.021136745810508728, "step": 116, "step_time": 34.7742198099877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.31725019216537476, "epoch": 0.234, "frac_reward_zero_std": 0.0, "grad_norm": 1.139970302581787, "kl": 0.019606376066803932, "learning_rate": 4.903649574023151e-06, "loss": -0.105, "num_tokens": 653878.0, "reward": 0.17999999225139618, "reward_std": 0.5323570966720581, "rewards/reward_func/mean": 0.17999999225139618, "rewards/reward_func/std": 0.49361640214920044, "sampling/importance_sampling_ratio/max": 1.8025496006011963, "sampling/importance_sampling_ratio/mean": 1.0302600860595703, "sampling/importance_sampling_ratio/min": 0.5091882944107056, "sampling/sampling_logp_difference/max": 0.5300393104553223, "sampling/sampling_logp_difference/mean": 0.022776808589696884, "step": 117, "step_time": 40.60955931300123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3315849006175995, "epoch": 0.236, "frac_reward_zero_std": 0.0, "grad_norm": 1.014474868774414, "kl": 0.025705253705382347, "learning_rate": 4.901410768814581e-06, "loss": 0.2675, "num_tokens": 660043.0, "reward": 0.05625000223517418, "reward_std": 0.285952627658844, "rewards/reward_func/mean": 0.05625000223517418, "rewards/reward_func/std": 0.36146280169487, "sampling/importance_sampling_ratio/max": 2.1567413806915283, "sampling/importance_sampling_ratio/mean": 0.9692880511283875, "sampling/importance_sampling_ratio/min": 0.07723711431026459, "sampling/sampling_logp_difference/max": 0.8733996152877808, "sampling/sampling_logp_difference/mean": 0.03249724209308624, "step": 118, "step_time": 38.65960728799109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 50.875, "completions/mean_terminated_length": 50.875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3267901837825775, "epoch": 0.238, "frac_reward_zero_std": 0.0, "grad_norm": 1.1987251043319702, "kl": 0.03742775321006775, "learning_rate": 4.899146773943374e-06, "loss": -0.582, "num_tokens": 665037.0, "reward": 0.3512499928474426, "reward_std": 0.5559871196746826, "rewards/reward_func/mean": 0.3512499928474426, "rewards/reward_func/std": 0.5351218581199646, "sampling/importance_sampling_ratio/max": 1.7580586671829224, "sampling/importance_sampling_ratio/mean": 0.7292503714561462, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.458031415939331, "sampling/sampling_logp_difference/mean": 0.029961854219436646, "step": 119, "step_time": 26.94828627500101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3302837014198303, "epoch": 0.24, "frac_reward_zero_std": 0.0, "grad_norm": 0.7964790463447571, "kl": 0.018735038116574287, "learning_rate": 4.896857613157765e-06, "loss": 0.0465, "num_tokens": 670757.0, "reward": 0.09125000238418579, "reward_std": 0.27616971731185913, "rewards/reward_func/mean": 0.09125000238418579, "rewards/reward_func/std": 0.36876001954078674, "sampling/importance_sampling_ratio/max": 1.618713617324829, "sampling/importance_sampling_ratio/mean": 0.8206771612167358, "sampling/importance_sampling_ratio/min": 0.19846399128437042, "sampling/sampling_logp_difference/max": 1.0061447620391846, "sampling/sampling_logp_difference/mean": 0.023636985570192337, "step": 120, "step_time": 36.12980112100195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.30235159397125244, "epoch": 0.242, "frac_reward_zero_std": 0.0, "grad_norm": 0.8383305072784424, "kl": 0.027072228491306305, "learning_rate": 4.894543310469968e-06, "loss": -0.0366, "num_tokens": 676152.0, "reward": 0.20125000178813934, "reward_std": 0.5118378400802612, "rewards/reward_func/mean": 0.20125000178813934, "rewards/reward_func/std": 0.4741138517856598, "sampling/importance_sampling_ratio/max": 1.6411762237548828, "sampling/importance_sampling_ratio/mean": 0.9534869194030762, "sampling/importance_sampling_ratio/min": 0.41311678290367126, "sampling/sampling_logp_difference/max": 1.0102167129516602, "sampling/sampling_logp_difference/mean": 0.02292507141828537, "step": 121, "step_time": 33.572335048011155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3367387056350708, "epoch": 0.244, "frac_reward_zero_std": 0.0, "grad_norm": 1.0167452096939087, "kl": 0.06418883055448532, "learning_rate": 4.8922038901559225e-06, "loss": -0.0048, "num_tokens": 682101.0, "reward": 0.0650000050663948, "reward_std": 0.28377196192741394, "rewards/reward_func/mean": 0.0650000050663948, "rewards/reward_func/std": 0.3791343569755554, "sampling/importance_sampling_ratio/max": 1.28421950340271, "sampling/importance_sampling_ratio/mean": 0.7725279331207275, "sampling/importance_sampling_ratio/min": 0.1273626983165741, "sampling/sampling_logp_difference/max": 1.543129801750183, "sampling/sampling_logp_difference/mean": 0.02904380112886429, "step": 122, "step_time": 39.409893271003966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 46.625, "completions/mean_terminated_length": 46.625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3044854998588562, "epoch": 0.246, "frac_reward_zero_std": 0.0, "grad_norm": 0.8013125061988831, "kl": 0.019057054072618484, "learning_rate": 4.889839376755041e-06, "loss": -0.1674, "num_tokens": 688317.0, "reward": 0.032499998807907104, "reward_std": 0.2980650067329407, "rewards/reward_func/mean": 0.032499998807907104, "rewards/reward_func/std": 0.3873997628688812, "sampling/importance_sampling_ratio/max": 1.4401540756225586, "sampling/importance_sampling_ratio/mean": 1.0427517890930176, "sampling/importance_sampling_ratio/min": 0.5601269006729126, "sampling/sampling_logp_difference/max": 0.845482587814331, "sampling/sampling_logp_difference/mean": 0.022582385689020157, "step": 123, "step_time": 47.42131155300012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.32158392667770386, "epoch": 0.248, "frac_reward_zero_std": 0.0, "grad_norm": 1.1628981828689575, "kl": 0.025724977254867554, "learning_rate": 4.887449795069948e-06, "loss": 0.0402, "num_tokens": 694391.0, "reward": 0.5987499952316284, "reward_std": 0.2651512026786804, "rewards/reward_func/mean": 0.5987499952316284, "rewards/reward_func/std": 0.5322039723396301, "sampling/importance_sampling_ratio/max": 2.152245283126831, "sampling/importance_sampling_ratio/mean": 1.4744114875793457, "sampling/importance_sampling_ratio/min": 0.2157006412744522, "sampling/sampling_logp_difference/max": 1.0837020874023438, "sampling/sampling_logp_difference/mean": 0.02428443171083927, "step": 124, "step_time": 41.57415235800727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3349279761314392, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 1.2164572477340698, "kl": 0.15385711193084717, "learning_rate": 4.885035170166229e-06, "loss": -0.225, "num_tokens": 699998.0, "reward": 0.4737499952316284, "reward_std": 0.5991882681846619, "rewards/reward_func/mean": 0.4737499952316284, "rewards/reward_func/std": 0.5547441244125366, "sampling/importance_sampling_ratio/max": 1.5618447065353394, "sampling/importance_sampling_ratio/mean": 0.9376882314682007, "sampling/importance_sampling_ratio/min": 0.143229141831398, "sampling/sampling_logp_difference/max": 2.045600414276123, "sampling/sampling_logp_difference/mean": 0.02819056063890457, "step": 125, "step_time": 36.00430190899351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.33846554160118103, "epoch": 0.252, "frac_reward_zero_std": 0.0, "grad_norm": 0.9615040421485901, "kl": 0.028729338198900223, "learning_rate": 4.8825955273721524e-06, "loss": -0.0762, "num_tokens": 705652.0, "reward": 0.3187499940395355, "reward_std": 0.5555436015129089, "rewards/reward_func/mean": 0.3187499940395355, "rewards/reward_func/std": 0.5321770906448364, "sampling/importance_sampling_ratio/max": 1.5394591093063354, "sampling/importance_sampling_ratio/mean": 0.9313254356384277, "sampling/importance_sampling_ratio/min": 0.4095960855484009, "sampling/sampling_logp_difference/max": 0.9179900884628296, "sampling/sampling_logp_difference/mean": 0.027478884905576706, "step": 126, "step_time": 35.53052958998887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.317068874835968, "epoch": 0.254, "frac_reward_zero_std": 0.0, "grad_norm": 1.330291509628296, "kl": 0.018047351390123367, "learning_rate": 4.88013089227842e-06, "loss": -0.0729, "num_tokens": 710818.0, "reward": 0.21125000715255737, "reward_std": 0.31445786356925964, "rewards/reward_func/mean": 0.21125000715255737, "rewards/reward_func/std": 0.47555795311927795, "sampling/importance_sampling_ratio/max": 1.5873545408248901, "sampling/importance_sampling_ratio/mean": 1.018317461013794, "sampling/importance_sampling_ratio/min": 0.41594427824020386, "sampling/sampling_logp_difference/max": 1.0792710781097412, "sampling/sampling_logp_difference/mean": 0.02153525874018669, "step": 127, "step_time": 36.711535330003244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3007584810256958, "epoch": 0.256, "frac_reward_zero_std": 0.0, "grad_norm": 0.8899461030960083, "kl": 0.050801776349544525, "learning_rate": 4.8776412907378845e-06, "loss": 0.0411, "num_tokens": 716809.0, "reward": -0.03750000149011612, "reward_std": 0.028086936101317406, "rewards/reward_func/mean": -0.03750000149011612, "rewards/reward_func/std": 0.03011881187558174, "sampling/importance_sampling_ratio/max": 1.389890193939209, "sampling/importance_sampling_ratio/mean": 0.9472289085388184, "sampling/importance_sampling_ratio/min": 0.48205605149269104, "sampling/sampling_logp_difference/max": 0.8377180099487305, "sampling/sampling_logp_difference/mean": 0.020493997260928154, "step": 128, "step_time": 39.203543747993535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 49.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.30576610565185547, "epoch": 0.258, "frac_reward_zero_std": 0.0, "grad_norm": 0.9521754384040833, "kl": 0.053522251546382904, "learning_rate": 4.87512674886529e-06, "loss": -0.2217, "num_tokens": 721985.0, "reward": 0.20000000298023224, "reward_std": 0.3328213095664978, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.49856939911842346, "sampling/importance_sampling_ratio/max": 2.0129761695861816, "sampling/importance_sampling_ratio/mean": 0.9459276795387268, "sampling/importance_sampling_ratio/min": 0.1618739664554596, "sampling/sampling_logp_difference/max": 1.4156498908996582, "sampling/sampling_logp_difference/mean": 0.02784860134124756, "step": 129, "step_time": 41.28106502900482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.33031588792800903, "epoch": 0.26, "frac_reward_zero_std": 0.0, "grad_norm": 0.6887200474739075, "kl": 0.016769763082265854, "learning_rate": 4.872587293036991e-06, "loss": 0.2953, "num_tokens": 728153.0, "reward": 0.20124998688697815, "reward_std": 0.47694161534309387, "rewards/reward_func/mean": 0.20124998688697815, "rewards/reward_func/std": 0.441925585269928, "sampling/importance_sampling_ratio/max": 1.7701690196990967, "sampling/importance_sampling_ratio/mean": 0.8581265211105347, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9351005554199219, "sampling/sampling_logp_difference/mean": 0.024315927177667618, "step": 130, "step_time": 38.647780115003115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.35881704092025757, "epoch": 0.262, "frac_reward_zero_std": 0.0, "grad_norm": 1.5620861053466797, "kl": 0.01443159207701683, "learning_rate": 4.870022949890676e-06, "loss": 0.0968, "num_tokens": 734055.0, "reward": 0.5875000357627869, "reward_std": 0.5394585132598877, "rewards/reward_func/mean": 0.5875000357627869, "rewards/reward_func/std": 0.5231702923774719, "sampling/importance_sampling_ratio/max": 2.211554765701294, "sampling/importance_sampling_ratio/mean": 1.1617193222045898, "sampling/importance_sampling_ratio/min": 0.5277535915374756, "sampling/sampling_logp_difference/max": 0.6931544542312622, "sampling/sampling_logp_difference/mean": 0.024210434406995773, "step": 131, "step_time": 36.056461569998646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.29458680748939514, "epoch": 0.264, "frac_reward_zero_std": 0.0, "grad_norm": 0.909350574016571, "kl": 0.1462167501449585, "learning_rate": 4.867433746325093e-06, "loss": 0.1241, "num_tokens": 740181.0, "reward": -0.029999997466802597, "reward_std": 0.039329893887043, "rewards/reward_func/mean": -0.029999997466802597, "rewards/reward_func/std": 0.03664501756429672, "sampling/importance_sampling_ratio/max": 2.2302660942077637, "sampling/importance_sampling_ratio/mean": 0.8643943667411804, "sampling/importance_sampling_ratio/min": 0.17711393535137177, "sampling/sampling_logp_difference/max": 1.4919137954711914, "sampling/sampling_logp_difference/mean": 0.03085586428642273, "step": 132, "step_time": 41.68611562899605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 55.125, "completions/mean_terminated_length": 55.125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.2915087342262268, "epoch": 0.266, "frac_reward_zero_std": 0.0, "grad_norm": 1.1142926216125488, "kl": 0.030653204768896103, "learning_rate": 4.864819709499762e-06, "loss": -0.2321, "num_tokens": 745522.0, "reward": 0.3174999952316284, "reward_std": 0.586236298084259, "rewards/reward_func/mean": 0.3174999952316284, "rewards/reward_func/std": 0.562894344329834, "sampling/importance_sampling_ratio/max": 1.7828896045684814, "sampling/importance_sampling_ratio/mean": 1.0796490907669067, "sampling/importance_sampling_ratio/min": 0.3094936013221741, "sampling/sampling_logp_difference/max": 0.877861499786377, "sampling/sampling_logp_difference/mean": 0.0238037146627903, "step": 133, "step_time": 34.40468131400121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.35022756457328796, "epoch": 0.268, "frac_reward_zero_std": 0.0, "grad_norm": 1.6583671569824219, "kl": 0.21844248473644257, "learning_rate": 4.862180866834691e-06, "loss": -0.0494, "num_tokens": 751260.0, "reward": 0.4650000035762787, "reward_std": 0.6151957511901855, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.5702630877494812, "sampling/importance_sampling_ratio/max": 2.757086992263794, "sampling/importance_sampling_ratio/mean": 1.3203340768814087, "sampling/importance_sampling_ratio/min": 0.6060196757316589, "sampling/sampling_logp_difference/max": 0.6190414428710938, "sampling/sampling_logp_difference/mean": 0.025964640080928802, "step": 134, "step_time": 29.936322672001552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3812062740325928, "epoch": 0.27, "frac_reward_zero_std": 0.0, "grad_norm": 1.5890302658081055, "kl": 0.014861365780234337, "learning_rate": 4.8595172460100914e-06, "loss": 0.0607, "num_tokens": 756304.0, "reward": 0.03875000402331352, "reward_std": 0.271657258272171, "rewards/reward_func/mean": 0.03875000402331352, "rewards/reward_func/std": 0.35534441471099854, "sampling/importance_sampling_ratio/max": 2.5916507244110107, "sampling/importance_sampling_ratio/mean": 0.8855729103088379, "sampling/importance_sampling_ratio/min": 0.21787041425704956, "sampling/sampling_logp_difference/max": 0.9328546524047852, "sampling/sampling_logp_difference/mean": 0.03202846646308899, "step": 135, "step_time": 39.908553318004124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3220250606536865, "epoch": 0.272, "frac_reward_zero_std": 0.0, "grad_norm": 1.4078805446624756, "kl": 0.023584317415952682, "learning_rate": 4.856828874966086e-06, "loss": -0.1456, "num_tokens": 762477.0, "reward": 0.48750001192092896, "reward_std": 0.5153918266296387, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5482374429702759, "sampling/importance_sampling_ratio/max": 2.551933765411377, "sampling/importance_sampling_ratio/mean": 1.3002712726593018, "sampling/importance_sampling_ratio/min": 0.884465754032135, "sampling/sampling_logp_difference/max": 0.5747478008270264, "sampling/sampling_logp_difference/mean": 0.022638380527496338, "step": 136, "step_time": 32.25789829700079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 48.125, "completions/mean_terminated_length": 48.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3236207962036133, "epoch": 0.274, "frac_reward_zero_std": 0.0, "grad_norm": 0.9340776205062866, "kl": 0.03285123407840729, "learning_rate": 4.854115781902414e-06, "loss": 0.0131, "num_tokens": 768391.0, "reward": 0.21125000715255737, "reward_std": 0.5204670429229736, "rewards/reward_func/mean": 0.21125000715255737, "rewards/reward_func/std": 0.4824472665786743, "sampling/importance_sampling_ratio/max": 1.7785167694091797, "sampling/importance_sampling_ratio/mean": 0.8016531467437744, "sampling/importance_sampling_ratio/min": 0.2458231896162033, "sampling/sampling_logp_difference/max": 1.4985270500183105, "sampling/sampling_logp_difference/mean": 0.028025494888424873, "step": 137, "step_time": 34.520976280007744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3042897582054138, "epoch": 0.276, "frac_reward_zero_std": 0.0, "grad_norm": 1.278320550918579, "kl": 0.015772713348269463, "learning_rate": 4.851377995278138e-06, "loss": 0.2867, "num_tokens": 773957.0, "reward": 0.19749999046325684, "reward_std": 0.31783488392829895, "rewards/reward_func/mean": 0.19749999046325684, "rewards/reward_func/std": 0.48643749952316284, "sampling/importance_sampling_ratio/max": 2.4400241374969482, "sampling/importance_sampling_ratio/mean": 1.1646664142608643, "sampling/importance_sampling_ratio/min": 0.5930517911911011, "sampling/sampling_logp_difference/max": 0.5375218391418457, "sampling/sampling_logp_difference/mean": 0.023220781236886978, "step": 138, "step_time": 37.38816410599975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.35318320989608765, "epoch": 0.278, "frac_reward_zero_std": 0.0, "grad_norm": 1.0766141414642334, "kl": 0.018960729241371155, "learning_rate": 4.8486155438113455e-06, "loss": -0.2235, "num_tokens": 779691.0, "reward": 0.2175000011920929, "reward_std": 0.321079283952713, "rewards/reward_func/mean": 0.2175000011920929, "rewards/reward_func/std": 0.48352134227752686, "sampling/importance_sampling_ratio/max": 1.471369981765747, "sampling/importance_sampling_ratio/mean": 1.112367868423462, "sampling/importance_sampling_ratio/min": 0.4520459473133087, "sampling/sampling_logp_difference/max": 0.5129961967468262, "sampling/sampling_logp_difference/mean": 0.02581869438290596, "step": 139, "step_time": 36.47941235799226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.35746175050735474, "epoch": 0.28, "frac_reward_zero_std": 0.0, "grad_norm": 1.5763007402420044, "kl": 0.049079518765211105, "learning_rate": 4.845828456478843e-06, "loss": 0.1571, "num_tokens": 785207.0, "reward": 0.32500001788139343, "reward_std": 0.2978859543800354, "rewards/reward_func/mean": 0.32500001788139343, "rewards/reward_func/std": 0.5395500659942627, "sampling/importance_sampling_ratio/max": 1.674521803855896, "sampling/importance_sampling_ratio/mean": 0.9480903148651123, "sampling/importance_sampling_ratio/min": 0.11278916150331497, "sampling/sampling_logp_difference/max": 2.2472801208496094, "sampling/sampling_logp_difference/mean": 0.03470786660909653, "step": 140, "step_time": 40.7823389940022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.32371532917022705, "epoch": 0.282, "frac_reward_zero_std": 0.0, "grad_norm": 1.0982533693313599, "kl": 0.01844196580350399, "learning_rate": 4.84301676251586e-06, "loss": 0.1592, "num_tokens": 790173.0, "reward": 0.20874999463558197, "reward_std": 0.30841800570487976, "rewards/reward_func/mean": 0.20874999463558197, "rewards/reward_func/std": 0.45920541882514954, "sampling/importance_sampling_ratio/max": 1.9294414520263672, "sampling/importance_sampling_ratio/mean": 0.9358152747154236, "sampling/importance_sampling_ratio/min": 0.3433741331100464, "sampling/sampling_logp_difference/max": 1.0784287452697754, "sampling/sampling_logp_difference/mean": 0.024096664041280746, "step": 141, "step_time": 38.74317984600202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.34515082836151123, "epoch": 0.284, "frac_reward_zero_std": 0.0, "grad_norm": 1.8294340372085571, "kl": 0.030305206775665283, "learning_rate": 4.840180491415733e-06, "loss": -0.1339, "num_tokens": 795229.0, "reward": 0.3450000286102295, "reward_std": 0.5490354299545288, "rewards/reward_func/mean": 0.3450000286102295, "rewards/reward_func/std": 0.5287992358207703, "sampling/importance_sampling_ratio/max": 2.045395612716675, "sampling/importance_sampling_ratio/mean": 1.216064453125, "sampling/importance_sampling_ratio/min": 0.5218259692192078, "sampling/sampling_logp_difference/max": 1.2418723106384277, "sampling/sampling_logp_difference/mean": 0.03373716026544571, "step": 142, "step_time": 31.14222859099391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.875, "completions/mean_terminated_length": 54.875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.38218700885772705, "epoch": 0.286, "frac_reward_zero_std": 0.0, "grad_norm": 2.0369791984558105, "kl": 0.047582805156707764, "learning_rate": 4.837319672929606e-06, "loss": 0.6064, "num_tokens": 801778.0, "reward": -0.07625000178813934, "reward_std": 0.03596387431025505, "rewards/reward_func/mean": -0.07625000178813934, "rewards/reward_func/std": 0.04240535944700241, "sampling/importance_sampling_ratio/max": 2.7870662212371826, "sampling/importance_sampling_ratio/mean": 1.0109559297561646, "sampling/importance_sampling_ratio/min": 0.2245939075946808, "sampling/sampling_logp_difference/max": 0.9906719326972961, "sampling/sampling_logp_difference/mean": 0.031783588230609894, "step": 143, "step_time": 47.368925528993714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.125, "completions/mean_terminated_length": 53.125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3638345003128052, "epoch": 0.288, "frac_reward_zero_std": 0.0, "grad_norm": 0.9337130188941956, "kl": 0.024956434965133667, "learning_rate": 4.834434337066112e-06, "loss": -0.0504, "num_tokens": 808054.0, "reward": 0.09000000357627869, "reward_std": 0.2676810622215271, "rewards/reward_func/mean": 0.09000000357627869, "rewards/reward_func/std": 0.3601190149784088, "sampling/importance_sampling_ratio/max": 2.723515510559082, "sampling/importance_sampling_ratio/mean": 1.0728464126586914, "sampling/importance_sampling_ratio/min": 0.6166611909866333, "sampling/sampling_logp_difference/max": 0.7025502920150757, "sampling/sampling_logp_difference/mean": 0.026006463915109634, "step": 144, "step_time": 40.9219185700058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3282458782196045, "epoch": 0.29, "frac_reward_zero_std": 0.0, "grad_norm": 1.6762001514434814, "kl": 0.031979549676179886, "learning_rate": 4.831524514091056e-06, "loss": -0.1265, "num_tokens": 813504.0, "reward": 0.0624999962747097, "reward_std": 0.2866820991039276, "rewards/reward_func/mean": 0.0624999962747097, "rewards/reward_func/std": 0.36846205592155457, "sampling/importance_sampling_ratio/max": 1.8622187376022339, "sampling/importance_sampling_ratio/mean": 1.1275174617767334, "sampling/importance_sampling_ratio/min": 0.571599543094635, "sampling/sampling_logp_difference/max": 1.3647043704986572, "sampling/sampling_logp_difference/mean": 0.02731524407863617, "step": 145, "step_time": 38.23023704699881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3222322463989258, "epoch": 0.292, "frac_reward_zero_std": 0.0, "grad_norm": 0.9344398379325867, "kl": 0.0195845365524292, "learning_rate": 4.828590234527107e-06, "loss": 0.0898, "num_tokens": 818808.0, "reward": 0.5824999809265137, "reward_std": 0.5538103580474854, "rewards/reward_func/mean": 0.5824999809265137, "rewards/reward_func/std": 0.5363035202026367, "sampling/importance_sampling_ratio/max": 1.1425395011901855, "sampling/importance_sampling_ratio/mean": 0.6538075804710388, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8657850027084351, "sampling/sampling_logp_difference/mean": 0.02582240290939808, "step": 146, "step_time": 30.217266699997708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.31768035888671875, "epoch": 0.294, "frac_reward_zero_std": 0.0, "grad_norm": 1.3994672298431396, "kl": 0.02365160547196865, "learning_rate": 4.825631529153466e-06, "loss": -0.0764, "num_tokens": 824306.0, "reward": 0.35750001668930054, "reward_std": 0.2609923481941223, "rewards/reward_func/mean": 0.35750001668930054, "rewards/reward_func/std": 0.5296292901039124, "sampling/importance_sampling_ratio/max": 1.9504226446151733, "sampling/importance_sampling_ratio/mean": 1.1939644813537598, "sampling/importance_sampling_ratio/min": 0.5214744210243225, "sampling/sampling_logp_difference/max": 0.8518610000610352, "sampling/sampling_logp_difference/mean": 0.024599412456154823, "step": 147, "step_time": 38.60695266799303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.31512904167175293, "epoch": 0.296, "frac_reward_zero_std": 0.0, "grad_norm": 1.0805143117904663, "kl": 0.021506479009985924, "learning_rate": 4.8226484290055544e-06, "loss": 0.2375, "num_tokens": 830040.0, "reward": 0.19875000417232513, "reward_std": 0.5263077020645142, "rewards/reward_func/mean": 0.19875000417232513, "rewards/reward_func/std": 0.4873672127723694, "sampling/importance_sampling_ratio/max": 2.6040730476379395, "sampling/importance_sampling_ratio/mean": 1.1463204622268677, "sampling/importance_sampling_ratio/min": 0.38138559460639954, "sampling/sampling_logp_difference/max": 0.7039591073989868, "sampling/sampling_logp_difference/mean": 0.025920793414115906, "step": 148, "step_time": 34.074096187992836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3225823640823364, "epoch": 0.298, "frac_reward_zero_std": 0.0, "grad_norm": 1.075644850730896, "kl": 0.01727544516324997, "learning_rate": 4.8196409653746815e-06, "loss": -0.068, "num_tokens": 835585.0, "reward": 0.0637499988079071, "reward_std": 0.251331090927124, "rewards/reward_func/mean": 0.0637499988079071, "rewards/reward_func/std": 0.33940860629081726, "sampling/importance_sampling_ratio/max": 2.514382839202881, "sampling/importance_sampling_ratio/mean": 1.2466282844543457, "sampling/importance_sampling_ratio/min": 0.49027585983276367, "sampling/sampling_logp_difference/max": 0.5779092311859131, "sampling/sampling_logp_difference/mean": 0.020606372505426407, "step": 149, "step_time": 40.79852026500157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 55.125, "completions/mean_terminated_length": 55.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.34883180260658264, "epoch": 0.3, "frac_reward_zero_std": 0.0, "grad_norm": 0.851610541343689, "kl": 0.021933497861027718, "learning_rate": 4.8166091698077165e-06, "loss": 0.0618, "num_tokens": 840529.0, "reward": 0.20374999940395355, "reward_std": 0.32920289039611816, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.4870006740093231, "sampling/importance_sampling_ratio/max": 1.754447102546692, "sampling/importance_sampling_ratio/mean": 0.8660247921943665, "sampling/importance_sampling_ratio/min": 0.4203989803791046, "sampling/sampling_logp_difference/max": 0.9182287454605103, "sampling/sampling_logp_difference/mean": 0.024133939296007156, "step": 150, "step_time": 35.18480104600894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.28381434082984924, "epoch": 0.302, "frac_reward_zero_std": 0.0, "grad_norm": 0.7717716693878174, "kl": 0.01022813469171524, "learning_rate": 4.813553074106761e-06, "loss": 0.1029, "num_tokens": 845485.0, "reward": 0.20374999940395355, "reward_std": 0.29958459734916687, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.4509335458278656, "sampling/importance_sampling_ratio/max": 1.0267828702926636, "sampling/importance_sampling_ratio/mean": 0.7419992685317993, "sampling/importance_sampling_ratio/min": 0.43720394372940063, "sampling/sampling_logp_difference/max": 0.7798802852630615, "sampling/sampling_logp_difference/mean": 0.024541743099689484, "step": 151, "step_time": 34.449168086997815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3197404742240906, "epoch": 0.304, "frac_reward_zero_std": 0.0, "grad_norm": 1.2963764667510986, "kl": 0.021068178117275238, "learning_rate": 4.8104727103288125e-06, "loss": -0.0558, "num_tokens": 851304.0, "reward": 0.22374999523162842, "reward_std": 0.5079156160354614, "rewards/reward_func/mean": 0.22374999523162842, "rewards/reward_func/std": 0.47025638818740845, "sampling/importance_sampling_ratio/max": 1.4703145027160645, "sampling/importance_sampling_ratio/mean": 1.0212607383728027, "sampling/importance_sampling_ratio/min": 0.21392318606376648, "sampling/sampling_logp_difference/max": 1.0519495010375977, "sampling/sampling_logp_difference/mean": 0.025931382551789284, "step": 152, "step_time": 36.7780589239992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 57.75, "completions/mean_terminated_length": 57.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.3580837845802307, "epoch": 0.306, "frac_reward_zero_std": 0.0, "grad_norm": 1.017104148864746, "kl": 0.00937014352530241, "learning_rate": 4.80736811078543e-06, "loss": 0.0233, "num_tokens": 856882.0, "reward": 0.39499998092651367, "reward_std": 0.6293153166770935, "rewards/reward_func/mean": 0.39499998092651367, "rewards/reward_func/std": 0.5829482078552246, "sampling/importance_sampling_ratio/max": 1.3782010078430176, "sampling/importance_sampling_ratio/mean": 1.1638000011444092, "sampling/importance_sampling_ratio/min": 0.8089496493339539, "sampling/sampling_logp_difference/max": 0.35408473014831543, "sampling/sampling_logp_difference/mean": 0.018965404480695724, "step": 153, "step_time": 36.323780152990366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3402557373046875, "epoch": 0.308, "frac_reward_zero_std": 0.0, "grad_norm": 1.0118770599365234, "kl": 0.02948238141834736, "learning_rate": 4.804239308042392e-06, "loss": 0.1774, "num_tokens": 862416.0, "reward": 0.46875, "reward_std": 0.5176174640655518, "rewards/reward_func/mean": 0.46875, "rewards/reward_func/std": 0.560215175151825, "sampling/importance_sampling_ratio/max": 1.8436402082443237, "sampling/importance_sampling_ratio/mean": 1.0758095979690552, "sampling/importance_sampling_ratio/min": 0.30199378728866577, "sampling/sampling_logp_difference/max": 0.8697078227996826, "sampling/sampling_logp_difference/mean": 0.029135018587112427, "step": 154, "step_time": 34.09276152199891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.363218754529953, "epoch": 0.31, "frac_reward_zero_std": 0.0, "grad_norm": 1.3487781286239624, "kl": 0.022380828857421875, "learning_rate": 4.8010863349193605e-06, "loss": -0.1792, "num_tokens": 867974.0, "reward": 0.2150000035762787, "reward_std": 0.30277496576309204, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.4673328697681427, "sampling/importance_sampling_ratio/max": 2.5031354427337646, "sampling/importance_sampling_ratio/mean": 1.1692034006118774, "sampling/importance_sampling_ratio/min": 0.5184096097946167, "sampling/sampling_logp_difference/max": 0.4144449830055237, "sampling/sampling_logp_difference/mean": 0.02268042229115963, "step": 155, "step_time": 34.44658254900423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.34199315309524536, "epoch": 0.312, "frac_reward_zero_std": 0.0, "grad_norm": 1.1296980381011963, "kl": 0.03636087477207184, "learning_rate": 4.797909224489531e-06, "loss": 0.29, "num_tokens": 873668.0, "reward": 0.3400000035762787, "reward_std": 0.55422043800354, "rewards/reward_func/mean": 0.3400000035762787, "rewards/reward_func/std": 0.5347362756729126, "sampling/importance_sampling_ratio/max": 1.8167059421539307, "sampling/importance_sampling_ratio/mean": 1.0690479278564453, "sampling/importance_sampling_ratio/min": 0.3096846044063568, "sampling/sampling_logp_difference/max": 1.1487369537353516, "sampling/sampling_logp_difference/mean": 0.02510654367506504, "step": 156, "step_time": 43.02519494399894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.33920204639434814, "epoch": 0.314, "frac_reward_zero_std": 0.0, "grad_norm": 0.9249754548072815, "kl": 0.01282467320561409, "learning_rate": 4.794708010079288e-06, "loss": 0.1239, "num_tokens": 879587.0, "reward": 0.19749999046325684, "reward_std": 0.5256129503250122, "rewards/reward_func/mean": 0.19749999046325684, "rewards/reward_func/std": 0.4881379008293152, "sampling/importance_sampling_ratio/max": 2.3966870307922363, "sampling/importance_sampling_ratio/mean": 1.0402398109436035, "sampling/importance_sampling_ratio/min": 0.5133960843086243, "sampling/sampling_logp_difference/max": 0.4196079969406128, "sampling/sampling_logp_difference/mean": 0.023836011067032814, "step": 157, "step_time": 37.06165342399618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.35729801654815674, "epoch": 0.316, "frac_reward_zero_std": 0.0, "grad_norm": 1.6500968933105469, "kl": 0.020701207220554352, "learning_rate": 4.791482725267858e-06, "loss": -0.3411, "num_tokens": 884816.0, "reward": 0.05625000596046448, "reward_std": 0.299073725938797, "rewards/reward_func/mean": 0.05625000596046448, "rewards/reward_func/std": 0.38022318482398987, "sampling/importance_sampling_ratio/max": 2.0143370628356934, "sampling/importance_sampling_ratio/mean": 0.829423189163208, "sampling/importance_sampling_ratio/min": 0.4333471953868866, "sampling/sampling_logp_difference/max": 0.9994931221008301, "sampling/sampling_logp_difference/mean": 0.03577464818954468, "step": 158, "step_time": 36.08289574600349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.39358967542648315, "epoch": 0.318, "frac_reward_zero_std": 0.0, "grad_norm": 1.0583946704864502, "kl": 0.027374830096960068, "learning_rate": 4.78823340388695e-06, "loss": -0.2827, "num_tokens": 891153.0, "reward": 0.20124998688697815, "reward_std": 0.3190965950489044, "rewards/reward_func/mean": 0.20124998688697815, "rewards/reward_func/std": 0.46035507321357727, "sampling/importance_sampling_ratio/max": 2.10949444770813, "sampling/importance_sampling_ratio/mean": 1.0305712223052979, "sampling/importance_sampling_ratio/min": 0.3240506649017334, "sampling/sampling_logp_difference/max": 0.5563297271728516, "sampling/sampling_logp_difference/mean": 0.025423740968108177, "step": 159, "step_time": 38.957578446003026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3258906602859497, "epoch": 0.32, "frac_reward_zero_std": 0.0, "grad_norm": 1.1462774276733398, "kl": 0.03443189710378647, "learning_rate": 4.7849600800204075e-06, "loss": -0.1244, "num_tokens": 896743.0, "reward": 0.32124999165534973, "reward_std": 0.30975341796875, "rewards/reward_func/mean": 0.32124999165534973, "rewards/reward_func/std": 0.5477600693702698, "sampling/importance_sampling_ratio/max": 1.8812116384506226, "sampling/importance_sampling_ratio/mean": 1.0209946632385254, "sampling/importance_sampling_ratio/min": 0.4928432106971741, "sampling/sampling_logp_difference/max": 1.028313159942627, "sampling/sampling_logp_difference/mean": 0.025369469076395035, "step": 160, "step_time": 40.763587570007076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.2874404788017273, "epoch": 0.322, "frac_reward_zero_std": 0.0, "grad_norm": 0.8741604089736938, "kl": 0.036176808178424835, "learning_rate": 4.781662788003851e-06, "loss": 0.0124, "num_tokens": 901634.0, "reward": 0.33375000953674316, "reward_std": 0.5386093854904175, "rewards/reward_func/mean": 0.33375000953674316, "rewards/reward_func/std": 0.5257087349891663, "sampling/importance_sampling_ratio/max": 1.1090807914733887, "sampling/importance_sampling_ratio/mean": 0.8030825853347778, "sampling/importance_sampling_ratio/min": 0.5435417294502258, "sampling/sampling_logp_difference/max": 0.7872741222381592, "sampling/sampling_logp_difference/mean": 0.02408299222588539, "step": 161, "step_time": 36.742873462993884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 41.875, "completions/mean_terminated_length": 41.875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3158836364746094, "epoch": 0.324, "frac_reward_zero_std": 0.0, "grad_norm": 1.0320552587509155, "kl": 0.028184669092297554, "learning_rate": 4.778341562424312e-06, "loss": 0.0158, "num_tokens": 906954.0, "reward": 0.07124999910593033, "reward_std": 0.2818355858325958, "rewards/reward_func/mean": 0.07124999910593033, "rewards/reward_func/std": 0.3733798563480377, "sampling/importance_sampling_ratio/max": 1.5998609066009521, "sampling/importance_sampling_ratio/mean": 0.9003580212593079, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9920358657836914, "sampling/sampling_logp_difference/mean": 0.026213299483060837, "step": 162, "step_time": 36.50039826599823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.32623767852783203, "epoch": 0.326, "frac_reward_zero_std": 0.0, "grad_norm": 0.6853413581848145, "kl": 0.04150290787220001, "learning_rate": 4.774996438119876e-06, "loss": 0.0513, "num_tokens": 912348.0, "reward": 0.05125000327825546, "reward_std": 0.2890905439853668, "rewards/reward_func/mean": 0.05125000327825546, "rewards/reward_func/std": 0.380429744720459, "sampling/importance_sampling_ratio/max": 1.632591962814331, "sampling/importance_sampling_ratio/mean": 0.929560661315918, "sampling/importance_sampling_ratio/min": 0.5039329528808594, "sampling/sampling_logp_difference/max": 1.2554044723510742, "sampling/sampling_logp_difference/mean": 0.02818283624947071, "step": 163, "step_time": 37.48346809898794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.38629114627838135, "epoch": 0.328, "frac_reward_zero_std": 0.0, "grad_norm": 0.8776299357414246, "kl": 0.01975177228450775, "learning_rate": 4.771627450179315e-06, "loss": -0.0369, "num_tokens": 918373.0, "reward": 0.0925000011920929, "reward_std": 0.2773401141166687, "rewards/reward_func/mean": 0.0925000011920929, "rewards/reward_func/std": 0.36768582463264465, "sampling/importance_sampling_ratio/max": 1.5313901901245117, "sampling/importance_sampling_ratio/mean": 0.7385870218276978, "sampling/importance_sampling_ratio/min": 0.15066924691200256, "sampling/sampling_logp_difference/max": 1.0661274194717407, "sampling/sampling_logp_difference/mean": 0.02726839855313301, "step": 164, "step_time": 41.31438583700219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.34316372871398926, "epoch": 0.33, "frac_reward_zero_std": 0.0, "grad_norm": 1.3050645589828491, "kl": 0.04379383847117424, "learning_rate": 4.768234633941716e-06, "loss": -0.2008, "num_tokens": 924728.0, "reward": 0.4787500202655792, "reward_std": 0.033062152564525604, "rewards/reward_func/mean": 0.4787500202655792, "rewards/reward_func/std": 0.5450671911239624, "sampling/importance_sampling_ratio/max": 1.647208333015442, "sampling/importance_sampling_ratio/mean": 0.962598979473114, "sampling/importance_sampling_ratio/min": 0.3506295382976532, "sampling/sampling_logp_difference/max": 1.1214780807495117, "sampling/sampling_logp_difference/mean": 0.024030229076743126, "step": 165, "step_time": 37.69243161600025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 46.875, "completions/mean_terminated_length": 46.875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.32444441318511963, "epoch": 0.332, "frac_reward_zero_std": 0.0, "grad_norm": 1.2222055196762085, "kl": 0.01766137219965458, "learning_rate": 4.764818024996117e-06, "loss": -0.1451, "num_tokens": 930789.0, "reward": 0.20125000178813934, "reward_std": 0.30519673228263855, "rewards/reward_func/mean": 0.20125000178813934, "rewards/reward_func/std": 0.47792521119117737, "sampling/importance_sampling_ratio/max": 1.7913861274719238, "sampling/importance_sampling_ratio/mean": 1.0549870729446411, "sampling/importance_sampling_ratio/min": 0.6209827661514282, "sampling/sampling_logp_difference/max": 0.5931665897369385, "sampling/sampling_logp_difference/mean": 0.0224621519446373, "step": 166, "step_time": 44.832081535001635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3322504758834839, "epoch": 0.334, "frac_reward_zero_std": 0.0, "grad_norm": 1.0410302877426147, "kl": 0.0115932896733284, "learning_rate": 4.76137765918113e-06, "loss": 0.1055, "num_tokens": 935988.0, "reward": -0.04249999672174454, "reward_std": 0.042813271284103394, "rewards/reward_func/mean": -0.04249999672174454, "rewards/reward_func/std": 0.040620192885398865, "sampling/importance_sampling_ratio/max": 1.8506327867507935, "sampling/importance_sampling_ratio/mean": 0.9713721871376038, "sampling/importance_sampling_ratio/min": 0.3454587757587433, "sampling/sampling_logp_difference/max": 0.5269238948822021, "sampling/sampling_logp_difference/mean": 0.023420769721269608, "step": 167, "step_time": 37.73256101299194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3636493682861328, "epoch": 0.336, "frac_reward_zero_std": 0.0, "grad_norm": 1.0732991695404053, "kl": 0.02313004620373249, "learning_rate": 4.757913572584564e-06, "loss": 0.11, "num_tokens": 941321.0, "reward": 0.5987499952316284, "reward_std": 0.549216091632843, "rewards/reward_func/mean": 0.5987499952316284, "rewards/reward_func/std": 0.5237621665000916, "sampling/importance_sampling_ratio/max": 1.03546142578125, "sampling/importance_sampling_ratio/mean": 0.8432279825210571, "sampling/importance_sampling_ratio/min": 0.4068431258201599, "sampling/sampling_logp_difference/max": 0.6616531014442444, "sampling/sampling_logp_difference/mean": 0.024472419172525406, "step": 168, "step_time": 32.777487155995914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.35398632287979126, "epoch": 0.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.9844120144844055, "kl": 0.021729620173573494, "learning_rate": 4.754425801543047e-06, "loss": 0.0676, "num_tokens": 947289.0, "reward": 0.08124999701976776, "reward_std": 0.28951090574264526, "rewards/reward_func/mean": 0.08124999701976776, "rewards/reward_func/std": 0.37745150923728943, "sampling/importance_sampling_ratio/max": 1.1652858257293701, "sampling/importance_sampling_ratio/mean": 0.9127243757247925, "sampling/importance_sampling_ratio/min": 0.5623902678489685, "sampling/sampling_logp_difference/max": 0.7757892608642578, "sampling/sampling_logp_difference/mean": 0.023509806022047997, "step": 169, "step_time": 49.02694328399957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 49.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3460179567337036, "epoch": 0.34, "frac_reward_zero_std": 0.0, "grad_norm": 1.3282604217529297, "kl": 0.020951174199581146, "learning_rate": 4.750914382641647e-06, "loss": -0.2501, "num_tokens": 952689.0, "reward": 0.4937500059604645, "reward_std": 0.5788298845291138, "rewards/reward_func/mean": 0.4937500059604645, "rewards/reward_func/std": 0.5359088182449341, "sampling/importance_sampling_ratio/max": 2.5567233562469482, "sampling/importance_sampling_ratio/mean": 1.0252596139907837, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6747963428497314, "sampling/sampling_logp_difference/mean": 0.031033912673592567, "step": 170, "step_time": 30.625086041996838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 48.125, "completions/mean_terminated_length": 48.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3202894926071167, "epoch": 0.342, "frac_reward_zero_std": 0.0, "grad_norm": 2.965639114379883, "kl": 0.02088959515094757, "learning_rate": 4.747379352713489e-06, "loss": -0.0871, "num_tokens": 958452.0, "reward": 0.08999999612569809, "reward_std": 0.2740509510040283, "rewards/reward_func/mean": 0.08999999612569809, "rewards/reward_func/std": 0.36851051449775696, "sampling/importance_sampling_ratio/max": 2.276595115661621, "sampling/importance_sampling_ratio/mean": 1.3557054996490479, "sampling/importance_sampling_ratio/min": 0.7857190370559692, "sampling/sampling_logp_difference/max": 0.6380100250244141, "sampling/sampling_logp_difference/mean": 0.02812064066529274, "step": 171, "step_time": 45.271941720013274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 50.375, "completions/mean_terminated_length": 50.375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.36473971605300903, "epoch": 0.344, "frac_reward_zero_std": 0.0, "grad_norm": 0.925652027130127, "kl": 0.057864509522914886, "learning_rate": 4.743820748839362e-06, "loss": 0.086, "num_tokens": 963908.0, "reward": 0.34375, "reward_std": 0.2589721083641052, "rewards/reward_func/mean": 0.34375, "rewards/reward_func/std": 0.5190633535385132, "sampling/importance_sampling_ratio/max": 2.0226168632507324, "sampling/importance_sampling_ratio/mean": 0.9894812703132629, "sampling/importance_sampling_ratio/min": 0.14174574613571167, "sampling/sampling_logp_difference/max": 1.6586623191833496, "sampling/sampling_logp_difference/mean": 0.02724120020866394, "step": 172, "step_time": 35.92826945899287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.32610613107681274, "epoch": 0.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.7379596829414368, "kl": 0.04068184643983841, "learning_rate": 4.740238608347337e-06, "loss": 0.0258, "num_tokens": 969638.0, "reward": 0.08125000447034836, "reward_std": 0.28094959259033203, "rewards/reward_func/mean": 0.08125000447034836, "rewards/reward_func/std": 0.37357112765312195, "sampling/importance_sampling_ratio/max": 1.0961157083511353, "sampling/importance_sampling_ratio/mean": 0.7261965274810791, "sampling/importance_sampling_ratio/min": 0.32102635502815247, "sampling/sampling_logp_difference/max": 1.0670392513275146, "sampling/sampling_logp_difference/mean": 0.026620227843523026, "step": 173, "step_time": 37.101911647987436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.310922771692276, "epoch": 0.348, "frac_reward_zero_std": 0.0, "grad_norm": 0.942238450050354, "kl": 0.011856799945235252, "learning_rate": 4.736632968812374e-06, "loss": -0.0206, "num_tokens": 974836.0, "reward": 0.4750000238418579, "reward_std": 0.6064904928207397, "rewards/reward_func/mean": 0.4750000238418579, "rewards/reward_func/std": 0.561553955078125, "sampling/importance_sampling_ratio/max": 1.7288455963134766, "sampling/importance_sampling_ratio/mean": 0.8856727480888367, "sampling/importance_sampling_ratio/min": 0.38050374388694763, "sampling/sampling_logp_difference/max": 1.0040206909179688, "sampling/sampling_logp_difference/mean": 0.021355075761675835, "step": 174, "step_time": 28.776191511002253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.39204347133636475, "epoch": 0.35, "frac_reward_zero_std": 0.0, "grad_norm": 1.9643235206604004, "kl": 0.020943326875567436, "learning_rate": 4.733003868055923e-06, "loss": -0.1772, "num_tokens": 980946.0, "reward": 0.19249999523162842, "reward_std": 0.519625186920166, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.4812706410884857, "sampling/importance_sampling_ratio/max": 1.807846188545227, "sampling/importance_sampling_ratio/mean": 1.206834077835083, "sampling/importance_sampling_ratio/min": 0.4919049143791199, "sampling/sampling_logp_difference/max": 0.673508882522583, "sampling/sampling_logp_difference/mean": 0.027322232723236084, "step": 175, "step_time": 44.86855116499646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3516603112220764, "epoch": 0.352, "frac_reward_zero_std": 0.0, "grad_norm": 1.0344626903533936, "kl": 0.013177501037716866, "learning_rate": 4.729351344145536e-06, "loss": -0.0118, "num_tokens": 986362.0, "reward": 0.23125000298023224, "reward_std": 0.30514803528785706, "rewards/reward_func/mean": 0.23125000298023224, "rewards/reward_func/std": 0.46887215971946716, "sampling/importance_sampling_ratio/max": 1.2862433195114136, "sampling/importance_sampling_ratio/mean": 0.8300943374633789, "sampling/importance_sampling_ratio/min": 0.2423274964094162, "sampling/sampling_logp_difference/max": 0.8146283626556396, "sampling/sampling_logp_difference/mean": 0.028766807168722153, "step": 176, "step_time": 33.98946770199109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.30126285552978516, "epoch": 0.354, "frac_reward_zero_std": 0.0, "grad_norm": 0.9718162417411804, "kl": 0.047380391508340836, "learning_rate": 4.725675435394461e-06, "loss": -0.0925, "num_tokens": 991859.0, "reward": 0.3362500071525574, "reward_std": 0.570709228515625, "rewards/reward_func/mean": 0.3362500071525574, "rewards/reward_func/std": 0.5503749251365662, "sampling/importance_sampling_ratio/max": 1.8823354244232178, "sampling/importance_sampling_ratio/mean": 0.8540709018707275, "sampling/importance_sampling_ratio/min": 0.27412620186805725, "sampling/sampling_logp_difference/max": 1.3906545639038086, "sampling/sampling_logp_difference/mean": 0.02667493373155594, "step": 177, "step_time": 32.61802449199604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.375, "completions/mean_terminated_length": 49.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.30542075634002686, "epoch": 0.356, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640173196792603, "kl": 0.08134660124778748, "learning_rate": 4.721976180361239e-06, "loss": -0.0737, "num_tokens": 996941.0, "reward": 0.33249998092651367, "reward_std": 0.547777533531189, "rewards/reward_func/mean": 0.33249998092651367, "rewards/reward_func/std": 0.5260295867919922, "sampling/importance_sampling_ratio/max": 2.168469190597534, "sampling/importance_sampling_ratio/mean": 0.9947339296340942, "sampling/importance_sampling_ratio/min": 0.20307093858718872, "sampling/sampling_logp_difference/max": 1.36405611038208, "sampling/sampling_logp_difference/mean": 0.026516852900385857, "step": 178, "step_time": 34.52733155799797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.31798258423805237, "epoch": 0.358, "frac_reward_zero_std": 0.0, "grad_norm": 0.9152070879936218, "kl": 0.06630422919988632, "learning_rate": 4.718253617849306e-06, "loss": 0.0036, "num_tokens": 1002934.0, "reward": 0.20499999821186066, "reward_std": 0.5118623971939087, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.47473302483558655, "sampling/importance_sampling_ratio/max": 1.6232030391693115, "sampling/importance_sampling_ratio/mean": 0.7850979566574097, "sampling/importance_sampling_ratio/min": 0.14968660473823547, "sampling/sampling_logp_difference/max": 0.8372163772583008, "sampling/sampling_logp_difference/mean": 0.026052938774228096, "step": 179, "step_time": 43.08452248299727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.361512690782547, "epoch": 0.36, "frac_reward_zero_std": 0.0, "grad_norm": 0.9003314971923828, "kl": 0.021023273468017578, "learning_rate": 4.7145077869065815e-06, "loss": -0.0662, "num_tokens": 1008504.0, "reward": 0.33375000953674316, "reward_std": 0.569517970085144, "rewards/reward_func/mean": 0.33375000953674316, "rewards/reward_func/std": 0.5475122332572937, "sampling/importance_sampling_ratio/max": 1.3407214879989624, "sampling/importance_sampling_ratio/mean": 0.9676664471626282, "sampling/importance_sampling_ratio/min": 0.5192452073097229, "sampling/sampling_logp_difference/max": 0.6442506909370422, "sampling/sampling_logp_difference/mean": 0.026277683675289154, "step": 180, "step_time": 35.98104014099226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.348086416721344, "epoch": 0.362, "frac_reward_zero_std": 0.0, "grad_norm": 1.418018102645874, "kl": 0.05438853055238724, "learning_rate": 4.710738726825059e-06, "loss": 0.3842, "num_tokens": 1014483.0, "reward": 0.3474999964237213, "reward_std": 0.538013756275177, "rewards/reward_func/mean": 0.3474999964237213, "rewards/reward_func/std": 0.5186452269554138, "sampling/importance_sampling_ratio/max": 2.6314172744750977, "sampling/importance_sampling_ratio/mean": 1.0734989643096924, "sampling/importance_sampling_ratio/min": 0.10073666274547577, "sampling/sampling_logp_difference/max": 1.0612448453903198, "sampling/sampling_logp_difference/mean": 0.027826346457004547, "step": 181, "step_time": 31.0331748949975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3266671299934387, "epoch": 0.364, "frac_reward_zero_std": 0.0, "grad_norm": 1.193312168121338, "kl": 0.09049133956432343, "learning_rate": 4.706946477140396e-06, "loss": -0.1725, "num_tokens": 1019583.0, "reward": 0.46000003814697266, "reward_std": 0.5269915461540222, "rewards/reward_func/mean": 0.46000003814697266, "rewards/reward_func/std": 0.55250084400177, "sampling/importance_sampling_ratio/max": 1.7039434909820557, "sampling/importance_sampling_ratio/mean": 1.0256106853485107, "sampling/importance_sampling_ratio/min": 0.3618232309818268, "sampling/sampling_logp_difference/max": 0.6861605644226074, "sampling/sampling_logp_difference/mean": 0.02597002312541008, "step": 182, "step_time": 32.926690759995836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3671197295188904, "epoch": 0.366, "frac_reward_zero_std": 0.0, "grad_norm": 1.113112211227417, "kl": 0.1199764832854271, "learning_rate": 4.703131077631498e-06, "loss": -0.1404, "num_tokens": 1025042.0, "reward": 0.08250000327825546, "reward_std": 0.27332803606987, "rewards/reward_func/mean": 0.08250000327825546, "rewards/reward_func/std": 0.37297070026397705, "sampling/importance_sampling_ratio/max": 1.334097146987915, "sampling/importance_sampling_ratio/mean": 0.5801923274993896, "sampling/importance_sampling_ratio/min": 0.1981949657201767, "sampling/sampling_logp_difference/max": 1.5971126556396484, "sampling/sampling_logp_difference/mean": 0.035717226564884186, "step": 183, "step_time": 34.52302819299803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3453419804573059, "epoch": 0.368, "frac_reward_zero_std": 0.0, "grad_norm": 1.4540451765060425, "kl": 0.02787589468061924, "learning_rate": 4.699292568320097e-06, "loss": -0.1431, "num_tokens": 1030316.0, "reward": 0.20125000178813934, "reward_std": 0.32598677277565, "rewards/reward_func/mean": 0.20125000178813934, "rewards/reward_func/std": 0.4764583110809326, "sampling/importance_sampling_ratio/max": 2.2556746006011963, "sampling/importance_sampling_ratio/mean": 1.255730390548706, "sampling/importance_sampling_ratio/min": 0.7337526679039001, "sampling/sampling_logp_difference/max": 0.6424871683120728, "sampling/sampling_logp_difference/mean": 0.03114481456577778, "step": 184, "step_time": 40.47120292400359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 50.625, "completions/mean_terminated_length": 50.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3271094262599945, "epoch": 0.37, "frac_reward_zero_std": 0.0, "grad_norm": 0.8317610025405884, "kl": 0.015337463468313217, "learning_rate": 4.6954309894703435e-06, "loss": 0.132, "num_tokens": 1035537.0, "reward": 0.33500000834465027, "reward_std": 0.571208119392395, "rewards/reward_func/mean": 0.33500000834465027, "rewards/reward_func/std": 0.5463123321533203, "sampling/importance_sampling_ratio/max": 1.6981804370880127, "sampling/importance_sampling_ratio/mean": 0.7978900671005249, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0647196769714355, "sampling/sampling_logp_difference/mean": 0.025768490508198738, "step": 185, "step_time": 36.403069805994164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3013562560081482, "epoch": 0.372, "frac_reward_zero_std": 0.0, "grad_norm": 1.3188611268997192, "kl": 0.0544801689684391, "learning_rate": 4.69154638158837e-06, "loss": 0.3933, "num_tokens": 1041100.0, "reward": 0.06624999642372131, "reward_std": 0.28185734152793884, "rewards/reward_func/mean": 0.06624999642372131, "rewards/reward_func/std": 0.3781132698059082, "sampling/importance_sampling_ratio/max": 1.8907493352890015, "sampling/importance_sampling_ratio/mean": 0.9408384561538696, "sampling/importance_sampling_ratio/min": 0.3894776403903961, "sampling/sampling_logp_difference/max": 0.856117844581604, "sampling/sampling_logp_difference/mean": 0.027976665645837784, "step": 186, "step_time": 45.57103870699939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 48.125, "completions/mean_terminated_length": 48.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.39340823888778687, "epoch": 0.374, "frac_reward_zero_std": 0.0, "grad_norm": 1.1754374504089355, "kl": 0.020966093987226486, "learning_rate": 4.687638785421875e-06, "loss": -0.0169, "num_tokens": 1048354.0, "reward": 0.32375001907348633, "reward_std": 0.5538316965103149, "rewards/reward_func/mean": 0.32375001907348633, "rewards/reward_func/std": 0.5343871712684631, "sampling/importance_sampling_ratio/max": 2.006873846054077, "sampling/importance_sampling_ratio/mean": 0.89984130859375, "sampling/importance_sampling_ratio/min": 0.4372103810310364, "sampling/sampling_logp_difference/max": 0.740842342376709, "sampling/sampling_logp_difference/mean": 0.029992148280143738, "step": 187, "step_time": 50.629349813010776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3264714479446411, "epoch": 0.376, "frac_reward_zero_std": 0.0, "grad_norm": 1.4700989723205566, "kl": 0.01745157688856125, "learning_rate": 4.683708241959694e-06, "loss": 0.3489, "num_tokens": 1054088.0, "reward": 0.5987499952316284, "reward_std": 0.545418381690979, "rewards/reward_func/mean": 0.5987499952316284, "rewards/reward_func/std": 0.524034857749939, "sampling/importance_sampling_ratio/max": 2.040449857711792, "sampling/importance_sampling_ratio/mean": 1.0357085466384888, "sampling/importance_sampling_ratio/min": 0.47571852803230286, "sampling/sampling_logp_difference/max": 0.6641407012939453, "sampling/sampling_logp_difference/mean": 0.027400383725762367, "step": 188, "step_time": 30.43117203100701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3622972071170807, "epoch": 0.378, "frac_reward_zero_std": 0.0, "grad_norm": 0.803978681564331, "kl": 0.015578195452690125, "learning_rate": 4.679754792431368e-06, "loss": 0.0812, "num_tokens": 1059205.0, "reward": 0.4424999952316284, "reward_std": 0.6157840490341187, "rewards/reward_func/mean": 0.4424999952316284, "rewards/reward_func/std": 0.5730806589126587, "sampling/importance_sampling_ratio/max": 1.8119560480117798, "sampling/importance_sampling_ratio/mean": 0.7570701837539673, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7887172698974609, "sampling/sampling_logp_difference/mean": 0.023639146238565445, "step": 189, "step_time": 29.315127633002703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.36829447746276855, "epoch": 0.38, "frac_reward_zero_std": 0.0, "grad_norm": 1.311635136604309, "kl": 0.03190737962722778, "learning_rate": 4.675778478306712e-06, "loss": 0.068, "num_tokens": 1064888.0, "reward": 0.0625000074505806, "reward_std": 0.2896197438240051, "rewards/reward_func/mean": 0.0625000074505806, "rewards/reward_func/std": 0.3778038024902344, "sampling/importance_sampling_ratio/max": 2.0182361602783203, "sampling/importance_sampling_ratio/mean": 1.091983675956726, "sampling/importance_sampling_ratio/min": 0.5430738925933838, "sampling/sampling_logp_difference/max": 0.522942304611206, "sampling/sampling_logp_difference/mean": 0.02682231366634369, "step": 190, "step_time": 38.76032414600195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.625, "completions/mean_terminated_length": 51.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.33380353450775146, "epoch": 0.382, "frac_reward_zero_std": 0.0, "grad_norm": 1.2356117963790894, "kl": 0.012882580980658531, "learning_rate": 4.671779341295378e-06, "loss": -0.0225, "num_tokens": 1069880.0, "reward": 0.4675000011920929, "reward_std": 0.5203424692153931, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.548341691493988, "sampling/importance_sampling_ratio/max": 1.7154066562652588, "sampling/importance_sampling_ratio/mean": 1.2481818199157715, "sampling/importance_sampling_ratio/min": 0.59772789478302, "sampling/sampling_logp_difference/max": 0.43304991722106934, "sampling/sampling_logp_difference/mean": 0.022705163806676865, "step": 191, "step_time": 34.24753138900269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3223872184753418, "epoch": 0.384, "frac_reward_zero_std": 0.0, "grad_norm": 1.1335781812667847, "kl": 0.05066206306219101, "learning_rate": 4.667757423346423e-06, "loss": -0.2483, "num_tokens": 1074868.0, "reward": 0.32625001668930054, "reward_std": 0.5714980363845825, "rewards/reward_func/mean": 0.32625001668930054, "rewards/reward_func/std": 0.5501672029495239, "sampling/importance_sampling_ratio/max": 2.406949520111084, "sampling/importance_sampling_ratio/mean": 1.1955797672271729, "sampling/importance_sampling_ratio/min": 0.2920358180999756, "sampling/sampling_logp_difference/max": 1.1931378841400146, "sampling/sampling_logp_difference/mean": 0.02424553781747818, "step": 192, "step_time": 29.6137595110049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3726162314414978, "epoch": 0.386, "frac_reward_zero_std": 0.0, "grad_norm": 0.8356767892837524, "kl": 0.020604528486728668, "learning_rate": 4.663712766647862e-06, "loss": 0.143, "num_tokens": 1081294.0, "reward": 0.06499999761581421, "reward_std": 0.3063734769821167, "rewards/reward_func/mean": 0.06499999761581421, "rewards/reward_func/std": 0.38172540068626404, "sampling/importance_sampling_ratio/max": 1.441202163696289, "sampling/importance_sampling_ratio/mean": 0.7192156314849854, "sampling/importance_sampling_ratio/min": 0.4092276692390442, "sampling/sampling_logp_difference/max": 0.7641327381134033, "sampling/sampling_logp_difference/mean": 0.0280342735350132, "step": 193, "step_time": 40.439407575002406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.2863295376300812, "epoch": 0.388, "frac_reward_zero_std": 0.0, "grad_norm": 0.9479438662528992, "kl": 0.02047712728381157, "learning_rate": 4.65964541362623e-06, "loss": -0.1188, "num_tokens": 1086752.0, "reward": 0.33375000953674316, "reward_std": 0.5453760623931885, "rewards/reward_func/mean": 0.33375000953674316, "rewards/reward_func/std": 0.5247839689254761, "sampling/importance_sampling_ratio/max": 1.8372682332992554, "sampling/importance_sampling_ratio/mean": 0.9729915857315063, "sampling/importance_sampling_ratio/min": 0.4610009491443634, "sampling/sampling_logp_difference/max": 0.9207382202148438, "sampling/sampling_logp_difference/mean": 0.022392306476831436, "step": 194, "step_time": 35.24215532100061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.32116204500198364, "epoch": 0.39, "frac_reward_zero_std": 0.0, "grad_norm": 1.1504021883010864, "kl": 0.0485202930867672, "learning_rate": 4.655555406946135e-06, "loss": 0.0483, "num_tokens": 1091989.0, "reward": 0.19874998927116394, "reward_std": 0.5272426605224609, "rewards/reward_func/mean": 0.19874998927116394, "rewards/reward_func/std": 0.4883042871952057, "sampling/importance_sampling_ratio/max": 2.2885639667510986, "sampling/importance_sampling_ratio/mean": 0.93783038854599, "sampling/importance_sampling_ratio/min": 0.25737592577934265, "sampling/sampling_logp_difference/max": 0.6654841899871826, "sampling/sampling_logp_difference/mean": 0.022658096626400948, "step": 195, "step_time": 32.107545826991554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.32712772488594055, "epoch": 0.392, "frac_reward_zero_std": 0.0, "grad_norm": 0.951930820941925, "kl": 0.027490120381116867, "learning_rate": 4.651442789509813e-06, "loss": -0.1811, "num_tokens": 1097346.0, "reward": 0.08249999582767487, "reward_std": 0.2915714383125305, "rewards/reward_func/mean": 0.08249999582767487, "rewards/reward_func/std": 0.373430073261261, "sampling/importance_sampling_ratio/max": 1.5322624444961548, "sampling/importance_sampling_ratio/mean": 0.8581938743591309, "sampling/importance_sampling_ratio/min": 0.5205777883529663, "sampling/sampling_logp_difference/max": 0.3183736801147461, "sampling/sampling_logp_difference/mean": 0.018254410475492477, "step": 196, "step_time": 35.83734907700273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 46.375, "completions/mean_terminated_length": 46.375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.39830613136291504, "epoch": 0.394, "frac_reward_zero_std": 0.0, "grad_norm": 1.7966833114624023, "kl": 0.026175260543823242, "learning_rate": 4.647307604456675e-06, "loss": -0.195, "num_tokens": 1103633.0, "reward": 0.20124998688697815, "reward_std": 0.5211325883865356, "rewards/reward_func/mean": 0.20124998688697815, "rewards/reward_func/std": 0.48300954699516296, "sampling/importance_sampling_ratio/max": 1.6678045988082886, "sampling/importance_sampling_ratio/mean": 0.7011741399765015, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3222107887268066, "sampling/sampling_logp_difference/mean": 0.031116489320993423, "step": 197, "step_time": 42.751525571002276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3397493362426758, "epoch": 0.396, "frac_reward_zero_std": 0.0, "grad_norm": 1.7519938945770264, "kl": 0.02400045096874237, "learning_rate": 4.643149895162854e-06, "loss": -0.0213, "num_tokens": 1108925.0, "reward": 0.05000000447034836, "reward_std": 0.28527507185935974, "rewards/reward_func/mean": 0.05000000447034836, "rewards/reward_func/std": 0.38455912470817566, "sampling/importance_sampling_ratio/max": 2.140333890914917, "sampling/importance_sampling_ratio/mean": 1.1589810848236084, "sampling/importance_sampling_ratio/min": 0.36448001861572266, "sampling/sampling_logp_difference/max": 0.5630743503570557, "sampling/sampling_logp_difference/mean": 0.02603762596845627, "step": 198, "step_time": 34.99613412701001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3567110598087311, "epoch": 0.398, "frac_reward_zero_std": 0.0, "grad_norm": 1.1194331645965576, "kl": 0.018017902970314026, "learning_rate": 4.6389697052407535e-06, "loss": -0.1716, "num_tokens": 1114668.0, "reward": 0.05625000596046448, "reward_std": 0.29238367080688477, "rewards/reward_func/mean": 0.05625000596046448, "rewards/reward_func/std": 0.36862435936927795, "sampling/importance_sampling_ratio/max": 1.573565125465393, "sampling/importance_sampling_ratio/mean": 1.1634211540222168, "sampling/importance_sampling_ratio/min": 0.45238780975341797, "sampling/sampling_logp_difference/max": 0.3439953327178955, "sampling/sampling_logp_difference/mean": 0.020245444029569626, "step": 199, "step_time": 37.81699361599749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 53.125, "completions/mean_terminated_length": 53.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.33191120624542236, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 1.3887816667556763, "kl": 0.012048996984958649, "learning_rate": 4.634767078538589e-06, "loss": -0.3053, "num_tokens": 1120285.0, "reward": 0.16124999523162842, "reward_std": 0.5615929365158081, "rewards/reward_func/mean": 0.16124999523162842, "rewards/reward_func/std": 0.5200669765472412, "sampling/importance_sampling_ratio/max": 2.2076382637023926, "sampling/importance_sampling_ratio/mean": 0.9878512620925903, "sampling/importance_sampling_ratio/min": 0.5573294162750244, "sampling/sampling_logp_difference/max": 0.4176292419433594, "sampling/sampling_logp_difference/mean": 0.025101102888584137, "step": 200, "step_time": 40.36116412401316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3108067512512207, "epoch": 0.402, "frac_reward_zero_std": 0.0, "grad_norm": 2.10872745513916, "kl": 0.01511746272444725, "learning_rate": 4.630542059139923e-06, "loss": 0.1775, "num_tokens": 1125744.0, "reward": 0.06999999284744263, "reward_std": 0.28674817085266113, "rewards/reward_func/mean": 0.06999999284744263, "rewards/reward_func/std": 0.3770941495895386, "sampling/importance_sampling_ratio/max": 1.9700719118118286, "sampling/importance_sampling_ratio/mean": 1.2726753950119019, "sampling/importance_sampling_ratio/min": 0.38142406940460205, "sampling/sampling_logp_difference/max": 0.5226891040802002, "sampling/sampling_logp_difference/mean": 0.024665817618370056, "step": 201, "step_time": 40.36325747499359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.3040258288383484, "epoch": 0.404, "frac_reward_zero_std": 0.0, "grad_norm": 1.3096510171890259, "kl": 0.024469975382089615, "learning_rate": 4.626294691363213e-06, "loss": -0.0912, "num_tokens": 1131445.0, "reward": 0.16249999403953552, "reward_std": 0.5160882472991943, "rewards/reward_func/mean": 0.16249999403953552, "rewards/reward_func/std": 0.47918832302093506, "sampling/importance_sampling_ratio/max": 2.0163538455963135, "sampling/importance_sampling_ratio/mean": 1.1956194639205933, "sampling/importance_sampling_ratio/min": 0.6150079369544983, "sampling/sampling_logp_difference/max": 0.5746273994445801, "sampling/sampling_logp_difference/mean": 0.020709635689854622, "step": 202, "step_time": 47.36363880299905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3916836977005005, "epoch": 0.406, "frac_reward_zero_std": 0.0, "grad_norm": 1.0519437789916992, "kl": 0.02574199251830578, "learning_rate": 4.622025019761336e-06, "loss": 0.1733, "num_tokens": 1136838.0, "reward": 0.32750001549720764, "reward_std": 0.5597944259643555, "rewards/reward_func/mean": 0.32750001549720764, "rewards/reward_func/std": 0.5453636050224304, "sampling/importance_sampling_ratio/max": 1.9452093839645386, "sampling/importance_sampling_ratio/mean": 1.0770931243896484, "sampling/importance_sampling_ratio/min": 0.3852846026420593, "sampling/sampling_logp_difference/max": 0.5705301761627197, "sampling/sampling_logp_difference/mean": 0.027909845113754272, "step": 203, "step_time": 33.44002888299292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.36012089252471924, "epoch": 0.408, "frac_reward_zero_std": 0.0, "grad_norm": 1.2741502523422241, "kl": 0.014535860158503056, "learning_rate": 4.617733089121127e-06, "loss": 0.0169, "num_tokens": 1141938.0, "reward": 0.21375000476837158, "reward_std": 0.32701846957206726, "rewards/reward_func/mean": 0.21375000476837158, "rewards/reward_func/std": 0.48091989755630493, "sampling/importance_sampling_ratio/max": 1.4291470050811768, "sampling/importance_sampling_ratio/mean": 0.9600297212600708, "sampling/importance_sampling_ratio/min": 0.23359182476997375, "sampling/sampling_logp_difference/max": 0.5247984528541565, "sampling/sampling_logp_difference/mean": 0.02588305063545704, "step": 204, "step_time": 34.81417223700555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 46.625, "completions/mean_terminated_length": 46.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3439635932445526, "epoch": 0.41, "frac_reward_zero_std": 0.0, "grad_norm": 0.9802369475364685, "kl": 0.047367069870233536, "learning_rate": 4.613418944462907e-06, "loss": 0.1454, "num_tokens": 1147427.0, "reward": 0.19749999046325684, "reward_std": 0.5340633392333984, "rewards/reward_func/mean": 0.19749999046325684, "rewards/reward_func/std": 0.49505412578582764, "sampling/importance_sampling_ratio/max": 1.7386454343795776, "sampling/importance_sampling_ratio/mean": 0.8837832808494568, "sampling/importance_sampling_ratio/min": 0.38633158802986145, "sampling/sampling_logp_difference/max": 0.8758201599121094, "sampling/sampling_logp_difference/mean": 0.03084520995616913, "step": 205, "step_time": 34.74832553599845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.375, "completions/mean_terminated_length": 49.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3894272446632385, "epoch": 0.412, "frac_reward_zero_std": 0.0, "grad_norm": 1.4726861715316772, "kl": 0.014239441603422165, "learning_rate": 4.609082631040012e-06, "loss": -0.0745, "num_tokens": 1152633.0, "reward": 0.20000000298023224, "reward_std": 0.5340527892112732, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.4956093430519104, "sampling/importance_sampling_ratio/max": 1.6325794458389282, "sampling/importance_sampling_ratio/mean": 0.9623055458068848, "sampling/importance_sampling_ratio/min": 0.37618598341941833, "sampling/sampling_logp_difference/max": 0.5434499979019165, "sampling/sampling_logp_difference/mean": 0.027271058410406113, "step": 206, "step_time": 33.83777969199582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.37837862968444824, "epoch": 0.414, "frac_reward_zero_std": 0.0, "grad_norm": 2.0193114280700684, "kl": 0.02881534770131111, "learning_rate": 4.604724194338318e-06, "loss": 0.2341, "num_tokens": 1157895.0, "reward": -0.05375000089406967, "reward_std": 0.03522847592830658, "rewards/reward_func/mean": -0.05375000089406967, "rewards/reward_func/std": 0.03335416316986084, "sampling/importance_sampling_ratio/max": 2.33683180809021, "sampling/importance_sampling_ratio/mean": 1.0109095573425293, "sampling/importance_sampling_ratio/min": 0.38880789279937744, "sampling/sampling_logp_difference/max": 0.8555021286010742, "sampling/sampling_logp_difference/mean": 0.030559619888663292, "step": 207, "step_time": 42.82284573499055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.30469053983688354, "epoch": 0.416, "frac_reward_zero_std": 0.0, "grad_norm": 1.4579194784164429, "kl": 0.02601119875907898, "learning_rate": 4.600343680075764e-06, "loss": 0.0158, "num_tokens": 1163475.0, "reward": 0.4625000059604645, "reward_std": 0.03025972843170166, "rewards/reward_func/mean": 0.4625000059604645, "rewards/reward_func/std": 0.559508740901947, "sampling/importance_sampling_ratio/max": 2.2815041542053223, "sampling/importance_sampling_ratio/mean": 1.3477814197540283, "sampling/importance_sampling_ratio/min": 0.47539442777633667, "sampling/sampling_logp_difference/max": 0.38339972496032715, "sampling/sampling_logp_difference/mean": 0.01848520152270794, "step": 208, "step_time": 29.89221857400844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.33555877208709717, "epoch": 0.418, "frac_reward_zero_std": 0.0, "grad_norm": 1.3514803647994995, "kl": 0.015731051564216614, "learning_rate": 4.5959411342018715e-06, "loss": -0.0843, "num_tokens": 1168580.0, "reward": 0.5887500047683716, "reward_std": 0.5457277894020081, "rewards/reward_func/mean": 0.5887500047683716, "rewards/reward_func/std": 0.5272689461708069, "sampling/importance_sampling_ratio/max": 1.7015126943588257, "sampling/importance_sampling_ratio/mean": 1.1384941339492798, "sampling/importance_sampling_ratio/min": 0.5749424695968628, "sampling/sampling_logp_difference/max": 0.4752795696258545, "sampling/sampling_logp_difference/mean": 0.024254605174064636, "step": 209, "step_time": 27.062604751001345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.36218172311782837, "epoch": 0.42, "frac_reward_zero_std": 0.0, "grad_norm": 0.8917838335037231, "kl": 0.012122070416808128, "learning_rate": 4.591516602897263e-06, "loss": -0.0551, "num_tokens": 1174312.0, "reward": 0.17500001192092896, "reward_std": 0.5405874252319336, "rewards/reward_func/mean": 0.17500001192092896, "rewards/reward_func/std": 0.5007423162460327, "sampling/importance_sampling_ratio/max": 1.3169264793395996, "sampling/importance_sampling_ratio/mean": 0.9016479849815369, "sampling/importance_sampling_ratio/min": 0.5373066663742065, "sampling/sampling_logp_difference/max": 0.4783933162689209, "sampling/sampling_logp_difference/mean": 0.021929645910859108, "step": 210, "step_time": 43.65166889700049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 51.375, "completions/mean_terminated_length": 51.375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.34002870321273804, "epoch": 0.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.7652380466461182, "kl": 0.0239486712962389, "learning_rate": 4.587070132573178e-06, "loss": 0.2205, "num_tokens": 1180592.0, "reward": 0.19624999165534973, "reward_std": 0.5029024481773376, "rewards/reward_func/mean": 0.19624999165534973, "rewards/reward_func/std": 0.46650490164756775, "sampling/importance_sampling_ratio/max": 1.5237053632736206, "sampling/importance_sampling_ratio/mean": 0.8609182238578796, "sampling/importance_sampling_ratio/min": 0.4023110866546631, "sampling/sampling_logp_difference/max": 0.5878002643585205, "sampling/sampling_logp_difference/mean": 0.0249373409897089, "step": 211, "step_time": 39.88526947100763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.34226059913635254, "epoch": 0.424, "frac_reward_zero_std": 0.0, "grad_norm": 1.0378315448760986, "kl": 0.01290669571608305, "learning_rate": 4.582601769870988e-06, "loss": 0.3469, "num_tokens": 1185853.0, "reward": 0.1850000023841858, "reward_std": 0.5251417756080627, "rewards/reward_func/mean": 0.1850000023841858, "rewards/reward_func/std": 0.4867970943450928, "sampling/importance_sampling_ratio/max": 2.886341094970703, "sampling/importance_sampling_ratio/mean": 1.2305059432983398, "sampling/importance_sampling_ratio/min": 0.39663147926330566, "sampling/sampling_logp_difference/max": 0.7663023471832275, "sampling/sampling_logp_difference/mean": 0.022634129971265793, "step": 212, "step_time": 34.38220458901196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3151271939277649, "epoch": 0.426, "frac_reward_zero_std": 0.0, "grad_norm": 0.8756702542304993, "kl": 0.03317102789878845, "learning_rate": 4.578111561661702e-06, "loss": 0.167, "num_tokens": 1191085.0, "reward": 0.19499999284744263, "reward_std": 0.5074241161346436, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.4701063930988312, "sampling/importance_sampling_ratio/max": 1.702741026878357, "sampling/importance_sampling_ratio/mean": 0.949587881565094, "sampling/importance_sampling_ratio/min": 0.2907824218273163, "sampling/sampling_logp_difference/max": 0.7317852973937988, "sampling/sampling_logp_difference/mean": 0.023712046444416046, "step": 213, "step_time": 32.26193818300089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.4092119336128235, "epoch": 0.428, "frac_reward_zero_std": 0.0, "grad_norm": 1.1255816221237183, "kl": 0.019677717238664627, "learning_rate": 4.57359955504548e-06, "loss": 0.1728, "num_tokens": 1196632.0, "reward": 0.20499999821186066, "reward_std": 0.526760458946228, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.4881451725959778, "sampling/importance_sampling_ratio/max": 2.468197822570801, "sampling/importance_sampling_ratio/mean": 1.0661287307739258, "sampling/importance_sampling_ratio/min": 0.45330435037612915, "sampling/sampling_logp_difference/max": 0.42670774459838867, "sampling/sampling_logp_difference/mean": 0.029793616384267807, "step": 214, "step_time": 35.49122104200069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.37527209520339966, "epoch": 0.43, "frac_reward_zero_std": 0.0, "grad_norm": 0.897345781326294, "kl": 0.02400030940771103, "learning_rate": 4.569065797351135e-06, "loss": 0.2748, "num_tokens": 1202705.0, "reward": 0.2224999964237213, "reward_std": 0.5121018886566162, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.47415342926979065, "sampling/importance_sampling_ratio/max": 1.8278297185897827, "sampling/importance_sampling_ratio/mean": 1.0692496299743652, "sampling/importance_sampling_ratio/min": 0.609551191329956, "sampling/sampling_logp_difference/max": 0.5747532844543457, "sampling/sampling_logp_difference/mean": 0.027012761682271957, "step": 215, "step_time": 37.0583976469934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.32061967253685, "epoch": 0.432, "frac_reward_zero_std": 0.0, "grad_norm": 1.4469271898269653, "kl": 0.014168168418109417, "learning_rate": 4.564510336135642e-06, "loss": 0.0253, "num_tokens": 1208393.0, "reward": 0.3487499952316284, "reward_std": 0.5584208965301514, "rewards/reward_func/mean": 0.3487499952316284, "rewards/reward_func/std": 0.539958655834198, "sampling/importance_sampling_ratio/max": 1.755660891532898, "sampling/importance_sampling_ratio/mean": 0.914620041847229, "sampling/importance_sampling_ratio/min": 0.2794102728366852, "sampling/sampling_logp_difference/max": 0.5640288591384888, "sampling/sampling_logp_difference/mean": 0.026515036821365356, "step": 216, "step_time": 30.74552646500524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 45.125, "completions/mean_terminated_length": 45.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3240736722946167, "epoch": 0.434, "frac_reward_zero_std": 0.0, "grad_norm": 0.9326397776603699, "kl": 0.03814445063471794, "learning_rate": 4.559933219183631e-06, "loss": 0.2147, "num_tokens": 1213972.0, "reward": 0.21375000476837158, "reward_std": 0.5260501503944397, "rewards/reward_func/mean": 0.21375000476837158, "rewards/reward_func/std": 0.4870299696922302, "sampling/importance_sampling_ratio/max": 2.4522364139556885, "sampling/importance_sampling_ratio/mean": 0.9913116693496704, "sampling/importance_sampling_ratio/min": 0.1499728411436081, "sampling/sampling_logp_difference/max": 1.3662075996398926, "sampling/sampling_logp_difference/mean": 0.03047903999686241, "step": 217, "step_time": 30.938302820009994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3097589612007141, "epoch": 0.436, "frac_reward_zero_std": 0.0, "grad_norm": 1.0612341165542603, "kl": 0.014435656368732452, "learning_rate": 4.555334494506895e-06, "loss": 0.0446, "num_tokens": 1219147.0, "reward": 0.33249998092651367, "reward_std": 0.5609448552131653, "rewards/reward_func/mean": 0.33249998092651367, "rewards/reward_func/std": 0.5397817492485046, "sampling/importance_sampling_ratio/max": 1.438905119895935, "sampling/importance_sampling_ratio/mean": 0.8124980926513672, "sampling/importance_sampling_ratio/min": 0.436778724193573, "sampling/sampling_logp_difference/max": 0.38486456871032715, "sampling/sampling_logp_difference/mean": 0.024791575968265533, "step": 218, "step_time": 33.168488173992955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3692587614059448, "epoch": 0.438, "frac_reward_zero_std": 0.0, "grad_norm": 1.0102483034133911, "kl": 0.06603902578353882, "learning_rate": 4.550714210343879e-06, "loss": -0.1641, "num_tokens": 1224789.0, "reward": 0.17499999701976776, "reward_std": 0.32527798414230347, "rewards/reward_func/mean": 0.17499999701976776, "rewards/reward_func/std": 0.47343727946281433, "sampling/importance_sampling_ratio/max": 1.1977847814559937, "sampling/importance_sampling_ratio/mean": 0.8777039051055908, "sampling/importance_sampling_ratio/min": 0.13160440325737, "sampling/sampling_logp_difference/max": 1.4867031574249268, "sampling/sampling_logp_difference/mean": 0.02460072562098503, "step": 219, "step_time": 42.85501969201141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3599325716495514, "epoch": 0.44, "frac_reward_zero_std": 0.0, "grad_norm": 1.6400805711746216, "kl": 0.010766595602035522, "learning_rate": 4.546072415159179e-06, "loss": -0.3985, "num_tokens": 1230387.0, "reward": 0.6000000238418579, "reward_std": 0.5655125379562378, "rewards/reward_func/mean": 0.6000000238418579, "rewards/reward_func/std": 0.543901264667511, "sampling/importance_sampling_ratio/max": 2.3945584297180176, "sampling/importance_sampling_ratio/mean": 1.2363660335540771, "sampling/importance_sampling_ratio/min": 0.5109630227088928, "sampling/sampling_logp_difference/max": 0.43884068727493286, "sampling/sampling_logp_difference/mean": 0.027709752321243286, "step": 220, "step_time": 31.669988991998252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.37169408798217773, "epoch": 0.442, "frac_reward_zero_std": 0.0, "grad_norm": 1.099573016166687, "kl": 0.021356448531150818, "learning_rate": 4.541409157643027e-06, "loss": 0.2744, "num_tokens": 1235517.0, "reward": 0.07499999552965164, "reward_std": 0.2952970862388611, "rewards/reward_func/mean": 0.07499999552965164, "rewards/reward_func/std": 0.37305688858032227, "sampling/importance_sampling_ratio/max": 1.954587459564209, "sampling/importance_sampling_ratio/mean": 0.9145734906196594, "sampling/importance_sampling_ratio/min": 0.13262362778186798, "sampling/sampling_logp_difference/max": 0.8386803865432739, "sampling/sampling_logp_difference/mean": 0.031054425984621048, "step": 221, "step_time": 38.17915139699471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.38324177265167236, "epoch": 0.444, "frac_reward_zero_std": 0.0, "grad_norm": 1.4401034116744995, "kl": 0.012282771989703178, "learning_rate": 4.5367244867107905e-06, "loss": -0.0521, "num_tokens": 1240753.0, "reward": 0.19875000417232513, "reward_std": 0.5245675444602966, "rewards/reward_func/mean": 0.19875000417232513, "rewards/reward_func/std": 0.48572295904159546, "sampling/importance_sampling_ratio/max": 1.123025894165039, "sampling/importance_sampling_ratio/mean": 0.8605712652206421, "sampling/importance_sampling_ratio/min": 0.6056551933288574, "sampling/sampling_logp_difference/max": 0.31289446353912354, "sampling/sampling_logp_difference/mean": 0.02160695195198059, "step": 222, "step_time": 34.58787501300685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.38685908913612366, "epoch": 0.446, "frac_reward_zero_std": 0.0, "grad_norm": 1.5898383855819702, "kl": 0.008172457106411457, "learning_rate": 4.53201845150245e-06, "loss": -0.1864, "num_tokens": 1246752.0, "reward": 0.06750000268220901, "reward_std": 0.3026241660118103, "rewards/reward_func/mean": 0.06750000268220901, "rewards/reward_func/std": 0.37594643235206604, "sampling/importance_sampling_ratio/max": 1.4219330549240112, "sampling/importance_sampling_ratio/mean": 0.9449641704559326, "sampling/importance_sampling_ratio/min": 0.49323397874832153, "sampling/sampling_logp_difference/max": 0.4340834617614746, "sampling/sampling_logp_difference/mean": 0.02197321318089962, "step": 223, "step_time": 43.65093676799734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 55.875, "completions/mean_terminated_length": 55.875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3796464502811432, "epoch": 0.448, "frac_reward_zero_std": 0.0, "grad_norm": 1.050976276397705, "kl": 0.013385515660047531, "learning_rate": 4.527291101382088e-06, "loss": -0.0046, "num_tokens": 1252131.0, "reward": 0.32124999165534973, "reward_std": 0.5853705406188965, "rewards/reward_func/mean": 0.32124999165534973, "rewards/reward_func/std": 0.5598070621490479, "sampling/importance_sampling_ratio/max": 1.8649810552597046, "sampling/importance_sampling_ratio/mean": 0.9387291669845581, "sampling/importance_sampling_ratio/min": 0.3916337490081787, "sampling/sampling_logp_difference/max": 0.418468713760376, "sampling/sampling_logp_difference/mean": 0.02320447750389576, "step": 224, "step_time": 25.26183516200399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.32041484117507935, "epoch": 0.45, "frac_reward_zero_std": 0.0, "grad_norm": 1.1802082061767578, "kl": 0.01214616559445858, "learning_rate": 4.522542485937369e-06, "loss": -0.0569, "num_tokens": 1257710.0, "reward": 0.29124999046325684, "reward_std": 0.5801128149032593, "rewards/reward_func/mean": 0.29124999046325684, "rewards/reward_func/std": 0.5516064167022705, "sampling/importance_sampling_ratio/max": 1.574925422668457, "sampling/importance_sampling_ratio/mean": 1.1270374059677124, "sampling/importance_sampling_ratio/min": 0.7811345458030701, "sampling/sampling_logp_difference/max": 0.3047366142272949, "sampling/sampling_logp_difference/mean": 0.018625199794769287, "step": 225, "step_time": 34.37411819701083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3748812973499298, "epoch": 0.452, "frac_reward_zero_std": 0.0, "grad_norm": 1.2252881526947021, "kl": 0.05437376722693443, "learning_rate": 4.517772654979024e-06, "loss": 0.0628, "num_tokens": 1263716.0, "reward": 0.0925000011920929, "reward_std": 0.2703875005245209, "rewards/reward_func/mean": 0.0925000011920929, "rewards/reward_func/std": 0.35591936111450195, "sampling/importance_sampling_ratio/max": 1.9437322616577148, "sampling/importance_sampling_ratio/mean": 0.9380783438682556, "sampling/importance_sampling_ratio/min": 0.49177488684654236, "sampling/sampling_logp_difference/max": 0.42700815200805664, "sampling/sampling_logp_difference/mean": 0.027065452188253403, "step": 226, "step_time": 42.97744282700296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.34751734137535095, "epoch": 0.454, "frac_reward_zero_std": 0.0, "grad_norm": 0.7339047193527222, "kl": 0.019938740879297256, "learning_rate": 4.512981658540321e-06, "loss": 0.0353, "num_tokens": 1269133.0, "reward": 0.45624998211860657, "reward_std": 0.5175138711929321, "rewards/reward_func/mean": 0.45624998211860657, "rewards/reward_func/std": 0.5480859875679016, "sampling/importance_sampling_ratio/max": 1.4849156141281128, "sampling/importance_sampling_ratio/mean": 0.8701044321060181, "sampling/importance_sampling_ratio/min": 0.37403663992881775, "sampling/sampling_logp_difference/max": 0.3767660856246948, "sampling/sampling_logp_difference/mean": 0.024450505152344704, "step": 227, "step_time": 36.81575430399971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.31638604402542114, "epoch": 0.456, "frac_reward_zero_std": 0.0, "grad_norm": 1.3729960918426514, "kl": 0.022523336112499237, "learning_rate": 4.508169546876547e-06, "loss": 0.1049, "num_tokens": 1274836.0, "reward": 0.08625000715255737, "reward_std": 0.27319854497909546, "rewards/reward_func/mean": 0.08625000715255737, "rewards/reward_func/std": 0.35824722051620483, "sampling/importance_sampling_ratio/max": 1.3125501871109009, "sampling/importance_sampling_ratio/mean": 0.9860361814498901, "sampling/importance_sampling_ratio/min": 0.7861380577087402, "sampling/sampling_logp_difference/max": 0.4707716703414917, "sampling/sampling_logp_difference/mean": 0.021509695798158646, "step": 228, "step_time": 37.25821232999442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.31023848056793213, "epoch": 0.458, "frac_reward_zero_std": 0.0, "grad_norm": 1.1774016618728638, "kl": 0.011443953029811382, "learning_rate": 4.503336370464476e-06, "loss": -0.1194, "num_tokens": 1280640.0, "reward": 0.08624999225139618, "reward_std": 0.28077811002731323, "rewards/reward_func/mean": 0.08624999225139618, "rewards/reward_func/std": 0.3710193634033203, "sampling/importance_sampling_ratio/max": 1.4775018692016602, "sampling/importance_sampling_ratio/mean": 1.0044816732406616, "sampling/importance_sampling_ratio/min": 0.7999988794326782, "sampling/sampling_logp_difference/max": 0.5553348064422607, "sampling/sampling_logp_difference/mean": 0.019959062337875366, "step": 229, "step_time": 38.90176290499221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.39089083671569824, "epoch": 0.46, "frac_reward_zero_std": 0.0, "grad_norm": 1.016448974609375, "kl": 0.01295425184071064, "learning_rate": 4.49848218000184e-06, "loss": -0.1282, "num_tokens": 1286908.0, "reward": 0.07874999940395355, "reward_std": 0.2610425353050232, "rewards/reward_func/mean": 0.07874999940395355, "rewards/reward_func/std": 0.3425717055797577, "sampling/importance_sampling_ratio/max": 2.0288777351379395, "sampling/importance_sampling_ratio/mean": 0.9484986066818237, "sampling/importance_sampling_ratio/min": 0.4408459961414337, "sampling/sampling_logp_difference/max": 0.4210505485534668, "sampling/sampling_logp_difference/mean": 0.02616976760327816, "step": 230, "step_time": 46.000103306010715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3809959292411804, "epoch": 0.462, "frac_reward_zero_std": 0.0, "grad_norm": 1.7100056409835815, "kl": 0.0341075174510479, "learning_rate": 4.493607026406802e-06, "loss": 0.1388, "num_tokens": 1291925.0, "reward": 0.45249998569488525, "reward_std": 0.5185065269470215, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.5677461624145508, "sampling/importance_sampling_ratio/max": 1.71293306350708, "sampling/importance_sampling_ratio/mean": 1.0076119899749756, "sampling/importance_sampling_ratio/min": 0.2770913541316986, "sampling/sampling_logp_difference/max": 0.845590353012085, "sampling/sampling_logp_difference/mean": 0.028274953365325928, "step": 231, "step_time": 31.11168534900935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3521654009819031, "epoch": 0.464, "frac_reward_zero_std": 0.0, "grad_norm": 1.1459904909133911, "kl": 0.014776136726140976, "learning_rate": 4.488710960817416e-06, "loss": 0.2149, "num_tokens": 1297601.0, "reward": 0.20250000059604645, "reward_std": 0.31824785470962524, "rewards/reward_func/mean": 0.20250000059604645, "rewards/reward_func/std": 0.4914047420024872, "sampling/importance_sampling_ratio/max": 1.8615068197250366, "sampling/importance_sampling_ratio/mean": 0.9899470806121826, "sampling/importance_sampling_ratio/min": 0.4704228937625885, "sampling/sampling_logp_difference/max": 0.34760284423828125, "sampling/sampling_logp_difference/mean": 0.02256855182349682, "step": 232, "step_time": 38.94866470299894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3941933214664459, "epoch": 0.466, "frac_reward_zero_std": 0.0, "grad_norm": 1.7119557857513428, "kl": 0.02081628516316414, "learning_rate": 4.483794034591092e-06, "loss": -0.0682, "num_tokens": 1302788.0, "reward": 0.3425000011920929, "reward_std": 0.5602338910102844, "rewards/reward_func/mean": 0.3425000011920929, "rewards/reward_func/std": 0.5391727089881897, "sampling/importance_sampling_ratio/max": 2.1718764305114746, "sampling/importance_sampling_ratio/mean": 1.3146883249282837, "sampling/importance_sampling_ratio/min": 0.7097762823104858, "sampling/sampling_logp_difference/max": 1.0572991371154785, "sampling/sampling_logp_difference/mean": 0.030411353334784508, "step": 233, "step_time": 28.17538536399661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.3587740659713745, "epoch": 0.468, "frac_reward_zero_std": 0.0, "grad_norm": 1.6080266237258911, "kl": 0.015071025118231773, "learning_rate": 4.4788562993040615e-06, "loss": -0.037, "num_tokens": 1308217.0, "reward": 0.21000000834465027, "reward_std": 0.5147542953491211, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.47683480381965637, "sampling/importance_sampling_ratio/max": 2.2702443599700928, "sampling/importance_sampling_ratio/mean": 1.0888086557388306, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2313389778137207, "sampling/sampling_logp_difference/mean": 0.027467992156744003, "step": 234, "step_time": 33.06014819799748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.388597309589386, "epoch": 0.47, "frac_reward_zero_std": 0.0, "grad_norm": 1.2065192461013794, "kl": 0.012199870310723782, "learning_rate": 4.473897806750829e-06, "loss": 0.0499, "num_tokens": 1313978.0, "reward": 0.32625001668930054, "reward_std": 0.5687720775604248, "rewards/reward_func/mean": 0.32625001668930054, "rewards/reward_func/std": 0.5401041507720947, "sampling/importance_sampling_ratio/max": 1.6197782754898071, "sampling/importance_sampling_ratio/mean": 1.1549599170684814, "sampling/importance_sampling_ratio/min": 0.8198900818824768, "sampling/sampling_logp_difference/max": 0.28801941871643066, "sampling/sampling_logp_difference/mean": 0.01991274021565914, "step": 235, "step_time": 33.60801069700392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.30618584156036377, "epoch": 0.472, "frac_reward_zero_std": 0.0, "grad_norm": 1.3444198369979858, "kl": 0.01667897030711174, "learning_rate": 4.4689186089436365e-06, "loss": 0.268, "num_tokens": 1319260.0, "reward": 0.08624999970197678, "reward_std": 0.2683984041213989, "rewards/reward_func/mean": 0.08624999970197678, "rewards/reward_func/std": 0.35900411009788513, "sampling/importance_sampling_ratio/max": 1.9262661933898926, "sampling/importance_sampling_ratio/mean": 1.1909027099609375, "sampling/importance_sampling_ratio/min": 0.7153211832046509, "sampling/sampling_logp_difference/max": 0.4925405979156494, "sampling/sampling_logp_difference/mean": 0.020151756703853607, "step": 236, "step_time": 40.63417864299845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 43.625, "completions/mean_terminated_length": 43.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3836323022842407, "epoch": 0.474, "frac_reward_zero_std": 0.0, "grad_norm": 1.1380821466445923, "kl": 0.01986077055335045, "learning_rate": 4.463918758111912e-06, "loss": 0.0285, "num_tokens": 1325046.0, "reward": -0.03500000014901161, "reward_std": 0.015408330596983433, "rewards/reward_func/mean": -0.03500000014901161, "rewards/reward_func/std": 0.03545621037483215, "sampling/importance_sampling_ratio/max": 1.6017059087753296, "sampling/importance_sampling_ratio/mean": 0.8632282018661499, "sampling/importance_sampling_ratio/min": 0.3874957859516144, "sampling/sampling_logp_difference/max": 0.42335569858551025, "sampling/sampling_logp_difference/mean": 0.02702953666448593, "step": 237, "step_time": 47.89170832399395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.125, "completions/mean_terminated_length": 53.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.343445360660553, "epoch": 0.476, "frac_reward_zero_std": 0.0, "grad_norm": 1.6716041564941406, "kl": 0.022784311324357986, "learning_rate": 4.4588983067017255e-06, "loss": -0.1186, "num_tokens": 1331213.0, "reward": 0.3474999964237213, "reward_std": 0.5585798621177673, "rewards/reward_func/mean": 0.3474999964237213, "rewards/reward_func/std": 0.5411298274993896, "sampling/importance_sampling_ratio/max": 1.6867166757583618, "sampling/importance_sampling_ratio/mean": 1.0986011028289795, "sampling/importance_sampling_ratio/min": 0.353174090385437, "sampling/sampling_logp_difference/max": 0.5639722347259521, "sampling/sampling_logp_difference/mean": 0.023318957537412643, "step": 238, "step_time": 36.82882352700108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.38994842767715454, "epoch": 0.478, "frac_reward_zero_std": 0.0, "grad_norm": 1.0082672834396362, "kl": 0.010606281459331512, "learning_rate": 4.4538573073752365e-06, "loss": 0.0566, "num_tokens": 1336322.0, "reward": 0.06499999761581421, "reward_std": 0.28198638558387756, "rewards/reward_func/mean": 0.06499999761581421, "rewards/reward_func/std": 0.3786064684391022, "sampling/importance_sampling_ratio/max": 1.146636962890625, "sampling/importance_sampling_ratio/mean": 0.8590273261070251, "sampling/importance_sampling_ratio/min": 0.4441068470478058, "sampling/sampling_logp_difference/max": 0.47839975357055664, "sampling/sampling_logp_difference/mean": 0.023978423327207565, "step": 239, "step_time": 40.39131770400854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.32200464606285095, "epoch": 0.48, "frac_reward_zero_std": 0.0, "grad_norm": 0.661300778388977, "kl": 0.01102864183485508, "learning_rate": 4.448795813010142e-06, "loss": -0.0057, "num_tokens": 1341775.0, "reward": 0.3475000262260437, "reward_std": 0.5485206842422485, "rewards/reward_func/mean": 0.3475000262260437, "rewards/reward_func/std": 0.5300067663192749, "sampling/importance_sampling_ratio/max": 1.252677321434021, "sampling/importance_sampling_ratio/mean": 0.7707798480987549, "sampling/importance_sampling_ratio/min": 0.4847687780857086, "sampling/sampling_logp_difference/max": 0.5437004566192627, "sampling/sampling_logp_difference/mean": 0.02054433897137642, "step": 240, "step_time": 40.95854337599303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3526754379272461, "epoch": 0.482, "frac_reward_zero_std": 0.0, "grad_norm": 0.9396883249282837, "kl": 0.01238167379051447, "learning_rate": 4.443713876699124e-06, "loss": 0.2082, "num_tokens": 1347458.0, "reward": 0.3349999785423279, "reward_std": 0.5716516971588135, "rewards/reward_func/mean": 0.3349999785423279, "rewards/reward_func/std": 0.5458676218986511, "sampling/importance_sampling_ratio/max": 1.5820939540863037, "sampling/importance_sampling_ratio/mean": 0.9924447536468506, "sampling/importance_sampling_ratio/min": 0.37981370091438293, "sampling/sampling_logp_difference/max": 0.507704496383667, "sampling/sampling_logp_difference/mean": 0.02272692322731018, "step": 241, "step_time": 35.560629765997874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3843768239021301, "epoch": 0.484, "frac_reward_zero_std": 0.0, "grad_norm": 0.9482317566871643, "kl": 0.016451558098196983, "learning_rate": 4.438611551749288e-06, "loss": 0.0273, "num_tokens": 1353650.0, "reward": 0.5887500047683716, "reward_std": 0.2841942012310028, "rewards/reward_func/mean": 0.5887500047683716, "rewards/reward_func/std": 0.5169259905815125, "sampling/importance_sampling_ratio/max": 1.5082266330718994, "sampling/importance_sampling_ratio/mean": 0.8422431945800781, "sampling/importance_sampling_ratio/min": 0.381510853767395, "sampling/sampling_logp_difference/max": 0.3755103349685669, "sampling/sampling_logp_difference/mean": 0.025023218244314194, "step": 242, "step_time": 37.83947860999615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.356048047542572, "epoch": 0.486, "frac_reward_zero_std": 0.0, "grad_norm": 0.7156015634536743, "kl": 0.032819997519254684, "learning_rate": 4.4334888916816096e-06, "loss": 0.1398, "num_tokens": 1359023.0, "reward": 0.05500000715255737, "reward_std": 0.30337631702423096, "rewards/reward_func/mean": 0.05500000715255737, "rewards/reward_func/std": 0.38619017601013184, "sampling/importance_sampling_ratio/max": 1.3351407051086426, "sampling/importance_sampling_ratio/mean": 0.8373122811317444, "sampling/importance_sampling_ratio/min": 0.15892520546913147, "sampling/sampling_logp_difference/max": 0.9354909658432007, "sampling/sampling_logp_difference/mean": 0.025785677134990692, "step": 243, "step_time": 40.353877371002454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.3383341431617737, "epoch": 0.488, "frac_reward_zero_std": 0.0, "grad_norm": 1.6086397171020508, "kl": 0.009784823283553123, "learning_rate": 4.42834595023037e-06, "loss": -0.1637, "num_tokens": 1363662.0, "reward": 0.46000000834465027, "reward_std": 0.5897172689437866, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.5466260313987732, "sampling/importance_sampling_ratio/max": 1.7768175601959229, "sampling/importance_sampling_ratio/mean": 1.0315792560577393, "sampling/importance_sampling_ratio/min": 0.5198686718940735, "sampling/sampling_logp_difference/max": 0.41960763931274414, "sampling/sampling_logp_difference/mean": 0.022437244653701782, "step": 244, "step_time": 31.396963074992527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.2941707968711853, "epoch": 0.49, "frac_reward_zero_std": 0.0, "grad_norm": 1.212313175201416, "kl": 0.018501581624150276, "learning_rate": 4.423182781342589e-06, "loss": 0.0024, "num_tokens": 1368921.0, "reward": 0.1850000023841858, "reward_std": 0.34044092893600464, "rewards/reward_func/mean": 0.1850000023841858, "rewards/reward_func/std": 0.5024511218070984, "sampling/importance_sampling_ratio/max": 1.3489513397216797, "sampling/importance_sampling_ratio/mean": 0.8641585111618042, "sampling/importance_sampling_ratio/min": 0.25969162583351135, "sampling/sampling_logp_difference/max": 0.4768340587615967, "sampling/sampling_logp_difference/mean": 0.022857919335365295, "step": 245, "step_time": 40.36035842899582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 48.125, "completions/mean_terminated_length": 48.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.33388686180114746, "epoch": 0.492, "frac_reward_zero_std": 0.0, "grad_norm": 0.8927440047264099, "kl": 0.021221470087766647, "learning_rate": 4.417999439177465e-06, "loss": 0.0843, "num_tokens": 1374815.0, "reward": 0.3537500202655792, "reward_std": 0.5465495586395264, "rewards/reward_func/mean": 0.3537500202655792, "rewards/reward_func/std": 0.5216988325119019, "sampling/importance_sampling_ratio/max": 1.4427800178527832, "sampling/importance_sampling_ratio/mean": 0.8960850238800049, "sampling/importance_sampling_ratio/min": 0.3982952833175659, "sampling/sampling_logp_difference/max": 0.543989896774292, "sampling/sampling_logp_difference/mean": 0.022352319210767746, "step": 246, "step_time": 37.63646215200424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.30894017219543457, "epoch": 0.494, "frac_reward_zero_std": 0.0, "grad_norm": 0.6700345277786255, "kl": 0.016512254253029823, "learning_rate": 4.412795978105807e-06, "loss": -0.1905, "num_tokens": 1380366.0, "reward": 0.19624999165534973, "reward_std": 0.5302596092224121, "rewards/reward_func/mean": 0.19624999165534973, "rewards/reward_func/std": 0.4911484718322754, "sampling/importance_sampling_ratio/max": 1.5147849321365356, "sampling/importance_sampling_ratio/mean": 0.7344578504562378, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0827524662017822, "sampling/sampling_logp_difference/mean": 0.02441200613975525, "step": 247, "step_time": 39.67168925999431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.36346104741096497, "epoch": 0.496, "frac_reward_zero_std": 0.0, "grad_norm": 0.8582754135131836, "kl": 0.013538405299186707, "learning_rate": 4.407572452709459e-06, "loss": -0.1299, "num_tokens": 1385762.0, "reward": 0.45249998569488525, "reward_std": 0.6115769147872925, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.566385805606842, "sampling/importance_sampling_ratio/max": 1.3523950576782227, "sampling/importance_sampling_ratio/mean": 0.7945364117622375, "sampling/importance_sampling_ratio/min": 0.34108811616897583, "sampling/sampling_logp_difference/max": 0.6163909435272217, "sampling/sampling_logp_difference/mean": 0.026093991473317146, "step": 248, "step_time": 34.434571129997494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.38116583228111267, "epoch": 0.498, "frac_reward_zero_std": 0.0, "grad_norm": 1.1535700559616089, "kl": 0.011749478057026863, "learning_rate": 4.402328917780728e-06, "loss": -0.1905, "num_tokens": 1391256.0, "reward": 0.3187500238418579, "reward_std": 0.5815805792808533, "rewards/reward_func/mean": 0.3187500238418579, "rewards/reward_func/std": 0.5606231093406677, "sampling/importance_sampling_ratio/max": 1.868465542793274, "sampling/importance_sampling_ratio/mean": 0.9490823745727539, "sampling/importance_sampling_ratio/min": 0.3271576762199402, "sampling/sampling_logp_difference/max": 0.7011950016021729, "sampling/sampling_logp_difference/mean": 0.028965875506401062, "step": 249, "step_time": 35.004241451999405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.34539151191711426, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 1.1324069499969482, "kl": 0.01937723159790039, "learning_rate": 4.397065428321818e-06, "loss": -0.0797, "num_tokens": 1396691.0, "reward": 0.5799999833106995, "reward_std": 0.559051513671875, "rewards/reward_func/mean": 0.5799999833106995, "rewards/reward_func/std": 0.536443293094635, "sampling/importance_sampling_ratio/max": 1.660628080368042, "sampling/importance_sampling_ratio/mean": 0.9389946460723877, "sampling/importance_sampling_ratio/min": 0.34798264503479004, "sampling/sampling_logp_difference/max": 0.4318201541900635, "sampling/sampling_logp_difference/mean": 0.026005076244473457, "step": 250, "step_time": 39.65480143901368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.2987058758735657, "epoch": 0.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.6359943747520447, "kl": 0.0072782086208462715, "learning_rate": 4.391782039544239e-06, "loss": 0.0501, "num_tokens": 1402379.0, "reward": 0.34375, "reward_std": 0.5505853891372681, "rewards/reward_func/mean": 0.34375, "rewards/reward_func/std": 0.5326735377311707, "sampling/importance_sampling_ratio/max": 0.9640963673591614, "sampling/importance_sampling_ratio/mean": 0.7049010396003723, "sampling/importance_sampling_ratio/min": 0.3560267686843872, "sampling/sampling_logp_difference/max": 0.34845149517059326, "sampling/sampling_logp_difference/mean": 0.02234082669019699, "step": 251, "step_time": 40.00721578199591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3475812077522278, "epoch": 0.504, "frac_reward_zero_std": 0.0, "grad_norm": 1.947122573852539, "kl": 0.012996821664273739, "learning_rate": 4.386478806868242e-06, "loss": -0.3883, "num_tokens": 1407741.0, "reward": 0.20624999701976776, "reward_std": 0.5270916223526001, "rewards/reward_func/mean": 0.20624999701976776, "rewards/reward_func/std": 0.4883481562137604, "sampling/importance_sampling_ratio/max": 2.576594352722168, "sampling/importance_sampling_ratio/mean": 1.3187074661254883, "sampling/importance_sampling_ratio/min": 0.5899462699890137, "sampling/sampling_logp_difference/max": 0.4354560375213623, "sampling/sampling_logp_difference/mean": 0.02314494550228119, "step": 252, "step_time": 36.81614575999265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.2992243468761444, "epoch": 0.506, "frac_reward_zero_std": 0.0, "grad_norm": 1.242324709892273, "kl": 0.016226768493652344, "learning_rate": 4.381155785922226e-06, "loss": 0.0511, "num_tokens": 1413174.0, "reward": 0.32750001549720764, "reward_std": 0.5569428205490112, "rewards/reward_func/mean": 0.32750001549720764, "rewards/reward_func/std": 0.5400992035865784, "sampling/importance_sampling_ratio/max": 1.7072101831436157, "sampling/importance_sampling_ratio/mean": 1.1310184001922607, "sampling/importance_sampling_ratio/min": 0.5692970752716064, "sampling/sampling_logp_difference/max": 0.4309711456298828, "sampling/sampling_logp_difference/mean": 0.019555598497390747, "step": 253, "step_time": 36.43075303900696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.33849331736564636, "epoch": 0.508, "frac_reward_zero_std": 0.0, "grad_norm": 1.3686050176620483, "kl": 0.01706480048596859, "learning_rate": 4.375813032542164e-06, "loss": -0.0148, "num_tokens": 1418790.0, "reward": 0.2212499976158142, "reward_std": 0.5178108215332031, "rewards/reward_func/mean": 0.2212499976158142, "rewards/reward_func/std": 0.47944724559783936, "sampling/importance_sampling_ratio/max": 1.8649122714996338, "sampling/importance_sampling_ratio/mean": 1.1233329772949219, "sampling/importance_sampling_ratio/min": 0.349303662776947, "sampling/sampling_logp_difference/max": 0.4339733123779297, "sampling/sampling_logp_difference/mean": 0.023459486663341522, "step": 254, "step_time": 37.26218052499462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.4087204933166504, "epoch": 0.51, "frac_reward_zero_std": 0.0, "grad_norm": 1.3738113641738892, "kl": 0.03434944152832031, "learning_rate": 4.37045060277101e-06, "loss": -0.2582, "num_tokens": 1424255.0, "reward": 0.08124999701976776, "reward_std": 0.284479558467865, "rewards/reward_func/mean": 0.08124999701976776, "rewards/reward_func/std": 0.3745640218257904, "sampling/importance_sampling_ratio/max": 1.7234750986099243, "sampling/importance_sampling_ratio/mean": 0.9960125684738159, "sampling/importance_sampling_ratio/min": 0.38904088735580444, "sampling/sampling_logp_difference/max": 0.6690880060195923, "sampling/sampling_logp_difference/mean": 0.029072267934679985, "step": 255, "step_time": 41.67740165100258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 45.375, "completions/mean_terminated_length": 45.375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.33174747228622437, "epoch": 0.512, "frac_reward_zero_std": 0.0, "grad_norm": 1.5522558689117432, "kl": 0.024778520688414574, "learning_rate": 4.365068552858116e-06, "loss": -0.0654, "num_tokens": 1430284.0, "reward": 0.3137499988079071, "reward_std": 0.5697016716003418, "rewards/reward_func/mean": 0.3137499988079071, "rewards/reward_func/std": 0.5417943000793457, "sampling/importance_sampling_ratio/max": 1.6336612701416016, "sampling/importance_sampling_ratio/mean": 0.8506813049316406, "sampling/importance_sampling_ratio/min": 0.30304935574531555, "sampling/sampling_logp_difference/max": 0.8601186275482178, "sampling/sampling_logp_difference/mean": 0.03514246642589569, "step": 256, "step_time": 40.16726562000986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3332866132259369, "epoch": 0.514, "frac_reward_zero_std": 0.0, "grad_norm": 1.05963933467865, "kl": 0.05120334029197693, "learning_rate": 4.359666939258637e-06, "loss": -0.143, "num_tokens": 1435988.0, "reward": 0.08000000566244125, "reward_std": 0.2796610891819, "rewards/reward_func/mean": 0.08000000566244125, "rewards/reward_func/std": 0.3729420006275177, "sampling/importance_sampling_ratio/max": 1.3755680322647095, "sampling/importance_sampling_ratio/mean": 0.8436367511749268, "sampling/importance_sampling_ratio/min": 0.3388536870479584, "sampling/sampling_logp_difference/max": 1.2603816986083984, "sampling/sampling_logp_difference/mean": 0.028931111097335815, "step": 257, "step_time": 41.19875999999931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.35343602299690247, "epoch": 0.516, "frac_reward_zero_std": 0.0, "grad_norm": 0.8309938907623291, "kl": 0.031175322830677032, "learning_rate": 4.354245818632944e-06, "loss": -0.0113, "num_tokens": 1441589.0, "reward": 0.32249999046325684, "reward_std": 0.5780496597290039, "rewards/reward_func/mean": 0.32249999046325684, "rewards/reward_func/std": 0.5562823414802551, "sampling/importance_sampling_ratio/max": 1.013479232788086, "sampling/importance_sampling_ratio/mean": 0.7156726717948914, "sampling/importance_sampling_ratio/min": 0.31125888228416443, "sampling/sampling_logp_difference/max": 0.8599154949188232, "sampling/sampling_logp_difference/mean": 0.029068203642964363, "step": 258, "step_time": 32.516722486005165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.875, "completions/mean_terminated_length": 49.875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.3191331624984741, "epoch": 0.518, "frac_reward_zero_std": 0.0, "grad_norm": 1.0082281827926636, "kl": 0.0865778848528862, "learning_rate": 4.348805247846027e-06, "loss": 0.004, "num_tokens": 1447611.0, "reward": 0.35625001788139343, "reward_std": 0.2691054940223694, "rewards/reward_func/mean": 0.35625001788139343, "rewards/reward_func/std": 0.5278781652450562, "sampling/importance_sampling_ratio/max": 1.7518271207809448, "sampling/importance_sampling_ratio/mean": 0.9650399684906006, "sampling/importance_sampling_ratio/min": 0.23583200573921204, "sampling/sampling_logp_difference/max": 0.9998927116394043, "sampling/sampling_logp_difference/mean": 0.026092462241649628, "step": 259, "step_time": 39.387949302996276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3031730651855469, "epoch": 0.52, "frac_reward_zero_std": 0.0, "grad_norm": 0.7463305592536926, "kl": 0.01488940604031086, "learning_rate": 4.343345283966901e-06, "loss": -0.1232, "num_tokens": 1452554.0, "reward": 0.32375001907348633, "reward_std": 0.5791251063346863, "rewards/reward_func/mean": 0.32375001907348633, "rewards/reward_func/std": 0.5504786968231201, "sampling/importance_sampling_ratio/max": 1.5403791666030884, "sampling/importance_sampling_ratio/mean": 1.0024334192276, "sampling/importance_sampling_ratio/min": 0.6112100481987, "sampling/sampling_logp_difference/max": 0.3157918453216553, "sampling/sampling_logp_difference/mean": 0.01894155889749527, "step": 260, "step_time": 36.034989626001334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.35179832577705383, "epoch": 0.522, "frac_reward_zero_std": 0.0, "grad_norm": 1.0443854331970215, "kl": 0.0215410478413105, "learning_rate": 4.337865984268002e-06, "loss": -0.4475, "num_tokens": 1458072.0, "reward": 0.4775000214576721, "reward_std": 0.5977118015289307, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5537083148956299, "sampling/importance_sampling_ratio/max": 1.6334444284439087, "sampling/importance_sampling_ratio/mean": 1.0017789602279663, "sampling/importance_sampling_ratio/min": 0.33515238761901855, "sampling/sampling_logp_difference/max": 0.43834519386291504, "sampling/sampling_logp_difference/mean": 0.02636917680501938, "step": 261, "step_time": 29.26977622200502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 52.375, "completions/mean_terminated_length": 52.375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3282299041748047, "epoch": 0.524, "frac_reward_zero_std": 0.0, "grad_norm": 1.814078450202942, "kl": 0.04120957478880882, "learning_rate": 4.33236740622459e-06, "loss": -0.017, "num_tokens": 1464432.0, "reward": 0.19875000417232513, "reward_std": 0.47839945554733276, "rewards/reward_func/mean": 0.19875000417232513, "rewards/reward_func/std": 0.444311261177063, "sampling/importance_sampling_ratio/max": 1.5190731287002563, "sampling/importance_sampling_ratio/mean": 1.122732162475586, "sampling/importance_sampling_ratio/min": 0.47047433257102966, "sampling/sampling_logp_difference/max": 0.6926932334899902, "sampling/sampling_logp_difference/mean": 0.022747617214918137, "step": 262, "step_time": 44.020091628990485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.357543408870697, "epoch": 0.526, "frac_reward_zero_std": 0.0, "grad_norm": 0.761099636554718, "kl": 0.055734507739543915, "learning_rate": 4.326849607514149e-06, "loss": 0.2722, "num_tokens": 1469989.0, "reward": 0.33500000834465027, "reward_std": 0.546402633190155, "rewards/reward_func/mean": 0.33500000834465027, "rewards/reward_func/std": 0.5292582511901855, "sampling/importance_sampling_ratio/max": 1.8936946392059326, "sampling/importance_sampling_ratio/mean": 0.8968003988265991, "sampling/importance_sampling_ratio/min": 0.1377258002758026, "sampling/sampling_logp_difference/max": 1.1581766605377197, "sampling/sampling_logp_difference/mean": 0.030442828312516212, "step": 263, "step_time": 37.87564268500137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3408206105232239, "epoch": 0.528, "frac_reward_zero_std": 0.0, "grad_norm": 1.1230255365371704, "kl": 0.16760079562664032, "learning_rate": 4.321312646015775e-06, "loss": 0.0372, "num_tokens": 1474707.0, "reward": 0.4675000309944153, "reward_std": 0.6010903120040894, "rewards/reward_func/mean": 0.4675000309944153, "rewards/reward_func/std": 0.5568469762802124, "sampling/importance_sampling_ratio/max": 1.2672181129455566, "sampling/importance_sampling_ratio/mean": 0.7914547920227051, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.173736572265625, "sampling/sampling_logp_difference/mean": 0.028644565492868423, "step": 264, "step_time": 27.348199279993423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3111991286277771, "epoch": 0.53, "frac_reward_zero_std": 0.0, "grad_norm": 0.8316683173179626, "kl": 0.02439902350306511, "learning_rate": 4.315756579809575e-06, "loss": 0.0201, "num_tokens": 1479489.0, "reward": 0.0637499988079071, "reward_std": 0.2797112762928009, "rewards/reward_func/mean": 0.0637499988079071, "rewards/reward_func/std": 0.3812362551689148, "sampling/importance_sampling_ratio/max": 1.1946301460266113, "sampling/importance_sampling_ratio/mean": 0.888809323310852, "sampling/importance_sampling_ratio/min": 0.5050463676452637, "sampling/sampling_logp_difference/max": 0.7642940282821655, "sampling/sampling_logp_difference/mean": 0.02073574811220169, "step": 265, "step_time": 39.65966210300394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.32368624210357666, "epoch": 0.532, "frac_reward_zero_std": 0.0, "grad_norm": 1.282889485359192, "kl": 0.029365966096520424, "learning_rate": 4.3101814671760546e-06, "loss": -0.3961, "num_tokens": 1484739.0, "reward": 0.07999999821186066, "reward_std": 0.2752854824066162, "rewards/reward_func/mean": 0.07999999821186066, "rewards/reward_func/std": 0.3690141439437866, "sampling/importance_sampling_ratio/max": 2.9063940048217773, "sampling/importance_sampling_ratio/mean": 1.0859081745147705, "sampling/importance_sampling_ratio/min": 0.20924752950668335, "sampling/sampling_logp_difference/max": 0.8566403388977051, "sampling/sampling_logp_difference/mean": 0.026934346184134483, "step": 266, "step_time": 38.628261532008764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 50.875, "completions/mean_terminated_length": 50.875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.33091580867767334, "epoch": 0.534, "frac_reward_zero_std": 0.0, "grad_norm": 0.8325208425521851, "kl": 0.04467456787824631, "learning_rate": 4.304587366595506e-06, "loss": -0.1739, "num_tokens": 1490415.0, "reward": -0.04374999552965164, "reward_std": 0.03216441348195076, "rewards/reward_func/mean": -0.04374999552965164, "rewards/reward_func/std": 0.04749060049653053, "sampling/importance_sampling_ratio/max": 1.7320250272750854, "sampling/importance_sampling_ratio/mean": 0.6510964632034302, "sampling/importance_sampling_ratio/min": 0.38174349069595337, "sampling/sampling_logp_difference/max": 0.8815808296203613, "sampling/sampling_logp_difference/mean": 0.030356641858816147, "step": 267, "step_time": 39.55001159000676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3176058530807495, "epoch": 0.536, "frac_reward_zero_std": 0.0, "grad_norm": 1.0742965936660767, "kl": 0.022770049050450325, "learning_rate": 4.298974336747397e-06, "loss": -0.0013, "num_tokens": 1495288.0, "reward": 0.3375000059604645, "reward_std": 0.5672158598899841, "rewards/reward_func/mean": 0.3375000059604645, "rewards/reward_func/std": 0.5421057343482971, "sampling/importance_sampling_ratio/max": 1.8634246587753296, "sampling/importance_sampling_ratio/mean": 1.0370787382125854, "sampling/importance_sampling_ratio/min": 0.7139883637428284, "sampling/sampling_logp_difference/max": 0.35963261127471924, "sampling/sampling_logp_difference/mean": 0.025577042251825333, "step": 268, "step_time": 28.42022311600158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3482901453971863, "epoch": 0.538, "frac_reward_zero_std": 0.0, "grad_norm": 2.574179172515869, "kl": 0.020543672144412994, "learning_rate": 4.2933424365097565e-06, "loss": -0.3582, "num_tokens": 1501351.0, "reward": 0.19875000417232513, "reward_std": 0.5214530229568481, "rewards/reward_func/mean": 0.19875000417232513, "rewards/reward_func/std": 0.48300954699516296, "sampling/importance_sampling_ratio/max": 2.9328081607818604, "sampling/importance_sampling_ratio/mean": 1.0341973304748535, "sampling/importance_sampling_ratio/min": 0.38898590207099915, "sampling/sampling_logp_difference/max": 0.5442459583282471, "sampling/sampling_logp_difference/mean": 0.024556942284107208, "step": 269, "step_time": 41.42594156700943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 46.125, "completions/mean_terminated_length": 46.125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.33427339792251587, "epoch": 0.54, "frac_reward_zero_std": 0.0, "grad_norm": 0.8722326159477234, "kl": 0.0236725602298975, "learning_rate": 4.287691724958551e-06, "loss": -0.0266, "num_tokens": 1506597.0, "reward": -0.05250000208616257, "reward_std": 0.053679704666137695, "rewards/reward_func/mean": -0.05250000208616257, "rewards/reward_func/std": 0.05391792953014374, "sampling/importance_sampling_ratio/max": 0.9804509878158569, "sampling/importance_sampling_ratio/mean": 0.7802181839942932, "sampling/importance_sampling_ratio/min": 0.44377338886260986, "sampling/sampling_logp_difference/max": 0.7215894460678101, "sampling/sampling_logp_difference/mean": 0.02426327019929886, "step": 270, "step_time": 43.879117930002394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3636426031589508, "epoch": 0.542, "frac_reward_zero_std": 0.0, "grad_norm": 0.8976055979728699, "kl": 0.025950593873858452, "learning_rate": 4.282022261367074e-06, "loss": -0.0976, "num_tokens": 1512282.0, "reward": 0.5975000262260437, "reward_std": 0.541487455368042, "rewards/reward_func/mean": 0.5975000262260437, "rewards/reward_func/std": 0.5228424668312073, "sampling/importance_sampling_ratio/max": 2.1136491298675537, "sampling/importance_sampling_ratio/mean": 0.9269047379493713, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7309963703155518, "sampling/sampling_logp_difference/mean": 0.02674085833132267, "step": 271, "step_time": 34.07076389199938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3050827085971832, "epoch": 0.544, "frac_reward_zero_std": 0.0, "grad_norm": 1.4390958547592163, "kl": 0.03130275756120682, "learning_rate": 4.276334105205312e-06, "loss": 0.0677, "num_tokens": 1517354.0, "reward": -0.038750000298023224, "reward_std": 0.03077373281121254, "rewards/reward_func/mean": -0.038750000298023224, "rewards/reward_func/std": 0.0425734668970108, "sampling/importance_sampling_ratio/max": 1.9479724168777466, "sampling/importance_sampling_ratio/mean": 1.1777501106262207, "sampling/importance_sampling_ratio/min": 0.8560696840286255, "sampling/sampling_logp_difference/max": 0.40088653564453125, "sampling/sampling_logp_difference/mean": 0.02281448245048523, "step": 272, "step_time": 33.767667939988314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 57.375, "completions/mean_terminated_length": 57.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.35996952652931213, "epoch": 0.546, "frac_reward_zero_std": 0.0, "grad_norm": 0.6506816148757935, "kl": 0.018656428903341293, "learning_rate": 4.270627316139333e-06, "loss": 0.0589, "num_tokens": 1523210.0, "reward": 0.1912499964237213, "reward_std": 0.5389974117279053, "rewards/reward_func/mean": 0.1912499964237213, "rewards/reward_func/std": 0.4990401864051819, "sampling/importance_sampling_ratio/max": 1.3763803243637085, "sampling/importance_sampling_ratio/mean": 0.8559229969978333, "sampling/importance_sampling_ratio/min": 0.3611724078655243, "sampling/sampling_logp_difference/max": 0.7112209796905518, "sampling/sampling_logp_difference/mean": 0.025731002911925316, "step": 273, "step_time": 43.23826266299875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.375, "completions/mean_terminated_length": 51.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3215975761413574, "epoch": 0.548, "frac_reward_zero_std": 0.0, "grad_norm": 0.8488715887069702, "kl": 0.0160782802850008, "learning_rate": 4.264901954030655e-06, "loss": 0.0883, "num_tokens": 1528646.0, "reward": 0.22625000774860382, "reward_std": 0.5068466663360596, "rewards/reward_func/mean": 0.22625000774860382, "rewards/reward_func/std": 0.46940505504608154, "sampling/importance_sampling_ratio/max": 1.3615248203277588, "sampling/importance_sampling_ratio/mean": 0.8312337398529053, "sampling/importance_sampling_ratio/min": 0.3102983236312866, "sampling/sampling_logp_difference/max": 0.6900758743286133, "sampling/sampling_logp_difference/mean": 0.023976415395736694, "step": 274, "step_time": 34.28163969999878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.31386834383010864, "epoch": 0.55, "frac_reward_zero_std": 0.0, "grad_norm": 2.1750917434692383, "kl": 0.030527833849191666, "learning_rate": 4.259158078935616e-06, "loss": -0.12, "num_tokens": 1534719.0, "reward": 0.4662500023841858, "reward_std": 0.5187262296676636, "rewards/reward_func/mean": 0.4662500023841858, "rewards/reward_func/std": 0.5655575394630432, "sampling/importance_sampling_ratio/max": 2.275832414627075, "sampling/importance_sampling_ratio/mean": 1.2086929082870483, "sampling/importance_sampling_ratio/min": 0.4686194062232971, "sampling/sampling_logp_difference/max": 0.6889495849609375, "sampling/sampling_logp_difference/mean": 0.02524813637137413, "step": 275, "step_time": 41.8534838570049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.34471046924591064, "epoch": 0.552, "frac_reward_zero_std": 0.0, "grad_norm": 0.9192721843719482, "kl": 0.02106521837413311, "learning_rate": 4.2533957511047485e-06, "loss": 0.1216, "num_tokens": 1540498.0, "reward": 0.33124998211860657, "reward_std": 0.5458966493606567, "rewards/reward_func/mean": 0.33124998211860657, "rewards/reward_func/std": 0.5239530205726624, "sampling/importance_sampling_ratio/max": 1.4334321022033691, "sampling/importance_sampling_ratio/mean": 0.9884998798370361, "sampling/importance_sampling_ratio/min": 0.6590894460678101, "sampling/sampling_logp_difference/max": 0.477333664894104, "sampling/sampling_logp_difference/mean": 0.026044394820928574, "step": 276, "step_time": 43.16134984300879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 45.375, "completions/mean_terminated_length": 45.375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.33258211612701416, "epoch": 0.554, "frac_reward_zero_std": 0.0, "grad_norm": 1.4911836385726929, "kl": 0.019767843186855316, "learning_rate": 4.247615030982144e-06, "loss": -0.3075, "num_tokens": 1546073.0, "reward": 0.21875, "reward_std": 0.5179763436317444, "rewards/reward_func/mean": 0.21875, "rewards/reward_func/std": 0.479804664850235, "sampling/importance_sampling_ratio/max": 1.665633201599121, "sampling/importance_sampling_ratio/mean": 0.8182989954948425, "sampling/importance_sampling_ratio/min": 0.2058224380016327, "sampling/sampling_logp_difference/max": 0.527796745300293, "sampling/sampling_logp_difference/mean": 0.027309097349643707, "step": 277, "step_time": 35.166972905994044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.875, "completions/mean_terminated_length": 49.875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3460892140865326, "epoch": 0.556, "frac_reward_zero_std": 0.0, "grad_norm": 0.7702014446258545, "kl": 0.013081185519695282, "learning_rate": 4.241815979204822e-06, "loss": 0.2004, "num_tokens": 1552244.0, "reward": 0.35750001668930054, "reward_std": 0.5452839136123657, "rewards/reward_func/mean": 0.35750001668930054, "rewards/reward_func/std": 0.5240433812141418, "sampling/importance_sampling_ratio/max": 1.3788411617279053, "sampling/importance_sampling_ratio/mean": 0.8693941235542297, "sampling/importance_sampling_ratio/min": 0.21280725300312042, "sampling/sampling_logp_difference/max": 0.6975330710411072, "sampling/sampling_logp_difference/mean": 0.022449234500527382, "step": 278, "step_time": 35.046001802998944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.125, "completions/mean_terminated_length": 49.125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.34101182222366333, "epoch": 0.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.937264621257782, "kl": 0.03165788948535919, "learning_rate": 4.235998656602091e-06, "loss": -0.0022, "num_tokens": 1558256.0, "reward": 0.19625000655651093, "reward_std": 0.331611692905426, "rewards/reward_func/mean": 0.19625000655651093, "rewards/reward_func/std": 0.48567885160446167, "sampling/importance_sampling_ratio/max": 1.5198726654052734, "sampling/importance_sampling_ratio/mean": 1.026062250137329, "sampling/importance_sampling_ratio/min": 0.6299855709075928, "sampling/sampling_logp_difference/max": 0.6371855735778809, "sampling/sampling_logp_difference/mean": 0.02409491315484047, "step": 279, "step_time": 43.56885852699634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.36255669593811035, "epoch": 0.56, "frac_reward_zero_std": 0.0, "grad_norm": 0.7901662588119507, "kl": 0.013471327722072601, "learning_rate": 4.230163124194913e-06, "loss": -0.1862, "num_tokens": 1564016.0, "reward": -0.04874999821186066, "reward_std": 0.06202464923262596, "rewards/reward_func/mean": -0.04874999821186066, "rewards/reward_func/std": 0.062435686588287354, "sampling/importance_sampling_ratio/max": 1.5341285467147827, "sampling/importance_sampling_ratio/mean": 0.8721048831939697, "sampling/importance_sampling_ratio/min": 0.4120750427246094, "sampling/sampling_logp_difference/max": 0.49291160702705383, "sampling/sampling_logp_difference/mean": 0.02896583452820778, "step": 280, "step_time": 38.55246208499011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 55.375, "completions/mean_terminated_length": 55.375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.33529117703437805, "epoch": 0.562, "frac_reward_zero_std": 0.0, "grad_norm": 0.5709347724914551, "kl": 0.01782386749982834, "learning_rate": 4.224309443195261e-06, "loss": -0.1666, "num_tokens": 1569853.0, "reward": 0.5987499952316284, "reward_std": 0.5424566864967346, "rewards/reward_func/mean": 0.5987499952316284, "rewards/reward_func/std": 0.5208903551101685, "sampling/importance_sampling_ratio/max": 1.183791995048523, "sampling/importance_sampling_ratio/mean": 0.8074554204940796, "sampling/importance_sampling_ratio/min": 0.19970110058784485, "sampling/sampling_logp_difference/max": 0.8792259097099304, "sampling/sampling_logp_difference/mean": 0.02342084050178528, "step": 281, "step_time": 36.61759810600779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 57.375, "completions/mean_terminated_length": 57.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3397712707519531, "epoch": 0.564, "frac_reward_zero_std": 0.0, "grad_norm": 0.9551227688789368, "kl": 0.03373869135975838, "learning_rate": 4.218437675005479e-06, "loss": -0.0608, "num_tokens": 1575868.0, "reward": 0.33500000834465027, "reward_std": 0.2718449532985687, "rewards/reward_func/mean": 0.33500000834465027, "rewards/reward_func/std": 0.540925145149231, "sampling/importance_sampling_ratio/max": 1.8021279573440552, "sampling/importance_sampling_ratio/mean": 1.119571566581726, "sampling/importance_sampling_ratio/min": 0.7593724727630615, "sampling/sampling_logp_difference/max": 0.6536552906036377, "sampling/sampling_logp_difference/mean": 0.023292632773518562, "step": 282, "step_time": 45.945890542003326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 51.625, "completions/mean_terminated_length": 51.625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.36750608682632446, "epoch": 0.566, "frac_reward_zero_std": 0.0, "grad_norm": 0.7954268455505371, "kl": 0.017600055783987045, "learning_rate": 4.212547881217637e-06, "loss": 0.0046, "num_tokens": 1582017.0, "reward": 0.48374998569488525, "reward_std": 0.5109193921089172, "rewards/reward_func/mean": 0.48374998569488525, "rewards/reward_func/std": 0.5493616461753845, "sampling/importance_sampling_ratio/max": 1.646396517753601, "sampling/importance_sampling_ratio/mean": 0.9751088619232178, "sampling/importance_sampling_ratio/min": 0.5646697282791138, "sampling/sampling_logp_difference/max": 0.4884145259857178, "sampling/sampling_logp_difference/mean": 0.024717465043067932, "step": 283, "step_time": 35.48707418401318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.2997041642665863, "epoch": 0.568, "frac_reward_zero_std": 0.0, "grad_norm": 0.9606561660766602, "kl": 0.021286074072122574, "learning_rate": 4.206640123612885e-06, "loss": 0.034, "num_tokens": 1587373.0, "reward": 0.5887500047683716, "reward_std": 0.5757082104682922, "rewards/reward_func/mean": 0.5887500047683716, "rewards/reward_func/std": 0.551217794418335, "sampling/importance_sampling_ratio/max": 1.1931737661361694, "sampling/importance_sampling_ratio/mean": 0.8760452270507812, "sampling/importance_sampling_ratio/min": 0.608650267124176, "sampling/sampling_logp_difference/max": 0.6686441898345947, "sampling/sampling_logp_difference/mean": 0.022089166566729546, "step": 284, "step_time": 30.727206259995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3434939980506897, "epoch": 0.57, "frac_reward_zero_std": 0.0, "grad_norm": 0.880317211151123, "kl": 0.013526841066777706, "learning_rate": 4.2007144641608035e-06, "loss": 0.0272, "num_tokens": 1592909.0, "reward": 0.2224999964237213, "reward_std": 0.5191160440444946, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.4809737205505371, "sampling/importance_sampling_ratio/max": 1.8023751974105835, "sampling/importance_sampling_ratio/mean": 0.8636828064918518, "sampling/importance_sampling_ratio/min": 0.35050126910209656, "sampling/sampling_logp_difference/max": 0.8070380687713623, "sampling/sampling_logp_difference/mean": 0.022738801315426826, "step": 285, "step_time": 35.6797738460009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.32169610261917114, "epoch": 0.572, "frac_reward_zero_std": 0.0, "grad_norm": 0.8008424043655396, "kl": 0.016660645604133606, "learning_rate": 4.194770965018758e-06, "loss": 0.093, "num_tokens": 1598805.0, "reward": 0.3537500202655792, "reward_std": 0.2641814053058624, "rewards/reward_func/mean": 0.3537500202655792, "rewards/reward_func/std": 0.5189808011054993, "sampling/importance_sampling_ratio/max": 1.3268340826034546, "sampling/importance_sampling_ratio/mean": 0.8526763319969177, "sampling/importance_sampling_ratio/min": 0.5635418891906738, "sampling/sampling_logp_difference/max": 0.5913662910461426, "sampling/sampling_logp_difference/mean": 0.0236099511384964, "step": 286, "step_time": 45.18931570999848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.31263816356658936, "epoch": 0.574, "frac_reward_zero_std": 0.0, "grad_norm": 1.337044358253479, "kl": 0.03852120786905289, "learning_rate": 4.188809688531241e-06, "loss": 0.18, "num_tokens": 1604416.0, "reward": 0.2225000113248825, "reward_std": 0.2900446355342865, "rewards/reward_func/mean": 0.2225000113248825, "rewards/reward_func/std": 0.46191370487213135, "sampling/importance_sampling_ratio/max": 1.3615566492080688, "sampling/importance_sampling_ratio/mean": 0.8375392556190491, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9422614574432373, "sampling/sampling_logp_difference/mean": 0.02491070330142975, "step": 287, "step_time": 41.96923489900655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.33257660269737244, "epoch": 0.576, "frac_reward_zero_std": 0.0, "grad_norm": 1.0255043506622314, "kl": 0.02244836464524269, "learning_rate": 4.182830697229223e-06, "loss": 0.1785, "num_tokens": 1610302.0, "reward": 0.34375, "reward_std": 0.5376863479614258, "rewards/reward_func/mean": 0.34375, "rewards/reward_func/std": 0.516193151473999, "sampling/importance_sampling_ratio/max": 1.905393362045288, "sampling/importance_sampling_ratio/mean": 1.0131361484527588, "sampling/importance_sampling_ratio/min": 0.235921248793602, "sampling/sampling_logp_difference/max": 0.5580523014068604, "sampling/sampling_logp_difference/mean": 0.025940991938114166, "step": 288, "step_time": 40.204126273994916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.3540540933609009, "epoch": 0.578, "frac_reward_zero_std": 0.0, "grad_norm": 0.8518127202987671, "kl": 0.01784059964120388, "learning_rate": 4.176834053829492e-06, "loss": 0.2212, "num_tokens": 1615656.0, "reward": 0.07500000298023224, "reward_std": 0.2555869221687317, "rewards/reward_func/mean": 0.07500000298023224, "rewards/reward_func/std": 0.32802441716194153, "sampling/importance_sampling_ratio/max": 1.4949760437011719, "sampling/importance_sampling_ratio/mean": 0.8921905159950256, "sampling/importance_sampling_ratio/min": 0.345406174659729, "sampling/sampling_logp_difference/max": 0.3340027332305908, "sampling/sampling_logp_difference/mean": 0.024094369262456894, "step": 289, "step_time": 42.1554738059931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 46.375, "completions/mean_terminated_length": 46.375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.33654820919036865, "epoch": 0.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.7503964900970459, "kl": 0.015095679089426994, "learning_rate": 4.170819821234001e-06, "loss": -0.026, "num_tokens": 1621328.0, "reward": 0.5987499952316284, "reward_std": 0.5505416393280029, "rewards/reward_func/mean": 0.5987499952316284, "rewards/reward_func/std": 0.5330354571342468, "sampling/importance_sampling_ratio/max": 1.6455559730529785, "sampling/importance_sampling_ratio/mean": 0.9360400438308716, "sampling/importance_sampling_ratio/min": 0.4872363209724426, "sampling/sampling_logp_difference/max": 0.3570232391357422, "sampling/sampling_logp_difference/mean": 0.020733939483761787, "step": 290, "step_time": 33.462042975996155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3192588984966278, "epoch": 0.582, "frac_reward_zero_std": 0.0, "grad_norm": 1.532139778137207, "kl": 0.03818577155470848, "learning_rate": 4.164788062529203e-06, "loss": -0.0941, "num_tokens": 1626908.0, "reward": 0.19750000536441803, "reward_std": 0.532228946685791, "rewards/reward_func/mean": 0.19750000536441803, "rewards/reward_func/std": 0.494534432888031, "sampling/importance_sampling_ratio/max": 1.8081889152526855, "sampling/importance_sampling_ratio/mean": 1.2644798755645752, "sampling/importance_sampling_ratio/min": 0.8442208170890808, "sampling/sampling_logp_difference/max": 0.6289647817611694, "sampling/sampling_logp_difference/mean": 0.02317667007446289, "step": 291, "step_time": 36.49553343700245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3839607238769531, "epoch": 0.584, "frac_reward_zero_std": 0.0, "grad_norm": 1.7589607238769531, "kl": 0.0189202930778265, "learning_rate": 4.158738840985393e-06, "loss": -0.2175, "num_tokens": 1632438.0, "reward": 0.3112500011920929, "reward_std": 0.5763314366340637, "rewards/reward_func/mean": 0.3112500011920929, "rewards/reward_func/std": 0.5487502217292786, "sampling/importance_sampling_ratio/max": 2.402517318725586, "sampling/importance_sampling_ratio/mean": 1.3916277885437012, "sampling/importance_sampling_ratio/min": 0.8243075013160706, "sampling/sampling_logp_difference/max": 0.3067154884338379, "sampling/sampling_logp_difference/mean": 0.023358281701803207, "step": 292, "step_time": 39.84433910700318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3257955014705658, "epoch": 0.586, "frac_reward_zero_std": 0.0, "grad_norm": 1.6930134296417236, "kl": 0.03379783034324646, "learning_rate": 4.1526722200560445e-06, "loss": -0.252, "num_tokens": 1637919.0, "reward": 0.3362500071525574, "reward_std": 0.5554932355880737, "rewards/reward_func/mean": 0.3362500071525574, "rewards/reward_func/std": 0.5346009731292725, "sampling/importance_sampling_ratio/max": 1.87800133228302, "sampling/importance_sampling_ratio/mean": 0.9086008071899414, "sampling/importance_sampling_ratio/min": 0.3146730959415436, "sampling/sampling_logp_difference/max": 1.1332507133483887, "sampling/sampling_logp_difference/mean": 0.030632250010967255, "step": 293, "step_time": 28.30901631899178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3299206495285034, "epoch": 0.588, "frac_reward_zero_std": 0.0, "grad_norm": 0.7860206961631775, "kl": 0.012436825782060623, "learning_rate": 4.146588263377137e-06, "loss": -0.22, "num_tokens": 1643427.0, "reward": 0.07249999791383743, "reward_std": 0.2847173810005188, "rewards/reward_func/mean": 0.07249999791383743, "rewards/reward_func/std": 0.3756803572177887, "sampling/importance_sampling_ratio/max": 1.1498537063598633, "sampling/importance_sampling_ratio/mean": 0.8128846883773804, "sampling/importance_sampling_ratio/min": 0.3219602108001709, "sampling/sampling_logp_difference/max": 0.7047772407531738, "sampling/sampling_logp_difference/mean": 0.02216794341802597, "step": 294, "step_time": 33.63188786900719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3545665740966797, "epoch": 0.59, "frac_reward_zero_std": 0.0, "grad_norm": 0.8703710436820984, "kl": 0.01798795722424984, "learning_rate": 4.140487034766499e-06, "loss": -0.0911, "num_tokens": 1649592.0, "reward": 0.08624999970197678, "reward_std": 0.2947118580341339, "rewards/reward_func/mean": 0.08624999970197678, "rewards/reward_func/std": 0.3655109107494354, "sampling/importance_sampling_ratio/max": 1.07265043258667, "sampling/importance_sampling_ratio/mean": 0.7273514866828918, "sampling/importance_sampling_ratio/min": 0.44319045543670654, "sampling/sampling_logp_difference/max": 0.669346034526825, "sampling/sampling_logp_difference/mean": 0.02738836780190468, "step": 295, "step_time": 43.65301357599674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.875, "completions/mean_terminated_length": 54.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3765920400619507, "epoch": 0.592, "frac_reward_zero_std": 0.0, "grad_norm": 1.7091373205184937, "kl": 0.022772938013076782, "learning_rate": 4.134368598223132e-06, "loss": -0.2819, "num_tokens": 1654960.0, "reward": 0.05500000715255737, "reward_std": 0.2984171509742737, "rewards/reward_func/mean": 0.05500000715255737, "rewards/reward_func/std": 0.3879249095916748, "sampling/importance_sampling_ratio/max": 2.3806519508361816, "sampling/importance_sampling_ratio/mean": 1.4367578029632568, "sampling/importance_sampling_ratio/min": 0.5037611722946167, "sampling/sampling_logp_difference/max": 0.7325538396835327, "sampling/sampling_logp_difference/mean": 0.0233943872153759, "step": 296, "step_time": 45.81175713399716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.625, "completions/mean_terminated_length": 54.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3774474859237671, "epoch": 0.594, "frac_reward_zero_std": 0.0, "grad_norm": 0.8588233590126038, "kl": 0.018994109705090523, "learning_rate": 4.128233017926538e-06, "loss": 0.1595, "num_tokens": 1660389.0, "reward": 0.20500001311302185, "reward_std": 0.512452244758606, "rewards/reward_func/mean": 0.20500001311302185, "rewards/reward_func/std": 0.47449222207069397, "sampling/importance_sampling_ratio/max": 1.57711660861969, "sampling/importance_sampling_ratio/mean": 0.9654600620269775, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6728287935256958, "sampling/sampling_logp_difference/mean": 0.022220637649297714, "step": 297, "step_time": 39.472764768011984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3656435012817383, "epoch": 0.596, "frac_reward_zero_std": 0.0, "grad_norm": 0.9628560543060303, "kl": 0.019060224294662476, "learning_rate": 4.1220803582360545e-06, "loss": 0.0337, "num_tokens": 1666080.0, "reward": 0.07625000178813934, "reward_std": 0.2737163305282593, "rewards/reward_func/mean": 0.07625000178813934, "rewards/reward_func/std": 0.35940176248550415, "sampling/importance_sampling_ratio/max": 1.6562925577163696, "sampling/importance_sampling_ratio/mean": 0.9328627586364746, "sampling/importance_sampling_ratio/min": 0.4477497935295105, "sampling/sampling_logp_difference/max": 0.44923925399780273, "sampling/sampling_logp_difference/mean": 0.028040671721100807, "step": 298, "step_time": 39.901188599003945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 60.625, "completions/mean_terminated_length": 60.625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.34412166476249695, "epoch": 0.598, "frac_reward_zero_std": 0.0, "grad_norm": 0.6345298886299133, "kl": 0.02608601748943329, "learning_rate": 4.115910683690167e-06, "loss": 0.0772, "num_tokens": 1671266.0, "reward": 0.3125, "reward_std": 0.5876922607421875, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.5627166628837585, "sampling/importance_sampling_ratio/max": 1.680901050567627, "sampling/importance_sampling_ratio/mean": 0.8350234031677246, "sampling/importance_sampling_ratio/min": 0.4430753290653229, "sampling/sampling_logp_difference/max": 0.4961535930633545, "sampling/sampling_logp_difference/mean": 0.020545832812786102, "step": 299, "step_time": 28.69361345600919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.125, "completions/mean_terminated_length": 52.125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3159938156604767, "epoch": 0.6, "frac_reward_zero_std": 0.0, "grad_norm": 1.383638620376587, "kl": 0.02010868862271309, "learning_rate": 4.109724059005844e-06, "loss": 0.0037, "num_tokens": 1676792.0, "reward": -0.0625, "reward_std": 0.036226607859134674, "rewards/reward_func/mean": -0.0625, "rewards/reward_func/std": 0.04949747398495674, "sampling/importance_sampling_ratio/max": 1.9275904893875122, "sampling/importance_sampling_ratio/mean": 1.0863583087921143, "sampling/importance_sampling_ratio/min": 0.5999199748039246, "sampling/sampling_logp_difference/max": 0.6911392211914062, "sampling/sampling_logp_difference/mean": 0.01991921104490757, "step": 300, "step_time": 38.05700801000057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.33787450194358826, "epoch": 0.602, "frac_reward_zero_std": 0.0, "grad_norm": 0.9712547063827515, "kl": 0.020664069801568985, "learning_rate": 4.1035205490778505e-06, "loss": 0.043, "num_tokens": 1682614.0, "reward": 0.20000001788139343, "reward_std": 0.31392204761505127, "rewards/reward_func/mean": 0.20000001788139343, "rewards/reward_func/std": 0.46882835030555725, "sampling/importance_sampling_ratio/max": 1.7084823846817017, "sampling/importance_sampling_ratio/mean": 1.151247262954712, "sampling/importance_sampling_ratio/min": 0.4232744872570038, "sampling/sampling_logp_difference/max": 0.7267614006996155, "sampling/sampling_logp_difference/mean": 0.02674541436135769, "step": 301, "step_time": 44.15970336600731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 55.375, "completions/mean_terminated_length": 55.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.35346049070358276, "epoch": 0.604, "frac_reward_zero_std": 0.0, "grad_norm": 1.0315412282943726, "kl": 0.019526183605194092, "learning_rate": 4.09730021897807e-06, "loss": 0.0904, "num_tokens": 1688667.0, "reward": 0.45750001072883606, "reward_std": 0.5982871055603027, "rewards/reward_func/mean": 0.45750001072883606, "rewards/reward_func/std": 0.5541467070579529, "sampling/importance_sampling_ratio/max": 1.6532517671585083, "sampling/importance_sampling_ratio/mean": 1.1429578065872192, "sampling/importance_sampling_ratio/min": 0.5493549108505249, "sampling/sampling_logp_difference/max": 0.7110903263092041, "sampling/sampling_logp_difference/mean": 0.02418721280992031, "step": 302, "step_time": 36.97923886300123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3902936577796936, "epoch": 0.606, "frac_reward_zero_std": 0.0, "grad_norm": 1.2959387302398682, "kl": 0.022301558405160904, "learning_rate": 4.091063133954821e-06, "loss": 0.1146, "num_tokens": 1694644.0, "reward": 0.35249999165534973, "reward_std": 0.545343279838562, "rewards/reward_func/mean": 0.35249999165534973, "rewards/reward_func/std": 0.5202403664588928, "sampling/importance_sampling_ratio/max": 2.390007257461548, "sampling/importance_sampling_ratio/mean": 0.9220224618911743, "sampling/importance_sampling_ratio/min": 0.2947590947151184, "sampling/sampling_logp_difference/max": 0.6841628551483154, "sampling/sampling_logp_difference/mean": 0.03195720165967941, "step": 303, "step_time": 32.29864318299224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 56.125, "completions/mean_terminated_length": 56.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3676874041557312, "epoch": 0.608, "frac_reward_zero_std": 0.0, "grad_norm": 0.6491054892539978, "kl": 0.00750330463051796, "learning_rate": 4.084809359432175e-06, "loss": -0.1916, "num_tokens": 1700155.0, "reward": 0.3462499976158142, "reward_std": 0.5419092178344727, "rewards/reward_func/mean": 0.3462499976158142, "rewards/reward_func/std": 0.5199158787727356, "sampling/importance_sampling_ratio/max": 1.2273759841918945, "sampling/importance_sampling_ratio/mean": 0.6842429637908936, "sampling/importance_sampling_ratio/min": 0.40895017981529236, "sampling/sampling_logp_difference/max": 0.3567380905151367, "sampling/sampling_logp_difference/mean": 0.02262764982879162, "step": 304, "step_time": 40.56021591799799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.34539082646369934, "epoch": 0.61, "frac_reward_zero_std": 0.0, "grad_norm": 0.7469095587730408, "kl": 0.02391725406050682, "learning_rate": 4.0785389610092684e-06, "loss": 0.0006, "num_tokens": 1705982.0, "reward": 0.3374999761581421, "reward_std": 0.5472263097763062, "rewards/reward_func/mean": 0.3374999761581421, "rewards/reward_func/std": 0.5298989415168762, "sampling/importance_sampling_ratio/max": 1.1961729526519775, "sampling/importance_sampling_ratio/mean": 0.8354759216308594, "sampling/importance_sampling_ratio/min": 0.4396790862083435, "sampling/sampling_logp_difference/max": 0.4689488410949707, "sampling/sampling_logp_difference/mean": 0.023250753059983253, "step": 305, "step_time": 38.657245167996734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.33104047179222107, "epoch": 0.612, "frac_reward_zero_std": 0.0, "grad_norm": 1.3925511837005615, "kl": 0.03172261267900467, "learning_rate": 4.072252004459612e-06, "loss": 0.0256, "num_tokens": 1711741.0, "reward": 0.036250002682209015, "reward_std": 0.2732139229774475, "rewards/reward_func/mean": 0.036250002682209015, "rewards/reward_func/std": 0.35991817712783813, "sampling/importance_sampling_ratio/max": 2.1978962421417236, "sampling/importance_sampling_ratio/mean": 1.1438350677490234, "sampling/importance_sampling_ratio/min": 0.12577073276042938, "sampling/sampling_logp_difference/max": 0.8553478717803955, "sampling/sampling_logp_difference/mean": 0.027399186044931412, "step": 306, "step_time": 42.64118599900394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.37816122174263, "epoch": 0.614, "frac_reward_zero_std": 0.0, "grad_norm": 0.8967448472976685, "kl": 0.01576385274529457, "learning_rate": 4.065948555730405e-06, "loss": -0.0581, "num_tokens": 1717802.0, "reward": 0.04249999672174454, "reward_std": 0.29508817195892334, "rewards/reward_func/mean": 0.04249999672174454, "rewards/reward_func/std": 0.37174299359321594, "sampling/importance_sampling_ratio/max": 1.6180919408798218, "sampling/importance_sampling_ratio/mean": 1.0993964672088623, "sampling/importance_sampling_ratio/min": 0.5014515519142151, "sampling/sampling_logp_difference/max": 0.4994962215423584, "sampling/sampling_logp_difference/mean": 0.024644112214446068, "step": 307, "step_time": 43.42916698999761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.2961156666278839, "epoch": 0.616, "frac_reward_zero_std": 0.0, "grad_norm": 0.7969692945480347, "kl": 0.02517705410718918, "learning_rate": 4.059628680941843e-06, "loss": -0.0153, "num_tokens": 1723439.0, "reward": 0.32624998688697815, "reward_std": 0.5344488620758057, "rewards/reward_func/mean": 0.32624998688697815, "rewards/reward_func/std": 0.521123468875885, "sampling/importance_sampling_ratio/max": 1.6039814949035645, "sampling/importance_sampling_ratio/mean": 0.8779284358024597, "sampling/importance_sampling_ratio/min": 0.5089890956878662, "sampling/sampling_logp_difference/max": 0.86330646276474, "sampling/sampling_logp_difference/mean": 0.023059822618961334, "step": 308, "step_time": 39.59198473599099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3278340995311737, "epoch": 0.618, "frac_reward_zero_std": 0.0, "grad_norm": 1.0694993734359741, "kl": 0.040127597749233246, "learning_rate": 4.053292446386422e-06, "loss": 0.0202, "num_tokens": 1728596.0, "reward": 0.7137500047683716, "reward_std": 0.32740655541419983, "rewards/reward_func/mean": 0.7137500047683716, "rewards/reward_func/std": 0.49684542417526245, "sampling/importance_sampling_ratio/max": 1.6734663248062134, "sampling/importance_sampling_ratio/mean": 0.9968644976615906, "sampling/importance_sampling_ratio/min": 0.4512510895729065, "sampling/sampling_logp_difference/max": 0.7595298290252686, "sampling/sampling_logp_difference/mean": 0.026049617677927017, "step": 309, "step_time": 34.06555557799584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.625, "completions/mean_terminated_length": 51.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3629554510116577, "epoch": 0.62, "frac_reward_zero_std": 0.0, "grad_norm": 1.4105416536331177, "kl": 0.025173306465148926, "learning_rate": 4.046939918528243e-06, "loss": -0.1381, "num_tokens": 1734589.0, "reward": 0.21250000596046448, "reward_std": 0.519790768623352, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.48130035400390625, "sampling/importance_sampling_ratio/max": 1.744605541229248, "sampling/importance_sampling_ratio/mean": 1.0676430463790894, "sampling/importance_sampling_ratio/min": 0.6928181052207947, "sampling/sampling_logp_difference/max": 0.3814241886138916, "sampling/sampling_logp_difference/mean": 0.025705184787511826, "step": 310, "step_time": 40.15090221299033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.32077813148498535, "epoch": 0.622, "frac_reward_zero_std": 0.0, "grad_norm": 3.09201979637146, "kl": 0.04986204952001572, "learning_rate": 4.040571164002319e-06, "loss": -0.5408, "num_tokens": 1740635.0, "reward": 0.1899999976158142, "reward_std": 0.5336418151855469, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.49503248929977417, "sampling/importance_sampling_ratio/max": 2.502277374267578, "sampling/importance_sampling_ratio/mean": 1.2795238494873047, "sampling/importance_sampling_ratio/min": 0.3709660768508911, "sampling/sampling_logp_difference/max": 1.0115962028503418, "sampling/sampling_logp_difference/mean": 0.027241632342338562, "step": 311, "step_time": 42.1582357169973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 51.375, "completions/mean_terminated_length": 51.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3752894401550293, "epoch": 0.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.7901284694671631, "kl": 0.04521573334932327, "learning_rate": 4.034186249613869e-06, "loss": 0.1113, "num_tokens": 1746240.0, "reward": 0.07625000923871994, "reward_std": 0.2738334536552429, "rewards/reward_func/mean": 0.07625000923871994, "rewards/reward_func/std": 0.36629176139831543, "sampling/importance_sampling_ratio/max": 1.678858757019043, "sampling/importance_sampling_ratio/mean": 0.8888430595397949, "sampling/importance_sampling_ratio/min": 0.30794695019721985, "sampling/sampling_logp_difference/max": 0.8799378871917725, "sampling/sampling_logp_difference/mean": 0.026124432682991028, "step": 312, "step_time": 39.03588431800017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.33964860439300537, "epoch": 0.626, "frac_reward_zero_std": 0.0, "grad_norm": 0.9753314852714539, "kl": 0.018361952155828476, "learning_rate": 4.027785242337626e-06, "loss": -0.0936, "num_tokens": 1751664.0, "reward": 0.3450000286102295, "reward_std": 0.5599009394645691, "rewards/reward_func/mean": 0.3450000286102295, "rewards/reward_func/std": 0.5376669764518738, "sampling/importance_sampling_ratio/max": 1.7452173233032227, "sampling/importance_sampling_ratio/mean": 0.9294567704200745, "sampling/importance_sampling_ratio/min": 0.40640538930892944, "sampling/sampling_logp_difference/max": 0.7474876046180725, "sampling/sampling_logp_difference/mean": 0.02840811386704445, "step": 313, "step_time": 35.24361865199171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.31101056933403015, "epoch": 0.628, "frac_reward_zero_std": 0.0, "grad_norm": 0.8589064478874207, "kl": 0.028528966009616852, "learning_rate": 4.021368209317126e-06, "loss": -0.0071, "num_tokens": 1756602.0, "reward": 0.33250001072883606, "reward_std": 0.5354459285736084, "rewards/reward_func/mean": 0.33250001072883606, "rewards/reward_func/std": 0.5180389285087585, "sampling/importance_sampling_ratio/max": 1.1891282796859741, "sampling/importance_sampling_ratio/mean": 0.6778733730316162, "sampling/importance_sampling_ratio/min": 0.2918332517147064, "sampling/sampling_logp_difference/max": 1.3020625114440918, "sampling/sampling_logp_difference/mean": 0.02469920739531517, "step": 314, "step_time": 29.316569301998243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3299601674079895, "epoch": 0.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.8943186402320862, "kl": 0.02374444343149662, "learning_rate": 4.014935217864009e-06, "loss": 0.125, "num_tokens": 1762205.0, "reward": 0.2212499976158142, "reward_std": 0.5198818445205688, "rewards/reward_func/mean": 0.2212499976158142, "rewards/reward_func/std": 0.4814394414424896, "sampling/importance_sampling_ratio/max": 1.4114490747451782, "sampling/importance_sampling_ratio/mean": 0.8938862085342407, "sampling/importance_sampling_ratio/min": 0.5008694529533386, "sampling/sampling_logp_difference/max": 0.34326255321502686, "sampling/sampling_logp_difference/mean": 0.020477421581745148, "step": 315, "step_time": 41.47385054100596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 57.625, "completions/mean_terminated_length": 57.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3791668713092804, "epoch": 0.632, "frac_reward_zero_std": 0.0, "grad_norm": 0.6989040374755859, "kl": 0.031720541417598724, "learning_rate": 4.008486335457312e-06, "loss": -0.2414, "num_tokens": 1767759.0, "reward": 0.07500000298023224, "reward_std": 0.30030137300491333, "rewards/reward_func/mean": 0.07500000298023224, "rewards/reward_func/std": 0.37826672196388245, "sampling/importance_sampling_ratio/max": 2.765488862991333, "sampling/importance_sampling_ratio/mean": 1.0680071115493774, "sampling/importance_sampling_ratio/min": 0.15137039124965668, "sampling/sampling_logp_difference/max": 0.9639625549316406, "sampling/sampling_logp_difference/mean": 0.028898822143673897, "step": 316, "step_time": 46.809593726997264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3248460590839386, "epoch": 0.634, "frac_reward_zero_std": 0.0, "grad_norm": 0.9084358215332031, "kl": 0.021403685212135315, "learning_rate": 4.002021629742759e-06, "loss": -0.0513, "num_tokens": 1773686.0, "reward": 0.21000000834465027, "reward_std": 0.32260364294052124, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.4816637933254242, "sampling/importance_sampling_ratio/max": 1.897550106048584, "sampling/importance_sampling_ratio/mean": 1.1017444133758545, "sampling/importance_sampling_ratio/min": 0.3599690794944763, "sampling/sampling_logp_difference/max": 0.6600342988967896, "sampling/sampling_logp_difference/mean": 0.019616486504673958, "step": 317, "step_time": 44.18835420500545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.337682843208313, "epoch": 0.636, "frac_reward_zero_std": 0.0, "grad_norm": 0.7857117652893066, "kl": 0.021645016968250275, "learning_rate": 3.995541168532055e-06, "loss": -0.3915, "num_tokens": 1779077.0, "reward": 0.1925000101327896, "reward_std": 0.3348638415336609, "rewards/reward_func/mean": 0.1925000101327896, "rewards/reward_func/std": 0.4793075621128082, "sampling/importance_sampling_ratio/max": 1.4211463928222656, "sampling/importance_sampling_ratio/mean": 0.8854165077209473, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6028759479522705, "sampling/sampling_logp_difference/mean": 0.024257110431790352, "step": 318, "step_time": 39.01218066600268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 60.5, "completions/mean_terminated_length": 60.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.38314715027809143, "epoch": 0.638, "frac_reward_zero_std": 0.0, "grad_norm": 0.7992427945137024, "kl": 0.02151847444474697, "learning_rate": 3.989045019802171e-06, "loss": 0.108, "num_tokens": 1785372.0, "reward": 0.07500000298023224, "reward_std": 0.2610323429107666, "rewards/reward_func/mean": 0.07500000298023224, "rewards/reward_func/std": 0.3674623668193817, "sampling/importance_sampling_ratio/max": 1.8030726909637451, "sampling/importance_sampling_ratio/mean": 1.1737439632415771, "sampling/importance_sampling_ratio/min": 0.739587664604187, "sampling/sampling_logp_difference/max": 0.7126893997192383, "sampling/sampling_logp_difference/mean": 0.02029246836900711, "step": 319, "step_time": 43.12463011700311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.332579642534256, "epoch": 0.64, "frac_reward_zero_std": 0.0, "grad_norm": 2.3053195476531982, "kl": 0.724174439907074, "learning_rate": 3.982533251694632e-06, "loss": -0.22, "num_tokens": 1791652.0, "reward": 0.07124999910593033, "reward_std": 0.2992333769798279, "rewards/reward_func/mean": 0.07124999910593033, "rewards/reward_func/std": 0.3789623975753784, "sampling/importance_sampling_ratio/max": 1.868449330329895, "sampling/importance_sampling_ratio/mean": 0.8420298099517822, "sampling/importance_sampling_ratio/min": 0.3771025538444519, "sampling/sampling_logp_difference/max": 1.4212937355041504, "sampling/sampling_logp_difference/mean": 0.026473678648471832, "step": 320, "step_time": 53.79963376899832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.31947869062423706, "epoch": 0.642, "frac_reward_zero_std": 0.0, "grad_norm": 0.9473581314086914, "kl": 0.01975640282034874, "learning_rate": 3.976005932514807e-06, "loss": -0.0296, "num_tokens": 1796652.0, "reward": -0.051249995827674866, "reward_std": 0.0348023921251297, "rewards/reward_func/mean": -0.051249995827674866, "rewards/reward_func/std": 0.04642582684755325, "sampling/importance_sampling_ratio/max": 1.4474753141403198, "sampling/importance_sampling_ratio/mean": 0.9341865181922913, "sampling/importance_sampling_ratio/min": 0.4438150227069855, "sampling/sampling_logp_difference/max": 0.5270947217941284, "sampling/sampling_logp_difference/mean": 0.026481110602617264, "step": 321, "step_time": 31.841147099010414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3830873966217041, "epoch": 0.644, "frac_reward_zero_std": 0.0, "grad_norm": 0.9753480553627014, "kl": 0.01734742894768715, "learning_rate": 3.969463130731183e-06, "loss": 0.0279, "num_tokens": 1802874.0, "reward": 0.1899999976158142, "reward_std": 0.5408111810684204, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.5016544461250305, "sampling/importance_sampling_ratio/max": 1.7382115125656128, "sampling/importance_sampling_ratio/mean": 0.9483833909034729, "sampling/importance_sampling_ratio/min": 0.5650532245635986, "sampling/sampling_logp_difference/max": 0.4500439167022705, "sampling/sampling_logp_difference/mean": 0.024530794471502304, "step": 322, "step_time": 40.43267983599799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 50.875, "completions/mean_terminated_length": 50.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3294594883918762, "epoch": 0.646, "frac_reward_zero_std": 0.0, "grad_norm": 0.8316826820373535, "kl": 0.01684340089559555, "learning_rate": 3.962904914974656e-06, "loss": -0.0565, "num_tokens": 1808418.0, "reward": 0.19499999284744263, "reward_std": 0.33696448802948, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.4996570646762848, "sampling/importance_sampling_ratio/max": 1.5402275323867798, "sampling/importance_sampling_ratio/mean": 0.6515508890151978, "sampling/importance_sampling_ratio/min": 0.23761314153671265, "sampling/sampling_logp_difference/max": 0.7509863376617432, "sampling/sampling_logp_difference/mean": 0.02614228054881096, "step": 323, "step_time": 36.79304045200115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.33004504442214966, "epoch": 0.648, "frac_reward_zero_std": 0.0, "grad_norm": 1.1432764530181885, "kl": 0.02042960189282894, "learning_rate": 3.956331354037805e-06, "loss": -0.1958, "num_tokens": 1813504.0, "reward": 0.20375001430511475, "reward_std": 0.32482287287712097, "rewards/reward_func/mean": 0.20375001430511475, "rewards/reward_func/std": 0.4935567080974579, "sampling/importance_sampling_ratio/max": 2.893115758895874, "sampling/importance_sampling_ratio/mean": 1.2178199291229248, "sampling/importance_sampling_ratio/min": 0.6653203368186951, "sampling/sampling_logp_difference/max": 0.5842078924179077, "sampling/sampling_logp_difference/mean": 0.021406445652246475, "step": 324, "step_time": 28.069678256011684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.2968294024467468, "epoch": 0.65, "frac_reward_zero_std": 0.0, "grad_norm": 1.0796535015106201, "kl": 0.015129530802369118, "learning_rate": 3.949742516874175e-06, "loss": 0.0111, "num_tokens": 1819379.0, "reward": 0.07000000029802322, "reward_std": 0.260448157787323, "rewards/reward_func/mean": 0.07000000029802322, "rewards/reward_func/std": 0.3445908725261688, "sampling/importance_sampling_ratio/max": 1.8804163932800293, "sampling/importance_sampling_ratio/mean": 1.0742685794830322, "sampling/importance_sampling_ratio/min": 0.46257898211479187, "sampling/sampling_logp_difference/max": 0.3363761901855469, "sampling/sampling_logp_difference/mean": 0.019979190081357956, "step": 325, "step_time": 41.76114026000141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 57.875, "completions/mean_terminated_length": 57.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3415643870830536, "epoch": 0.652, "frac_reward_zero_std": 0.0, "grad_norm": 1.1616402864456177, "kl": 0.011725490912795067, "learning_rate": 3.943138472597549e-06, "loss": 0.0107, "num_tokens": 1824630.0, "reward": 0.5975000262260437, "reward_std": 0.5574594736099243, "rewards/reward_func/mean": 0.5975000262260437, "rewards/reward_func/std": 0.5363035202026367, "sampling/importance_sampling_ratio/max": 1.4799103736877441, "sampling/importance_sampling_ratio/mean": 1.0802059173583984, "sampling/importance_sampling_ratio/min": 0.49768996238708496, "sampling/sampling_logp_difference/max": 0.31270480155944824, "sampling/sampling_logp_difference/mean": 0.022173818200826645, "step": 326, "step_time": 22.385102098996867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.32594597339630127, "epoch": 0.654, "frac_reward_zero_std": 0.0, "grad_norm": 0.9298455119132996, "kl": 0.013603993691504002, "learning_rate": 3.936519290481226e-06, "loss": -0.1398, "num_tokens": 1830458.0, "reward": -0.04500000178813934, "reward_std": 0.03829461336135864, "rewards/reward_func/mean": -0.04500000178813934, "rewards/reward_func/std": 0.037032805383205414, "sampling/importance_sampling_ratio/max": 1.7700729370117188, "sampling/importance_sampling_ratio/mean": 0.8740437030792236, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.3294541835784912, "sampling/sampling_logp_difference/mean": 0.022188276052474976, "step": 327, "step_time": 40.74412981000205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 60.5, "completions/mean_terminated_length": 60.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.344268262386322, "epoch": 0.656, "frac_reward_zero_std": 0.0, "grad_norm": 1.0750876665115356, "kl": 0.028728434816002846, "learning_rate": 3.929885039957296e-06, "loss": 0.1486, "num_tokens": 1835664.0, "reward": 0.19750000536441803, "reward_std": 0.5264096856117249, "rewards/reward_func/mean": 0.19750000536441803, "rewards/reward_func/std": 0.4876694083213806, "sampling/importance_sampling_ratio/max": 2.0909793376922607, "sampling/importance_sampling_ratio/mean": 0.974539577960968, "sampling/importance_sampling_ratio/min": 0.5380573868751526, "sampling/sampling_logp_difference/max": 0.4193446636199951, "sampling/sampling_logp_difference/mean": 0.023767180740833282, "step": 328, "step_time": 44.32392562199675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.2737228572368622, "epoch": 0.658, "frac_reward_zero_std": 0.0, "grad_norm": 0.8460615873336792, "kl": 0.04131521284580231, "learning_rate": 3.923235790615907e-06, "loss": -0.2082, "num_tokens": 1841050.0, "reward": 0.06874999403953552, "reward_std": 0.28597307205200195, "rewards/reward_func/mean": 0.06874999403953552, "rewards/reward_func/std": 0.37911316752433777, "sampling/importance_sampling_ratio/max": 1.7139233350753784, "sampling/importance_sampling_ratio/mean": 0.9048976898193359, "sampling/importance_sampling_ratio/min": 0.1554194986820221, "sampling/sampling_logp_difference/max": 0.7940307855606079, "sampling/sampling_logp_difference/mean": 0.022876432165503502, "step": 329, "step_time": 32.28045204099908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.30464908480644226, "epoch": 0.66, "frac_reward_zero_std": 0.0, "grad_norm": 0.9566115736961365, "kl": 0.021810725331306458, "learning_rate": 3.916571612204538e-06, "loss": -0.1048, "num_tokens": 1846399.0, "reward": 0.20374999940395355, "reward_std": 0.5155032873153687, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.477491557598114, "sampling/importance_sampling_ratio/max": 1.5081989765167236, "sampling/importance_sampling_ratio/mean": 0.9097875952720642, "sampling/importance_sampling_ratio/min": 0.47412431240081787, "sampling/sampling_logp_difference/max": 0.46178531646728516, "sampling/sampling_logp_difference/mean": 0.02363799698650837, "step": 330, "step_time": 32.77630696099368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.29317551851272583, "epoch": 0.662, "frac_reward_zero_std": 0.0, "grad_norm": 0.9641599655151367, "kl": 0.019616033881902695, "learning_rate": 3.909892574627267e-06, "loss": -0.1627, "num_tokens": 1852264.0, "reward": 0.1824999898672104, "reward_std": 0.32395851612091064, "rewards/reward_func/mean": 0.1824999898672104, "rewards/reward_func/std": 0.4794565439224243, "sampling/importance_sampling_ratio/max": 2.4280335903167725, "sampling/importance_sampling_ratio/mean": 1.0602843761444092, "sampling/importance_sampling_ratio/min": 0.48485422134399414, "sampling/sampling_logp_difference/max": 0.336214542388916, "sampling/sampling_logp_difference/mean": 0.025283973664045334, "step": 331, "step_time": 40.58578193899302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.35180217027664185, "epoch": 0.664, "frac_reward_zero_std": 0.0, "grad_norm": 1.7843549251556396, "kl": 0.036747075617313385, "learning_rate": 3.903198747944037e-06, "loss": -0.0709, "num_tokens": 1858008.0, "reward": 0.08874999731779099, "reward_std": 0.27637845277786255, "rewards/reward_func/mean": 0.08874999731779099, "rewards/reward_func/std": 0.36910849809646606, "sampling/importance_sampling_ratio/max": 2.3475561141967773, "sampling/importance_sampling_ratio/mean": 1.002429485321045, "sampling/importance_sampling_ratio/min": 0.3496679663658142, "sampling/sampling_logp_difference/max": 0.7366769313812256, "sampling/sampling_logp_difference/mean": 0.029994945973157883, "step": 332, "step_time": 40.87763853299839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3407575488090515, "epoch": 0.666, "frac_reward_zero_std": 0.0, "grad_norm": 1.428694486618042, "kl": 0.016911303624510765, "learning_rate": 3.896490202369924e-06, "loss": 0.0366, "num_tokens": 1863181.0, "reward": 0.48625001311302185, "reward_std": 0.590417206287384, "rewards/reward_func/mean": 0.48625001311302185, "rewards/reward_func/std": 0.5468333959579468, "sampling/importance_sampling_ratio/max": 1.7516218423843384, "sampling/importance_sampling_ratio/mean": 1.0637693405151367, "sampling/importance_sampling_ratio/min": 0.2746276259422302, "sampling/sampling_logp_difference/max": 1.1909523010253906, "sampling/sampling_logp_difference/mean": 0.026827020570635796, "step": 333, "step_time": 21.48538727000414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3031199276447296, "epoch": 0.668, "frac_reward_zero_std": 0.0, "grad_norm": 0.957080066204071, "kl": 0.014239022508263588, "learning_rate": 3.889767008274396e-06, "loss": 0.02, "num_tokens": 1868717.0, "reward": 0.21875, "reward_std": 0.29802343249320984, "rewards/reward_func/mean": 0.21875, "rewards/reward_func/std": 0.467743456363678, "sampling/importance_sampling_ratio/max": 1.57808256149292, "sampling/importance_sampling_ratio/mean": 0.9328022599220276, "sampling/importance_sampling_ratio/min": 0.6014644503593445, "sampling/sampling_logp_difference/max": 0.4710826873779297, "sampling/sampling_logp_difference/mean": 0.019350770860910416, "step": 334, "step_time": 38.31029029999627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3382827043533325, "epoch": 0.67, "frac_reward_zero_std": 0.0, "grad_norm": 1.2987167835235596, "kl": 0.030748853459954262, "learning_rate": 3.883029236180577e-06, "loss": 0.4063, "num_tokens": 1874967.0, "reward": 0.0925000011920929, "reward_std": 0.26742854714393616, "rewards/reward_func/mean": 0.0925000011920929, "rewards/reward_func/std": 0.3517608642578125, "sampling/importance_sampling_ratio/max": 2.299546957015991, "sampling/importance_sampling_ratio/mean": 1.0671589374542236, "sampling/importance_sampling_ratio/min": 0.40785279870033264, "sampling/sampling_logp_difference/max": 0.44230735301971436, "sampling/sampling_logp_difference/mean": 0.023608166724443436, "step": 335, "step_time": 48.46738143800758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.31930792331695557, "epoch": 0.672, "frac_reward_zero_std": 0.0, "grad_norm": 1.3972865343093872, "kl": 0.025625426322221756, "learning_rate": 3.876276956764509e-06, "loss": 0.0398, "num_tokens": 1880164.0, "reward": 0.19624999165534973, "reward_std": 0.311894953250885, "rewards/reward_func/mean": 0.19624999165534973, "rewards/reward_func/std": 0.47234785556793213, "sampling/importance_sampling_ratio/max": 1.2718799114227295, "sampling/importance_sampling_ratio/mean": 0.8168869018554688, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6838905811309814, "sampling/sampling_logp_difference/mean": 0.02987261861562729, "step": 336, "step_time": 31.000979021002422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 55.875, "completions/mean_terminated_length": 55.875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.3230094015598297, "epoch": 0.674, "frac_reward_zero_std": 0.0, "grad_norm": 1.0229469537734985, "kl": 0.015833809971809387, "learning_rate": 3.869510240854408e-06, "loss": 0.3128, "num_tokens": 1885758.0, "reward": 0.057499997317790985, "reward_std": 0.27753278613090515, "rewards/reward_func/mean": 0.057499997317790985, "rewards/reward_func/std": 0.3586781322956085, "sampling/importance_sampling_ratio/max": 2.7663161754608154, "sampling/importance_sampling_ratio/mean": 1.203728199005127, "sampling/importance_sampling_ratio/min": 0.38395655155181885, "sampling/sampling_logp_difference/max": 0.44419431686401367, "sampling/sampling_logp_difference/mean": 0.02450854331254959, "step": 337, "step_time": 40.59597251701052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.34267550706863403, "epoch": 0.676, "frac_reward_zero_std": 0.0, "grad_norm": 0.7642379999160767, "kl": 0.03843656927347183, "learning_rate": 3.862729159429921e-06, "loss": -0.2044, "num_tokens": 1891187.0, "reward": 0.7300000190734863, "reward_std": 0.5168682336807251, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.47883790731430054, "sampling/importance_sampling_ratio/max": 1.2588783502578735, "sampling/importance_sampling_ratio/mean": 0.7454515695571899, "sampling/importance_sampling_ratio/min": 0.35877272486686707, "sampling/sampling_logp_difference/max": 0.8279721736907959, "sampling/sampling_logp_difference/mean": 0.027465185150504112, "step": 338, "step_time": 28.354054954994353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3062908947467804, "epoch": 0.678, "frac_reward_zero_std": 0.0, "grad_norm": 0.8934028148651123, "kl": 0.017625048756599426, "learning_rate": 3.855933783621384e-06, "loss": -0.0268, "num_tokens": 1896594.0, "reward": 0.03374999761581421, "reward_std": 0.2926676571369171, "rewards/reward_func/mean": 0.03374999761581421, "rewards/reward_func/std": 0.38116130232810974, "sampling/importance_sampling_ratio/max": 2.695901870727539, "sampling/importance_sampling_ratio/mean": 1.0782511234283447, "sampling/importance_sampling_ratio/min": 0.34986376762390137, "sampling/sampling_logp_difference/max": 1.0531506538391113, "sampling/sampling_logp_difference/mean": 0.02146495133638382, "step": 339, "step_time": 38.300622745009605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.30942976474761963, "epoch": 0.68, "frac_reward_zero_std": 0.0, "grad_norm": 1.121816873550415, "kl": 0.02171071618795395, "learning_rate": 3.849124184709073e-06, "loss": -0.0232, "num_tokens": 1901989.0, "reward": 0.08500000089406967, "reward_std": 0.28684449195861816, "rewards/reward_func/mean": 0.08500000089406967, "rewards/reward_func/std": 0.37171417474746704, "sampling/importance_sampling_ratio/max": 1.164718508720398, "sampling/importance_sampling_ratio/mean": 0.9217618703842163, "sampling/importance_sampling_ratio/min": 0.47626274824142456, "sampling/sampling_logp_difference/max": 0.9351463317871094, "sampling/sampling_logp_difference/mean": 0.021724089980125427, "step": 340, "step_time": 41.05157758499263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3553770184516907, "epoch": 0.682, "frac_reward_zero_std": 0.0, "grad_norm": 0.8589447140693665, "kl": 0.023396966978907585, "learning_rate": 3.84230043412246e-06, "loss": 0.1354, "num_tokens": 1907544.0, "reward": 0.16500000655651093, "reward_std": 0.5003730058670044, "rewards/reward_func/mean": 0.16500000655651093, "rewards/reward_func/std": 0.46800491213798523, "sampling/importance_sampling_ratio/max": 1.3289875984191895, "sampling/importance_sampling_ratio/mean": 0.7921176552772522, "sampling/importance_sampling_ratio/min": 0.35033443570137024, "sampling/sampling_logp_difference/max": 0.6369071006774902, "sampling/sampling_logp_difference/mean": 0.03014998510479927, "step": 341, "step_time": 35.57554464499117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3291049003601074, "epoch": 0.684, "frac_reward_zero_std": 0.0, "grad_norm": 0.9054797887802124, "kl": 0.02581053599715233, "learning_rate": 3.835462603439458e-06, "loss": 0.0187, "num_tokens": 1912578.0, "reward": 0.3537500202655792, "reward_std": 0.5527259111404419, "rewards/reward_func/mean": 0.3537500202655792, "rewards/reward_func/std": 0.5297961831092834, "sampling/importance_sampling_ratio/max": 1.383819818496704, "sampling/importance_sampling_ratio/mean": 0.9456525444984436, "sampling/importance_sampling_ratio/min": 0.24566781520843506, "sampling/sampling_logp_difference/max": 1.197718858718872, "sampling/sampling_logp_difference/mean": 0.025123560801148415, "step": 342, "step_time": 33.69786287700117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3272516131401062, "epoch": 0.686, "frac_reward_zero_std": 0.0, "grad_norm": 0.9231871366500854, "kl": 0.0600452721118927, "learning_rate": 3.828610764385676e-06, "loss": 0.1178, "num_tokens": 1918689.0, "reward": 0.11375001072883606, "reward_std": 0.24988916516304016, "rewards/reward_func/mean": 0.11375001072883606, "rewards/reward_func/std": 0.34616008400917053, "sampling/importance_sampling_ratio/max": 1.4086482524871826, "sampling/importance_sampling_ratio/mean": 0.9599908590316772, "sampling/importance_sampling_ratio/min": 0.3553762137889862, "sampling/sampling_logp_difference/max": 0.7473673820495605, "sampling/sampling_logp_difference/mean": 0.02560514770448208, "step": 343, "step_time": 38.41646344099718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3004932999610901, "epoch": 0.688, "frac_reward_zero_std": 0.0, "grad_norm": 0.7889523506164551, "kl": 0.015588251873850822, "learning_rate": 3.821744988833664e-06, "loss": 0.0061, "num_tokens": 1924332.0, "reward": 0.3400000035762787, "reward_std": 0.2622142732143402, "rewards/reward_func/mean": 0.3400000035762787, "rewards/reward_func/std": 0.5224120020866394, "sampling/importance_sampling_ratio/max": 0.943295955657959, "sampling/importance_sampling_ratio/mean": 0.6444682478904724, "sampling/importance_sampling_ratio/min": 0.2145964801311493, "sampling/sampling_logp_difference/max": 0.5309677124023438, "sampling/sampling_logp_difference/mean": 0.0254978034645319, "step": 344, "step_time": 31.367738123008166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3225318193435669, "epoch": 0.69, "frac_reward_zero_std": 0.0, "grad_norm": 0.6011006236076355, "kl": 0.022544488310813904, "learning_rate": 3.814865348802157e-06, "loss": 0.1901, "num_tokens": 1929199.0, "reward": 0.22499999403953552, "reward_std": 0.5106456279754639, "rewards/reward_func/mean": 0.22499999403953552, "rewards/reward_func/std": 0.4728938341140747, "sampling/importance_sampling_ratio/max": 1.5084651708602905, "sampling/importance_sampling_ratio/mean": 0.8030335903167725, "sampling/importance_sampling_ratio/min": 0.23240113258361816, "sampling/sampling_logp_difference/max": 0.6574427485466003, "sampling/sampling_logp_difference/mean": 0.023013845086097717, "step": 345, "step_time": 30.061961209998117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.375, "completions/mean_terminated_length": 56.375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.31706368923187256, "epoch": 0.692, "frac_reward_zero_std": 0.0, "grad_norm": 1.0651155710220337, "kl": 0.018591083586215973, "learning_rate": 3.807971916455325e-06, "loss": 0.2587, "num_tokens": 1934113.0, "reward": 0.33500000834465027, "reward_std": 0.5684312582015991, "rewards/reward_func/mean": 0.33500000834465027, "rewards/reward_func/std": 0.5465737581253052, "sampling/importance_sampling_ratio/max": 2.3241448402404785, "sampling/importance_sampling_ratio/mean": 1.1834617853164673, "sampling/importance_sampling_ratio/min": 0.557026743888855, "sampling/sampling_logp_difference/max": 0.49699926376342773, "sampling/sampling_logp_difference/mean": 0.019841229543089867, "step": 346, "step_time": 29.66081979201408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 60.75, "completions/mean_terminated_length": 60.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.3459808826446533, "epoch": 0.694, "frac_reward_zero_std": 0.0, "grad_norm": 0.9309832453727722, "kl": 0.017617210745811462, "learning_rate": 3.8010647641020116e-06, "loss": -0.013, "num_tokens": 1939744.0, "reward": 0.4462500214576721, "reward_std": 0.5236546397209167, "rewards/reward_func/mean": 0.4462500214576721, "rewards/reward_func/std": 0.568580687046051, "sampling/importance_sampling_ratio/max": 1.3764897584915161, "sampling/importance_sampling_ratio/mean": 1.0339674949645996, "sampling/importance_sampling_ratio/min": 0.5544477105140686, "sampling/sampling_logp_difference/max": 0.5542728900909424, "sampling/sampling_logp_difference/mean": 0.020336320623755455, "step": 347, "step_time": 39.97378583300451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.30096814036369324, "epoch": 0.696, "frac_reward_zero_std": 0.0, "grad_norm": 1.1978667974472046, "kl": 0.032052673399448395, "learning_rate": 3.794143964194976e-06, "loss": 0.2779, "num_tokens": 1945009.0, "reward": 0.20250000059604645, "reward_std": 0.516106367111206, "rewards/reward_func/mean": 0.20250000059604645, "rewards/reward_func/std": 0.47805407643318176, "sampling/importance_sampling_ratio/max": 1.9551745653152466, "sampling/importance_sampling_ratio/mean": 1.1093769073486328, "sampling/importance_sampling_ratio/min": 0.2619774639606476, "sampling/sampling_logp_difference/max": 0.6207488775253296, "sampling/sampling_logp_difference/mean": 0.025560472160577774, "step": 348, "step_time": 35.64131051799632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 58.875, "completions/mean_terminated_length": 58.875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.3116108775138855, "epoch": 0.698, "frac_reward_zero_std": 0.0, "grad_norm": 0.8886599540710449, "kl": 0.01983170211315155, "learning_rate": 3.7872095893301344e-06, "loss": -0.1341, "num_tokens": 1950903.0, "reward": 0.49125000834465027, "reward_std": 0.4993107318878174, "rewards/reward_func/mean": 0.49125000834465027, "rewards/reward_func/std": 0.5333302021026611, "sampling/importance_sampling_ratio/max": 1.8198500871658325, "sampling/importance_sampling_ratio/mean": 0.9573653340339661, "sampling/importance_sampling_ratio/min": 0.5384606122970581, "sampling/sampling_logp_difference/max": 0.4561774730682373, "sampling/sampling_logp_difference/mean": 0.01902041584253311, "step": 349, "step_time": 37.28226282200194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.35526877641677856, "epoch": 0.7, "frac_reward_zero_std": 0.0, "grad_norm": 1.0215078592300415, "kl": 0.01806233450770378, "learning_rate": 3.7802617122457976e-06, "loss": 0.0752, "num_tokens": 1956854.0, "reward": 0.21375000476837158, "reward_std": 0.5120877027511597, "rewards/reward_func/mean": 0.21375000476837158, "rewards/reward_func/std": 0.4747913181781769, "sampling/importance_sampling_ratio/max": 1.565601110458374, "sampling/importance_sampling_ratio/mean": 1.0413784980773926, "sampling/importance_sampling_ratio/min": 0.45870745182037354, "sampling/sampling_logp_difference/max": 0.5673609972000122, "sampling/sampling_logp_difference/mean": 0.02417255938053131, "step": 350, "step_time": 40.4494745760021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.380689412355423, "epoch": 0.702, "frac_reward_zero_std": 0.0, "grad_norm": 0.9029824733734131, "kl": 0.018018556758761406, "learning_rate": 3.773300405821908e-06, "loss": -0.0726, "num_tokens": 1962725.0, "reward": 0.3137499988079071, "reward_std": 0.583433210849762, "rewards/reward_func/mean": 0.3137499988079071, "rewards/reward_func/std": 0.5663400888442993, "sampling/importance_sampling_ratio/max": 1.4209295511245728, "sampling/importance_sampling_ratio/mean": 0.8946368098258972, "sampling/importance_sampling_ratio/min": 0.4468167722225189, "sampling/sampling_logp_difference/max": 0.4564931392669678, "sampling/sampling_logp_difference/mean": 0.02643968164920807, "step": 351, "step_time": 44.57280629100569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 62.0, "completions/mean_terminated_length": 62.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.31801438331604004, "epoch": 0.704, "frac_reward_zero_std": 0.0, "grad_norm": 0.9474357962608337, "kl": 0.013338024728000164, "learning_rate": 3.766325743079277e-06, "loss": 0.1357, "num_tokens": 1967643.0, "reward": 0.20499999821186066, "reward_std": 0.32446253299713135, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.48314449191093445, "sampling/importance_sampling_ratio/max": 1.5313472747802734, "sampling/importance_sampling_ratio/mean": 1.1058592796325684, "sampling/importance_sampling_ratio/min": 0.44263550639152527, "sampling/sampling_logp_difference/max": 0.45990777015686035, "sampling/sampling_logp_difference/mean": 0.020315904170274734, "step": 352, "step_time": 32.50577549599984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 56.625, "completions/mean_terminated_length": 56.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.36904579401016235, "epoch": 0.706, "frac_reward_zero_std": 0.0, "grad_norm": 1.6299339532852173, "kl": 0.05941528081893921, "learning_rate": 3.7593377971788162e-06, "loss": -0.0727, "num_tokens": 1972573.0, "reward": 0.3474999964237213, "reward_std": 0.5575968027114868, "rewards/reward_func/mean": 0.3474999964237213, "rewards/reward_func/std": 0.5384302735328674, "sampling/importance_sampling_ratio/max": 2.091653823852539, "sampling/importance_sampling_ratio/mean": 1.1020057201385498, "sampling/importance_sampling_ratio/min": 0.5405357480049133, "sampling/sampling_logp_difference/max": 0.8576881885528564, "sampling/sampling_logp_difference/mean": 0.03374676778912544, "step": 353, "step_time": 30.564295716001652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3194846212863922, "epoch": 0.708, "frac_reward_zero_std": 0.0, "grad_norm": 0.9248061776161194, "kl": 0.012258417904376984, "learning_rate": 3.752336641420772e-06, "loss": -0.0662, "num_tokens": 1977516.0, "reward": 0.4700000286102295, "reward_std": 0.5688798427581787, "rewards/reward_func/mean": 0.4700000286102295, "rewards/reward_func/std": 0.5268504619598389, "sampling/importance_sampling_ratio/max": 1.3270989656448364, "sampling/importance_sampling_ratio/mean": 1.0121691226959229, "sampling/importance_sampling_ratio/min": 0.6218995451927185, "sampling/sampling_logp_difference/max": 0.2814149856567383, "sampling/sampling_logp_difference/mean": 0.0207376666367054, "step": 354, "step_time": 32.0068911069975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.38322630524635315, "epoch": 0.71, "frac_reward_zero_std": 0.0, "grad_norm": 1.0300030708312988, "kl": 0.030064791440963745, "learning_rate": 3.7453223492439544e-06, "loss": -0.1406, "num_tokens": 1983794.0, "reward": 0.19500000774860382, "reward_std": 0.5388761758804321, "rewards/reward_func/mean": 0.19500000774860382, "rewards/reward_func/std": 0.4991707503795624, "sampling/importance_sampling_ratio/max": 1.4179537296295166, "sampling/importance_sampling_ratio/mean": 0.8694342374801636, "sampling/importance_sampling_ratio/min": 0.34229519963264465, "sampling/sampling_logp_difference/max": 0.7407898902893066, "sampling/sampling_logp_difference/mean": 0.028531817719340324, "step": 355, "step_time": 37.6383990119939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.30376651883125305, "epoch": 0.712, "frac_reward_zero_std": 0.0, "grad_norm": 1.2527189254760742, "kl": 0.0377352349460125, "learning_rate": 3.7382949942249695e-06, "loss": 0.2867, "num_tokens": 1989118.0, "reward": 0.1837500035762787, "reward_std": 0.5111405253410339, "rewards/reward_func/mean": 0.1837500035762787, "rewards/reward_func/std": 0.4737672209739685, "sampling/importance_sampling_ratio/max": 2.962144136428833, "sampling/importance_sampling_ratio/mean": 1.4776464700698853, "sampling/importance_sampling_ratio/min": 0.5036495923995972, "sampling/sampling_logp_difference/max": 0.7229299545288086, "sampling/sampling_logp_difference/mean": 0.01985524222254753, "step": 356, "step_time": 36.960345953993965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 56.625, "completions/mean_terminated_length": 56.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.31242305040359497, "epoch": 0.714, "frac_reward_zero_std": 0.0, "grad_norm": 1.8390851020812988, "kl": 0.11288845539093018, "learning_rate": 3.731254650077446e-06, "loss": -0.0201, "num_tokens": 1994550.0, "reward": 0.3174999952316284, "reward_std": 0.5620428323745728, "rewards/reward_func/mean": 0.3174999952316284, "rewards/reward_func/std": 0.5362235903739929, "sampling/importance_sampling_ratio/max": 2.0678493976593018, "sampling/importance_sampling_ratio/mean": 1.0567212104797363, "sampling/importance_sampling_ratio/min": 0.6045528650283813, "sampling/sampling_logp_difference/max": 0.8093851804733276, "sampling/sampling_logp_difference/mean": 0.02113654837012291, "step": 357, "step_time": 34.96656133400393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 48.375, "completions/mean_terminated_length": 48.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3537905514240265, "epoch": 0.716, "frac_reward_zero_std": 0.0, "grad_norm": 1.3669836521148682, "kl": 0.07361885160207748, "learning_rate": 3.724201390651263e-06, "loss": 0.1908, "num_tokens": 2000048.0, "reward": 0.1937500238418579, "reward_std": 0.314365953207016, "rewards/reward_func/mean": 0.1937500238418579, "rewards/reward_func/std": 0.49517494440078735, "sampling/importance_sampling_ratio/max": 1.4376270771026611, "sampling/importance_sampling_ratio/mean": 0.7732110023498535, "sampling/importance_sampling_ratio/min": 0.14205169677734375, "sampling/sampling_logp_difference/max": 1.1421258449554443, "sampling/sampling_logp_difference/mean": 0.029162388294935226, "step": 358, "step_time": 42.63583009800641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.38291653990745544, "epoch": 0.718, "frac_reward_zero_std": 0.0, "grad_norm": 1.0044505596160889, "kl": 0.022494001314044, "learning_rate": 3.7171352899317743e-06, "loss": -0.1117, "num_tokens": 2006426.0, "reward": -0.05999999865889549, "reward_std": 0.048807330429553986, "rewards/reward_func/mean": -0.05999999865889549, "rewards/reward_func/std": 0.05014265328645706, "sampling/importance_sampling_ratio/max": 1.710400104522705, "sampling/importance_sampling_ratio/mean": 1.183423638343811, "sampling/importance_sampling_ratio/min": 0.698122501373291, "sampling/sampling_logp_difference/max": 0.641355037689209, "sampling/sampling_logp_difference/mean": 0.02460392192006111, "step": 359, "step_time": 50.55940616400039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3306000530719757, "epoch": 0.72, "frac_reward_zero_std": 0.0, "grad_norm": 0.998912513256073, "kl": 0.020860590040683746, "learning_rate": 3.710056422039033e-06, "loss": -0.1409, "num_tokens": 2012069.0, "reward": 0.22500000894069672, "reward_std": 0.5172683000564575, "rewards/reward_func/mean": 0.22500000894069672, "rewards/reward_func/std": 0.4788975417613983, "sampling/importance_sampling_ratio/max": 1.3302267789840698, "sampling/importance_sampling_ratio/mean": 0.8430871963500977, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8340984582901001, "sampling/sampling_logp_difference/mean": 0.02408684231340885, "step": 360, "step_time": 27.43443747400306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.34160998463630676, "epoch": 0.722, "frac_reward_zero_std": 0.0, "grad_norm": 0.9686026573181152, "kl": 0.014279250055551529, "learning_rate": 3.702964861227013e-06, "loss": 0.0154, "num_tokens": 2017315.0, "reward": 0.09125000238418579, "reward_std": 0.2781270742416382, "rewards/reward_func/mean": 0.09125000238418579, "rewards/reward_func/std": 0.3651394248008728, "sampling/importance_sampling_ratio/max": 1.65193510055542, "sampling/importance_sampling_ratio/mean": 0.9402889013290405, "sampling/importance_sampling_ratio/min": 0.45469439029693604, "sampling/sampling_logp_difference/max": 0.31305837631225586, "sampling/sampling_logp_difference/mean": 0.023152697831392288, "step": 361, "step_time": 39.433918608003296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.36318618059158325, "epoch": 0.724, "frac_reward_zero_std": 0.0, "grad_norm": 0.7241699695587158, "kl": 0.017520280554890633, "learning_rate": 3.695860681882832e-06, "loss": 0.0466, "num_tokens": 2023159.0, "reward": 0.07249999791383743, "reward_std": 0.2886144816875458, "rewards/reward_func/mean": 0.07249999791383743, "rewards/reward_func/std": 0.38074177503585815, "sampling/importance_sampling_ratio/max": 1.5708813667297363, "sampling/importance_sampling_ratio/mean": 0.8219673037528992, "sampling/importance_sampling_ratio/min": 0.18283437192440033, "sampling/sampling_logp_difference/max": 0.6703026294708252, "sampling/sampling_logp_difference/mean": 0.022826572880148888, "step": 362, "step_time": 46.71442539000418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 50.625, "completions/mean_terminated_length": 50.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3641355633735657, "epoch": 0.726, "frac_reward_zero_std": 0.0, "grad_norm": 0.7925549149513245, "kl": 0.019471367821097374, "learning_rate": 3.6887439585259693e-06, "loss": 0.1953, "num_tokens": 2028306.0, "reward": 0.2175000011920929, "reward_std": 0.512782871723175, "rewards/reward_func/mean": 0.2175000011920929, "rewards/reward_func/std": 0.4749361276626587, "sampling/importance_sampling_ratio/max": 1.7815968990325928, "sampling/importance_sampling_ratio/mean": 1.0899240970611572, "sampling/importance_sampling_ratio/min": 0.39993321895599365, "sampling/sampling_logp_difference/max": 0.7556244134902954, "sampling/sampling_logp_difference/mean": 0.02318955399096012, "step": 363, "step_time": 33.87657535800827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3745487332344055, "epoch": 0.728, "frac_reward_zero_std": 0.0, "grad_norm": 0.9719190001487732, "kl": 0.01766936480998993, "learning_rate": 3.6816147658074864e-06, "loss": -0.0714, "num_tokens": 2033664.0, "reward": 0.22875000536441803, "reward_std": 0.3135777711868286, "rewards/reward_func/mean": 0.22875000536441803, "rewards/reward_func/std": 0.47726717591285706, "sampling/importance_sampling_ratio/max": 1.5119142532348633, "sampling/importance_sampling_ratio/mean": 0.9923655986785889, "sampling/importance_sampling_ratio/min": 0.5845767259597778, "sampling/sampling_logp_difference/max": 0.570970892906189, "sampling/sampling_logp_difference/mean": 0.024401342496275902, "step": 364, "step_time": 39.12082826299593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 60.25, "completions/mean_terminated_length": 60.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.386588454246521, "epoch": 0.73, "frac_reward_zero_std": 0.0, "grad_norm": 1.0872100591659546, "kl": 0.018672293052077293, "learning_rate": 3.6744731785092396e-06, "loss": -0.0248, "num_tokens": 2038952.0, "reward": 0.20375001430511475, "reward_std": 0.32467734813690186, "rewards/reward_func/mean": 0.20375001430511475, "rewards/reward_func/std": 0.4832313358783722, "sampling/importance_sampling_ratio/max": 1.5475658178329468, "sampling/importance_sampling_ratio/mean": 1.1252450942993164, "sampling/importance_sampling_ratio/min": 0.6070107817649841, "sampling/sampling_logp_difference/max": 0.6747951507568359, "sampling/sampling_logp_difference/mean": 0.024049527943134308, "step": 365, "step_time": 36.08769363799365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.32434672117233276, "epoch": 0.732, "frac_reward_zero_std": 0.0, "grad_norm": 1.1058688163757324, "kl": 0.022890109568834305, "learning_rate": 3.6673192715431016e-06, "loss": -0.1808, "num_tokens": 2044744.0, "reward": 0.3387500047683716, "reward_std": 0.5674425363540649, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.5432820916175842, "sampling/importance_sampling_ratio/max": 1.6937315464019775, "sampling/importance_sampling_ratio/mean": 0.853196382522583, "sampling/importance_sampling_ratio/min": 0.4900035858154297, "sampling/sampling_logp_difference/max": 0.5789165496826172, "sampling/sampling_logp_difference/mean": 0.02218322455883026, "step": 366, "step_time": 34.316845288994955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.3331367075443268, "epoch": 0.734, "frac_reward_zero_std": 0.0, "grad_norm": 0.9906635880470276, "kl": 0.01753188669681549, "learning_rate": 3.6601531199501715e-06, "loss": -0.0365, "num_tokens": 2050626.0, "reward": 0.2150000035762787, "reward_std": 0.3200206756591797, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.4824047088623047, "sampling/importance_sampling_ratio/max": 1.387757658958435, "sampling/importance_sampling_ratio/mean": 1.0535778999328613, "sampling/importance_sampling_ratio/min": 0.5726510882377625, "sampling/sampling_logp_difference/max": 0.5058789253234863, "sampling/sampling_logp_difference/mean": 0.018796022981405258, "step": 367, "step_time": 43.17819735400553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.38309377431869507, "epoch": 0.736, "frac_reward_zero_std": 0.0, "grad_norm": 1.1113532781600952, "kl": 0.020980726927518845, "learning_rate": 3.652974798899988e-06, "loss": -0.0142, "num_tokens": 2056725.0, "reward": 0.09749999642372131, "reward_std": 0.26799049973487854, "rewards/reward_func/mean": 0.09749999642372131, "rewards/reward_func/std": 0.35784077644348145, "sampling/importance_sampling_ratio/max": 2.1293060779571533, "sampling/importance_sampling_ratio/mean": 1.0104879140853882, "sampling/importance_sampling_ratio/min": 0.22460012137889862, "sampling/sampling_logp_difference/max": 0.6540035009384155, "sampling/sampling_logp_difference/mean": 0.028138641268014908, "step": 368, "step_time": 43.222731264002505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.32224565744400024, "epoch": 0.738, "frac_reward_zero_std": 0.0, "grad_norm": 1.3402920961380005, "kl": 0.012260911986231804, "learning_rate": 3.645784383689742e-06, "loss": 0.1681, "num_tokens": 2061716.0, "reward": 0.32625001668930054, "reward_std": 0.5579476356506348, "rewards/reward_func/mean": 0.32625001668930054, "rewards/reward_func/std": 0.537532389163971, "sampling/importance_sampling_ratio/max": 1.767604947090149, "sampling/importance_sampling_ratio/mean": 0.9820557236671448, "sampling/importance_sampling_ratio/min": 0.3409233093261719, "sampling/sampling_logp_difference/max": 0.38132715225219727, "sampling/sampling_logp_difference/mean": 0.024098357185721397, "step": 369, "step_time": 35.26750406099018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 62.875, "completions/mean_terminated_length": 62.875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.3785794973373413, "epoch": 0.74, "frac_reward_zero_std": 0.0, "grad_norm": 1.0662072896957397, "kl": 0.037193797528743744, "learning_rate": 3.6385819497434877e-06, "loss": -0.1421, "num_tokens": 2066831.0, "reward": 0.45249998569488525, "reward_std": 0.6269056797027588, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.5811749696731567, "sampling/importance_sampling_ratio/max": 1.6641879081726074, "sampling/importance_sampling_ratio/mean": 0.9713910222053528, "sampling/importance_sampling_ratio/min": 0.28194481134414673, "sampling/sampling_logp_difference/max": 0.9472329616546631, "sampling/sampling_logp_difference/mean": 0.025759253650903702, "step": 370, "step_time": 25.88203328898817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3221362233161926, "epoch": 0.742, "frac_reward_zero_std": 0.0, "grad_norm": 0.9221799969673157, "kl": 0.030139964073896408, "learning_rate": 3.631367572611348e-06, "loss": 0.0918, "num_tokens": 2073296.0, "reward": 0.08624999970197678, "reward_std": 0.28050848841667175, "rewards/reward_func/mean": 0.08624999970197678, "rewards/reward_func/std": 0.3702484667301178, "sampling/importance_sampling_ratio/max": 1.8722600936889648, "sampling/importance_sampling_ratio/mean": 0.9809074401855469, "sampling/importance_sampling_ratio/min": 0.40770983695983887, "sampling/sampling_logp_difference/max": 0.6183086037635803, "sampling/sampling_logp_difference/mean": 0.02641558088362217, "step": 371, "step_time": 48.31357828999171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.35156160593032837, "epoch": 0.744, "frac_reward_zero_std": 0.0, "grad_norm": 0.759164035320282, "kl": 0.023832060396671295, "learning_rate": 3.6241413279687256e-06, "loss": -0.0865, "num_tokens": 2079272.0, "reward": 0.3449999988079071, "reward_std": 0.5492082238197327, "rewards/reward_func/mean": 0.3449999988079071, "rewards/reward_func/std": 0.5293661952018738, "sampling/importance_sampling_ratio/max": 1.4884122610092163, "sampling/importance_sampling_ratio/mean": 0.8230412006378174, "sampling/importance_sampling_ratio/min": 0.43746134638786316, "sampling/sampling_logp_difference/max": 0.5400235652923584, "sampling/sampling_logp_difference/mean": 0.02419007569551468, "step": 372, "step_time": 45.951615555008175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.31760358810424805, "epoch": 0.746, "frac_reward_zero_std": 0.0, "grad_norm": 0.7639608979225159, "kl": 0.07498195022344589, "learning_rate": 3.616903291615506e-06, "loss": -0.0004, "num_tokens": 2084366.0, "reward": 0.3387500047683716, "reward_std": 0.5394263863563538, "rewards/reward_func/mean": 0.3387500047683716, "rewards/reward_func/std": 0.5155146718025208, "sampling/importance_sampling_ratio/max": 1.6529487371444702, "sampling/importance_sampling_ratio/mean": 0.9317770004272461, "sampling/importance_sampling_ratio/min": 0.2725166082382202, "sampling/sampling_logp_difference/max": 1.0274620056152344, "sampling/sampling_logp_difference/mean": 0.02573678269982338, "step": 373, "step_time": 26.18881739700737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 59.0, "completions/mean_terminated_length": 59.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.29528483748435974, "epoch": 0.748, "frac_reward_zero_std": 0.0, "grad_norm": 0.6288770437240601, "kl": 0.018295394256711006, "learning_rate": 3.609653539475268e-06, "loss": -0.0202, "num_tokens": 2090117.0, "reward": 0.3174999952316284, "reward_std": 0.3078264892101288, "rewards/reward_func/mean": 0.3174999952316284, "rewards/reward_func/std": 0.5612931847572327, "sampling/importance_sampling_ratio/max": 1.0934966802597046, "sampling/importance_sampling_ratio/mean": 0.6602436304092407, "sampling/importance_sampling_ratio/min": 0.35864314436912537, "sampling/sampling_logp_difference/max": 0.6380555629730225, "sampling/sampling_logp_difference/mean": 0.023511648178100586, "step": 374, "step_time": 49.95311562300776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.33941906690597534, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.8127002120018005, "kl": 0.028118256479501724, "learning_rate": 3.6023921475944795e-06, "loss": -0.1582, "num_tokens": 2095578.0, "reward": 0.2199999988079071, "reward_std": 0.29089051485061646, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.4539981484413147, "sampling/importance_sampling_ratio/max": 1.3880345821380615, "sampling/importance_sampling_ratio/mean": 0.8715201020240784, "sampling/importance_sampling_ratio/min": 0.11366145312786102, "sampling/sampling_logp_difference/max": 0.8331606388092041, "sampling/sampling_logp_difference/mean": 0.023476149886846542, "step": 375, "step_time": 39.132226767003885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3131125569343567, "epoch": 0.752, "frac_reward_zero_std": 0.0, "grad_norm": 1.068045973777771, "kl": 0.0240363497287035, "learning_rate": 3.5951191921417063e-06, "loss": -0.0651, "num_tokens": 2100937.0, "reward": 0.1875, "reward_std": 0.5449391603469849, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.5059291124343872, "sampling/importance_sampling_ratio/max": 1.3649033308029175, "sampling/importance_sampling_ratio/mean": 0.7642968893051147, "sampling/importance_sampling_ratio/min": 0.24381500482559204, "sampling/sampling_logp_difference/max": 0.5894099473953247, "sampling/sampling_logp_difference/mean": 0.025731489062309265, "step": 376, "step_time": 36.06358368200017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 55.625, "completions/mean_terminated_length": 55.625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.36141708493232727, "epoch": 0.754, "frac_reward_zero_std": 0.0, "grad_norm": 1.1907614469528198, "kl": 0.018009314313530922, "learning_rate": 3.5878347494068083e-06, "loss": 0.3203, "num_tokens": 2106897.0, "reward": 0.45250001549720764, "reward_std": 0.6097580790519714, "rewards/reward_func/mean": 0.45250001549720764, "rewards/reward_func/std": 0.5646427273750305, "sampling/importance_sampling_ratio/max": 1.9138866662979126, "sampling/importance_sampling_ratio/mean": 1.0367978811264038, "sampling/importance_sampling_ratio/min": 0.4980725646018982, "sampling/sampling_logp_difference/max": 0.7002124786376953, "sampling/sampling_logp_difference/mean": 0.022084344178438187, "step": 377, "step_time": 42.03821306199825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.31293684244155884, "epoch": 0.756, "frac_reward_zero_std": 0.0, "grad_norm": 0.9326619505882263, "kl": 0.024975117295980453, "learning_rate": 3.580538895800144e-06, "loss": -0.1336, "num_tokens": 2112259.0, "reward": 0.17624999582767487, "reward_std": 0.3398403823375702, "rewards/reward_func/mean": 0.17624999582767487, "rewards/reward_func/std": 0.5064142346382141, "sampling/importance_sampling_ratio/max": 1.1192891597747803, "sampling/importance_sampling_ratio/mean": 0.757171630859375, "sampling/importance_sampling_ratio/min": 0.45731785893440247, "sampling/sampling_logp_difference/max": 0.5144007205963135, "sampling/sampling_logp_difference/mean": 0.025648921728134155, "step": 378, "step_time": 37.88494844498928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3656163811683655, "epoch": 0.758, "frac_reward_zero_std": 0.0, "grad_norm": 0.8823372721672058, "kl": 0.02256789244711399, "learning_rate": 3.573231707851765e-06, "loss": 0.2197, "num_tokens": 2118172.0, "reward": 0.4424999952316284, "reward_std": 0.6283525824546814, "rewards/reward_func/mean": 0.4424999952316284, "rewards/reward_func/std": 0.5822800397872925, "sampling/importance_sampling_ratio/max": 1.8701510429382324, "sampling/importance_sampling_ratio/mean": 0.8111326694488525, "sampling/importance_sampling_ratio/min": 0.30452919006347656, "sampling/sampling_logp_difference/max": 0.8101745843887329, "sampling/sampling_logp_difference/mean": 0.026399342343211174, "step": 379, "step_time": 43.92895183300425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3246749937534332, "epoch": 0.76, "frac_reward_zero_std": 0.0, "grad_norm": 0.9643809795379639, "kl": 0.020442264154553413, "learning_rate": 3.5659132622106152e-06, "loss": -0.1417, "num_tokens": 2123917.0, "reward": 0.35374999046325684, "reward_std": 0.5508840680122375, "rewards/reward_func/mean": 0.35374999046325684, "rewards/reward_func/std": 0.5303890109062195, "sampling/importance_sampling_ratio/max": 1.7333760261535645, "sampling/importance_sampling_ratio/mean": 0.9654305577278137, "sampling/importance_sampling_ratio/min": 0.37011101841926575, "sampling/sampling_logp_difference/max": 0.5683209896087646, "sampling/sampling_logp_difference/mean": 0.02381267584860325, "step": 380, "step_time": 32.53779908501019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 56.375, "completions/mean_terminated_length": 56.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.33849892020225525, "epoch": 0.762, "frac_reward_zero_std": 0.0, "grad_norm": 0.9345517754554749, "kl": 0.01798607036471367, "learning_rate": 3.5585836356437266e-06, "loss": 0.1689, "num_tokens": 2129155.0, "reward": 0.4699999988079071, "reward_std": 0.5834348201751709, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.5406873822212219, "sampling/importance_sampling_ratio/max": 1.434735894203186, "sampling/importance_sampling_ratio/mean": 0.9351725578308105, "sampling/importance_sampling_ratio/min": 0.32867270708084106, "sampling/sampling_logp_difference/max": 0.8937342166900635, "sampling/sampling_logp_difference/mean": 0.023814164102077484, "step": 381, "step_time": 35.22910028499609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.35224515199661255, "epoch": 0.764, "frac_reward_zero_std": 0.0, "grad_norm": 0.69114750623703, "kl": 0.020070038735866547, "learning_rate": 3.551242905035412e-06, "loss": 0.0692, "num_tokens": 2135558.0, "reward": 0.08374999463558197, "reward_std": 0.2716542184352875, "rewards/reward_func/mean": 0.08374999463558197, "rewards/reward_func/std": 0.3633549213409424, "sampling/importance_sampling_ratio/max": 1.3249363899230957, "sampling/importance_sampling_ratio/mean": 0.8756052255630493, "sampling/importance_sampling_ratio/min": 0.5583480596542358, "sampling/sampling_logp_difference/max": 0.3986530303955078, "sampling/sampling_logp_difference/mean": 0.02286503091454506, "step": 382, "step_time": 47.23492028898909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3356274664402008, "epoch": 0.766, "frac_reward_zero_std": 0.0, "grad_norm": 0.9842677116394043, "kl": 0.03034183755517006, "learning_rate": 3.5438911473864633e-06, "loss": 0.1462, "num_tokens": 2141777.0, "reward": 0.1912500113248825, "reward_std": 0.30746302008628845, "rewards/reward_func/mean": 0.1912500113248825, "rewards/reward_func/std": 0.47405359148979187, "sampling/importance_sampling_ratio/max": 1.4171618223190308, "sampling/importance_sampling_ratio/mean": 0.9373311400413513, "sampling/importance_sampling_ratio/min": 0.4872235059738159, "sampling/sampling_logp_difference/max": 0.6592245101928711, "sampling/sampling_logp_difference/mean": 0.024339091032743454, "step": 383, "step_time": 52.436622552995686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3243747651576996, "epoch": 0.768, "frac_reward_zero_std": 0.0, "grad_norm": 0.8072435259819031, "kl": 0.014891182072460651, "learning_rate": 3.5365284398133404e-06, "loss": -0.169, "num_tokens": 2146987.0, "reward": 0.3149999976158142, "reward_std": 0.543981671333313, "rewards/reward_func/mean": 0.3149999976158142, "rewards/reward_func/std": 0.5302021503448486, "sampling/importance_sampling_ratio/max": 1.3961093425750732, "sampling/importance_sampling_ratio/mean": 0.6877409219741821, "sampling/importance_sampling_ratio/min": 0.28648021817207336, "sampling/sampling_logp_difference/max": 0.7951034903526306, "sampling/sampling_logp_difference/mean": 0.02731979638338089, "step": 384, "step_time": 40.187907672996516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3296257555484772, "epoch": 0.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.6858446598052979, "kl": 0.016891244798898697, "learning_rate": 3.52915485954736e-06, "loss": 0.0741, "num_tokens": 2152302.0, "reward": 0.04874999076128006, "reward_std": 0.3078903555870056, "rewards/reward_func/mean": 0.04874999076128006, "rewards/reward_func/std": 0.389154314994812, "sampling/importance_sampling_ratio/max": 1.2545758485794067, "sampling/importance_sampling_ratio/mean": 0.8542709946632385, "sampling/importance_sampling_ratio/min": 0.4029653072357178, "sampling/sampling_logp_difference/max": 0.4015458822250366, "sampling/sampling_logp_difference/mean": 0.023256313055753708, "step": 385, "step_time": 36.99427409100463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.35816895961761475, "epoch": 0.772, "frac_reward_zero_std": 0.0, "grad_norm": 1.0791374444961548, "kl": 0.06013531610369682, "learning_rate": 3.521770483933891e-06, "loss": 0.1586, "num_tokens": 2157697.0, "reward": 0.0625, "reward_std": 0.28779086470603943, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.3733152747154236, "sampling/importance_sampling_ratio/max": 1.4287514686584473, "sampling/importance_sampling_ratio/mean": 0.8348481059074402, "sampling/importance_sampling_ratio/min": 0.22553585469722748, "sampling/sampling_logp_difference/max": 0.8671650886535645, "sampling/sampling_logp_difference/mean": 0.02565944194793701, "step": 386, "step_time": 41.950704916001996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.33600252866744995, "epoch": 0.774, "frac_reward_zero_std": 0.0, "grad_norm": 1.198814034461975, "kl": 0.012472853064537048, "learning_rate": 3.514375390431539e-06, "loss": 0.2883, "num_tokens": 2164144.0, "reward": 0.19749999046325684, "reward_std": 0.3289427161216736, "rewards/reward_func/mean": 0.19749999046325684, "rewards/reward_func/std": 0.4798734784126282, "sampling/importance_sampling_ratio/max": 1.8541417121887207, "sampling/importance_sampling_ratio/mean": 0.9226713180541992, "sampling/importance_sampling_ratio/min": 0.33913132548332214, "sampling/sampling_logp_difference/max": 0.598806619644165, "sampling/sampling_logp_difference/mean": 0.021760722622275352, "step": 387, "step_time": 44.073393629994825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.4183434247970581, "epoch": 0.776, "frac_reward_zero_std": 0.0, "grad_norm": 1.299253225326538, "kl": 0.02365995943546295, "learning_rate": 3.5069696566113347e-06, "loss": -0.0277, "num_tokens": 2169925.0, "reward": 0.3137500286102295, "reward_std": 0.5675816535949707, "rewards/reward_func/mean": 0.3137500286102295, "rewards/reward_func/std": 0.5541515946388245, "sampling/importance_sampling_ratio/max": 1.9320272207260132, "sampling/importance_sampling_ratio/mean": 0.9678875803947449, "sampling/importance_sampling_ratio/min": 0.4237723648548126, "sampling/sampling_logp_difference/max": 0.9530621767044067, "sampling/sampling_logp_difference/mean": 0.028726529330015182, "step": 388, "step_time": 36.650615365986596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.4050993025302887, "epoch": 0.778, "frac_reward_zero_std": 0.0, "grad_norm": 1.352435827255249, "kl": 0.0169648639857769, "learning_rate": 3.499553360155923e-06, "loss": -0.2526, "num_tokens": 2176095.0, "reward": 0.06624999642372131, "reward_std": 0.2910441756248474, "rewards/reward_func/mean": 0.06624999642372131, "rewards/reward_func/std": 0.37965914607048035, "sampling/importance_sampling_ratio/max": 1.629567265510559, "sampling/importance_sampling_ratio/mean": 0.9991623163223267, "sampling/importance_sampling_ratio/min": 0.5820651650428772, "sampling/sampling_logp_difference/max": 0.25946611166000366, "sampling/sampling_logp_difference/mean": 0.021378565579652786, "step": 389, "step_time": 52.00782443599019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.35824936628341675, "epoch": 0.78, "frac_reward_zero_std": 0.0, "grad_norm": 0.9068090319633484, "kl": 0.027212774381041527, "learning_rate": 3.4921265788587432e-06, "loss": -0.0936, "num_tokens": 2181657.0, "reward": 0.059999994933605194, "reward_std": 0.27817416191101074, "rewards/reward_func/mean": 0.059999994933605194, "rewards/reward_func/std": 0.35496482253074646, "sampling/importance_sampling_ratio/max": 2.0222272872924805, "sampling/importance_sampling_ratio/mean": 0.9855067729949951, "sampling/importance_sampling_ratio/min": 0.47639888525009155, "sampling/sampling_logp_difference/max": 0.9331116676330566, "sampling/sampling_logp_difference/mean": 0.027233093976974487, "step": 390, "step_time": 42.25256816399633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 50.375, "completions/mean_terminated_length": 50.375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.32872098684310913, "epoch": 0.782, "frac_reward_zero_std": 0.0, "grad_norm": 0.7149050235748291, "kl": 0.044429853558540344, "learning_rate": 3.484689390623218e-06, "loss": 0.1788, "num_tokens": 2187851.0, "reward": 0.0637499988079071, "reward_std": 0.2887832820415497, "rewards/reward_func/mean": 0.0637499988079071, "rewards/reward_func/std": 0.38029828667640686, "sampling/importance_sampling_ratio/max": 1.5726252794265747, "sampling/importance_sampling_ratio/mean": 0.7780969738960266, "sampling/importance_sampling_ratio/min": 0.21605902910232544, "sampling/sampling_logp_difference/max": 0.9326303005218506, "sampling/sampling_logp_difference/mean": 0.02885178104043007, "step": 391, "step_time": 44.16586426900176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.353057861328125, "epoch": 0.784, "frac_reward_zero_std": 0.0, "grad_norm": 0.6402285695075989, "kl": 0.047673989087343216, "learning_rate": 3.4772418734619325e-06, "loss": -0.1065, "num_tokens": 2193337.0, "reward": 0.5874999761581421, "reward_std": 0.5623499155044556, "rewards/reward_func/mean": 0.5874999761581421, "rewards/reward_func/std": 0.5455207228660583, "sampling/importance_sampling_ratio/max": 1.056589126586914, "sampling/importance_sampling_ratio/mean": 0.6921772360801697, "sampling/importance_sampling_ratio/min": 0.2916169762611389, "sampling/sampling_logp_difference/max": 1.1112589836120605, "sampling/sampling_logp_difference/mean": 0.024330832064151764, "step": 392, "step_time": 37.52057623099245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.35051673650741577, "epoch": 0.786, "frac_reward_zero_std": 0.0, "grad_norm": 0.877177894115448, "kl": 0.013033976778388023, "learning_rate": 3.4697841054958163e-06, "loss": 0.1362, "num_tokens": 2199559.0, "reward": 0.1837500035762787, "reward_std": 0.5407979488372803, "rewards/reward_func/mean": 0.1837500035762787, "rewards/reward_func/std": 0.5013107657432556, "sampling/importance_sampling_ratio/max": 1.3691225051879883, "sampling/importance_sampling_ratio/mean": 0.940036416053772, "sampling/importance_sampling_ratio/min": 0.3252532482147217, "sampling/sampling_logp_difference/max": 0.3541145324707031, "sampling/sampling_logp_difference/mean": 0.023762091994285583, "step": 393, "step_time": 45.31983452399436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.2770605981349945, "epoch": 0.788, "frac_reward_zero_std": 0.0, "grad_norm": 0.6571247577667236, "kl": 0.027253959327936172, "learning_rate": 3.4623161649533284e-06, "loss": -0.0846, "num_tokens": 2205060.0, "reward": -0.0625, "reward_std": 0.05641929805278778, "rewards/reward_func/mean": -0.0625, "rewards/reward_func/std": 0.05994044616818428, "sampling/importance_sampling_ratio/max": 1.2533941268920898, "sampling/importance_sampling_ratio/mean": 0.7182776927947998, "sampling/importance_sampling_ratio/min": 0.45736369490623474, "sampling/sampling_logp_difference/max": 0.8105928897857666, "sampling/sampling_logp_difference/mean": 0.0246109776198864, "step": 394, "step_time": 46.803369777990156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.32860368490219116, "epoch": 0.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.807575523853302, "kl": 0.039417944848537445, "learning_rate": 3.4548381301696298e-06, "loss": -0.1621, "num_tokens": 2210663.0, "reward": 0.09875000268220901, "reward_std": 0.26192745566368103, "rewards/reward_func/mean": 0.09875000268220901, "rewards/reward_func/std": 0.35385382175445557, "sampling/importance_sampling_ratio/max": 1.6175183057785034, "sampling/importance_sampling_ratio/mean": 0.8804988861083984, "sampling/importance_sampling_ratio/min": 0.41070258617401123, "sampling/sampling_logp_difference/max": 0.7530922889709473, "sampling/sampling_logp_difference/mean": 0.020096953958272934, "step": 395, "step_time": 40.773527258003014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.31244707107543945, "epoch": 0.792, "frac_reward_zero_std": 0.0, "grad_norm": 1.046858310699463, "kl": 0.014741847291588783, "learning_rate": 3.4473500795857674e-06, "loss": 0.1284, "num_tokens": 2215955.0, "reward": 0.22750000655651093, "reward_std": 0.4991302490234375, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.46219196915626526, "sampling/importance_sampling_ratio/max": 2.1469762325286865, "sampling/importance_sampling_ratio/mean": 1.328494668006897, "sampling/importance_sampling_ratio/min": 0.5685848593711853, "sampling/sampling_logp_difference/max": 0.5777333974838257, "sampling/sampling_logp_difference/mean": 0.022995343431830406, "step": 396, "step_time": 37.15637919999426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3469958007335663, "epoch": 0.794, "frac_reward_zero_std": 0.0, "grad_norm": 1.2579119205474854, "kl": 0.045768074691295624, "learning_rate": 3.4398520917478478e-06, "loss": 0.1787, "num_tokens": 2221464.0, "reward": 0.3199999928474426, "reward_std": 0.29569682478904724, "rewards/reward_func/mean": 0.3199999928474426, "rewards/reward_func/std": 0.560943067073822, "sampling/importance_sampling_ratio/max": 1.4082008600234985, "sampling/importance_sampling_ratio/mean": 0.9037089347839355, "sampling/importance_sampling_ratio/min": 0.24859501421451569, "sampling/sampling_logp_difference/max": 1.1122647523880005, "sampling/sampling_logp_difference/mean": 0.028527939692139626, "step": 397, "step_time": 39.99640729100793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 55.375, "completions/mean_terminated_length": 55.375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3531041443347931, "epoch": 0.796, "frac_reward_zero_std": 0.0, "grad_norm": 1.0242723226547241, "kl": 0.017868993803858757, "learning_rate": 3.4323442453062173e-06, "loss": 0.2376, "num_tokens": 2226383.0, "reward": 0.5662500262260437, "reward_std": 0.28834110498428345, "rewards/reward_func/mean": 0.5662500262260437, "rewards/reward_func/std": 0.5615269541740417, "sampling/importance_sampling_ratio/max": 2.4575791358947754, "sampling/importance_sampling_ratio/mean": 1.1844841241836548, "sampling/importance_sampling_ratio/min": 0.8696222901344299, "sampling/sampling_logp_difference/max": 0.48058170080184937, "sampling/sampling_logp_difference/mean": 0.023350011557340622, "step": 398, "step_time": 32.65750913400552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.3498823940753937, "epoch": 0.798, "frac_reward_zero_std": 0.0, "grad_norm": 0.7218272686004639, "kl": 0.03050382062792778, "learning_rate": 3.4248266190146307e-06, "loss": -0.0799, "num_tokens": 2231912.0, "reward": 0.3075000047683716, "reward_std": 0.5727229714393616, "rewards/reward_func/mean": 0.3075000047683716, "rewards/reward_func/std": 0.5470113754272461, "sampling/importance_sampling_ratio/max": 1.0502070188522339, "sampling/importance_sampling_ratio/mean": 0.6887789964675903, "sampling/importance_sampling_ratio/min": 0.4914277195930481, "sampling/sampling_logp_difference/max": 0.7550356388092041, "sampling/sampling_logp_difference/mean": 0.027827613055706024, "step": 399, "step_time": 41.33596688800026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 56.125, "completions/mean_terminated_length": 56.125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.3193606436252594, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 1.2495148181915283, "kl": 0.017775503918528557, "learning_rate": 3.417299291729431e-06, "loss": -0.0324, "num_tokens": 2236982.0, "reward": 0.3462499976158142, "reward_std": 0.5585935711860657, "rewards/reward_func/mean": 0.3462499976158142, "rewards/reward_func/std": 0.5422160029411316, "sampling/importance_sampling_ratio/max": 2.1713757514953613, "sampling/importance_sampling_ratio/mean": 1.0706489086151123, "sampling/importance_sampling_ratio/min": 0.2907283306121826, "sampling/sampling_logp_difference/max": 1.8012995719909668, "sampling/sampling_logp_difference/mean": 0.035895854234695435, "step": 400, "step_time": 35.204487941999105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3636338710784912, "epoch": 0.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.8902538418769836, "kl": 0.013676811009645462, "learning_rate": 3.4097623424087196e-06, "loss": -0.1516, "num_tokens": 2242653.0, "reward": 0.3462499678134918, "reward_std": 0.26222485303878784, "rewards/reward_func/mean": 0.3462499678134918, "rewards/reward_func/std": 0.5304967164993286, "sampling/importance_sampling_ratio/max": 1.1426626443862915, "sampling/importance_sampling_ratio/mean": 0.8304384350776672, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0202627182006836, "sampling/sampling_logp_difference/mean": 0.029507692903280258, "step": 401, "step_time": 40.37125378000201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.36135992407798767, "epoch": 0.804, "frac_reward_zero_std": 0.0, "grad_norm": 0.7169837355613708, "kl": 0.02164129912853241, "learning_rate": 3.4022158501115283e-06, "loss": 0.0952, "num_tokens": 2248740.0, "reward": 0.08000000566244125, "reward_std": 0.2766728699207306, "rewards/reward_func/mean": 0.08000000566244125, "rewards/reward_func/std": 0.3709062337875366, "sampling/importance_sampling_ratio/max": 2.0062737464904785, "sampling/importance_sampling_ratio/mean": 0.9354610443115234, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5968524217605591, "sampling/sampling_logp_difference/mean": 0.02405213564634323, "step": 402, "step_time": 40.320551773998886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3496766686439514, "epoch": 0.806, "frac_reward_zero_std": 0.0, "grad_norm": 1.099068522453308, "kl": 0.031258516013622284, "learning_rate": 3.39465989399699e-06, "loss": -0.186, "num_tokens": 2253924.0, "reward": 0.3424999713897705, "reward_std": 0.5592527389526367, "rewards/reward_func/mean": 0.3424999713897705, "rewards/reward_func/std": 0.5341682434082031, "sampling/importance_sampling_ratio/max": 1.7832975387573242, "sampling/importance_sampling_ratio/mean": 0.9368232488632202, "sampling/importance_sampling_ratio/min": 0.2631537914276123, "sampling/sampling_logp_difference/max": 1.3949875831604004, "sampling/sampling_logp_difference/mean": 0.022440284490585327, "step": 403, "step_time": 31.128040015988518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.37068814039230347, "epoch": 0.808, "frac_reward_zero_std": 0.0, "grad_norm": 1.0627682209014893, "kl": 0.12361843883991241, "learning_rate": 3.3870945533235104e-06, "loss": 0.3472, "num_tokens": 2259059.0, "reward": 0.20624998211860657, "reward_std": 0.3201104700565338, "rewards/reward_func/mean": 0.20624998211860657, "rewards/reward_func/std": 0.47850772738456726, "sampling/importance_sampling_ratio/max": 1.4040093421936035, "sampling/importance_sampling_ratio/mean": 0.7943763732910156, "sampling/importance_sampling_ratio/min": 0.2126779854297638, "sampling/sampling_logp_difference/max": 1.2342114448547363, "sampling/sampling_logp_difference/mean": 0.030054152011871338, "step": 404, "step_time": 38.50842455399106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.32690513134002686, "epoch": 0.81, "frac_reward_zero_std": 0.0, "grad_norm": 1.05023193359375, "kl": 0.017625313252210617, "learning_rate": 3.3795199074479312e-06, "loss": 0.2463, "num_tokens": 2264094.0, "reward": 0.4337500035762787, "reward_std": 0.6317378282546997, "rewards/reward_func/mean": 0.4337500035762787, "rewards/reward_func/std": 0.5862212777137756, "sampling/importance_sampling_ratio/max": 1.7540080547332764, "sampling/importance_sampling_ratio/mean": 1.1875271797180176, "sampling/importance_sampling_ratio/min": 0.387142539024353, "sampling/sampling_logp_difference/max": 0.4821145534515381, "sampling/sampling_logp_difference/mean": 0.02176579087972641, "step": 405, "step_time": 30.347279070992954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.342538058757782, "epoch": 0.812, "frac_reward_zero_std": 0.0, "grad_norm": 1.2553136348724365, "kl": 0.024068983271718025, "learning_rate": 3.3719360358247054e-06, "loss": 0.0313, "num_tokens": 2269524.0, "reward": 0.5887500047683716, "reward_std": 0.2793101370334625, "rewards/reward_func/mean": 0.5887500047683716, "rewards/reward_func/std": 0.5110895037651062, "sampling/importance_sampling_ratio/max": 1.5902272462844849, "sampling/importance_sampling_ratio/mean": 0.9752755165100098, "sampling/importance_sampling_ratio/min": 0.368798166513443, "sampling/sampling_logp_difference/max": 0.5483064651489258, "sampling/sampling_logp_difference/mean": 0.026680167764425278, "step": 406, "step_time": 32.252441548000206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.29614022374153137, "epoch": 0.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.7736188769340515, "kl": 0.012905233539640903, "learning_rate": 3.3643430180050573e-06, "loss": 0.119, "num_tokens": 2275015.0, "reward": 0.22625000774860382, "reward_std": 0.5097081661224365, "rewards/reward_func/mean": 0.22625000774860382, "rewards/reward_func/std": 0.4719545245170593, "sampling/importance_sampling_ratio/max": 2.0418217182159424, "sampling/importance_sampling_ratio/mean": 1.0212173461914062, "sampling/importance_sampling_ratio/min": 0.137149840593338, "sampling/sampling_logp_difference/max": 0.8900790214538574, "sampling/sampling_logp_difference/mean": 0.023198578506708145, "step": 407, "step_time": 34.40931955000269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.31210464239120483, "epoch": 0.816, "frac_reward_zero_std": 0.0, "grad_norm": 1.0732399225234985, "kl": 0.04985180124640465, "learning_rate": 3.3567409336361502e-06, "loss": 0.4186, "num_tokens": 2280251.0, "reward": 0.36250001192092896, "reward_std": 0.5389477610588074, "rewards/reward_func/mean": 0.36250001192092896, "rewards/reward_func/std": 0.5198007822036743, "sampling/importance_sampling_ratio/max": 2.857747793197632, "sampling/importance_sampling_ratio/mean": 1.1543896198272705, "sampling/importance_sampling_ratio/min": 0.2027559131383896, "sampling/sampling_logp_difference/max": 0.6502933502197266, "sampling/sampling_logp_difference/mean": 0.025806117802858353, "step": 408, "step_time": 26.751435946003767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.36082959175109863, "epoch": 0.818, "frac_reward_zero_std": 0.0, "grad_norm": 1.7728215456008911, "kl": 0.017035100609064102, "learning_rate": 3.3491298624602514e-06, "loss": -0.0713, "num_tokens": 2285456.0, "reward": 0.33375000953674316, "reward_std": 0.26373493671417236, "rewards/reward_func/mean": 0.33375000953674316, "rewards/reward_func/std": 0.5254912972450256, "sampling/importance_sampling_ratio/max": 2.1468451023101807, "sampling/importance_sampling_ratio/mean": 1.1487035751342773, "sampling/importance_sampling_ratio/min": 0.7357592582702637, "sampling/sampling_logp_difference/max": 0.2946791648864746, "sampling/sampling_logp_difference/mean": 0.020242050290107727, "step": 409, "step_time": 42.09881355499965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.3041061758995056, "epoch": 0.82, "frac_reward_zero_std": 0.0, "grad_norm": 1.2047560214996338, "kl": 0.03163205087184906, "learning_rate": 3.3415098843138972e-06, "loss": 0.0197, "num_tokens": 2291363.0, "reward": 0.2212499976158142, "reward_std": 0.30923277139663696, "rewards/reward_func/mean": 0.2212499976158142, "rewards/reward_func/std": 0.46932896971702576, "sampling/importance_sampling_ratio/max": 1.5919907093048096, "sampling/importance_sampling_ratio/mean": 0.974951982498169, "sampling/importance_sampling_ratio/min": 0.3237793445587158, "sampling/sampling_logp_difference/max": 0.8042126893997192, "sampling/sampling_logp_difference/mean": 0.026545334607362747, "step": 410, "step_time": 36.71569881300093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.33315184712409973, "epoch": 0.822, "frac_reward_zero_std": 0.0, "grad_norm": 0.6845096349716187, "kl": 0.013183169066905975, "learning_rate": 3.333881079127052e-06, "loss": 0.0578, "num_tokens": 2297000.0, "reward": 0.21375000476837158, "reward_std": 0.5123411417007446, "rewards/reward_func/mean": 0.21375000476837158, "rewards/reward_func/std": 0.4748815596103668, "sampling/importance_sampling_ratio/max": 1.133974552154541, "sampling/importance_sampling_ratio/mean": 0.9385578632354736, "sampling/importance_sampling_ratio/min": 0.5827104449272156, "sampling/sampling_logp_difference/max": 0.30054330825805664, "sampling/sampling_logp_difference/mean": 0.02072637900710106, "step": 411, "step_time": 42.78445947699947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.35779449343681335, "epoch": 0.824, "frac_reward_zero_std": 0.0, "grad_norm": 0.8247691988945007, "kl": 0.012468487024307251, "learning_rate": 3.326243526922272e-06, "loss": 0.0478, "num_tokens": 2302481.0, "reward": 0.4762499928474426, "reward_std": 0.021619636565446854, "rewards/reward_func/mean": 0.4762499928474426, "rewards/reward_func/std": 0.5523698925971985, "sampling/importance_sampling_ratio/max": 1.451501488685608, "sampling/importance_sampling_ratio/mean": 0.9668034911155701, "sampling/importance_sampling_ratio/min": 0.5821980237960815, "sampling/sampling_logp_difference/max": 0.33460497856140137, "sampling/sampling_logp_difference/mean": 0.021230852231383324, "step": 412, "step_time": 41.77823870100838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.31886088848114014, "epoch": 0.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.7706137895584106, "kl": 0.01695754937827587, "learning_rate": 3.3185973078138665e-06, "loss": 0.2151, "num_tokens": 2308188.0, "reward": 0.19249999523162842, "reward_std": 0.519284188747406, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.4808846116065979, "sampling/importance_sampling_ratio/max": 1.734723687171936, "sampling/importance_sampling_ratio/mean": 1.0181288719177246, "sampling/importance_sampling_ratio/min": 0.5788209438323975, "sampling/sampling_logp_difference/max": 0.39677077531814575, "sampling/sampling_logp_difference/mean": 0.022095143795013428, "step": 413, "step_time": 39.981942358994274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.38825303316116333, "epoch": 0.828, "frac_reward_zero_std": 0.0, "grad_norm": 1.2646406888961792, "kl": 0.011753249913454056, "learning_rate": 3.3109425020070564e-06, "loss": -0.0857, "num_tokens": 2313426.0, "reward": 0.3412500023841858, "reward_std": 0.523719072341919, "rewards/reward_func/mean": 0.3412500023841858, "rewards/reward_func/std": 0.5026625990867615, "sampling/importance_sampling_ratio/max": 1.6973400115966797, "sampling/importance_sampling_ratio/mean": 1.1444008350372314, "sampling/importance_sampling_ratio/min": 0.6853067874908447, "sampling/sampling_logp_difference/max": 0.35372257232666016, "sampling/sampling_logp_difference/mean": 0.02703225240111351, "step": 414, "step_time": 34.05183301899524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.31559109687805176, "epoch": 0.83, "frac_reward_zero_std": 0.0, "grad_norm": 1.001054048538208, "kl": 0.025500796735286713, "learning_rate": 3.3032791897971313e-06, "loss": -0.0043, "num_tokens": 2318855.0, "reward": 0.4137499928474426, "reward_std": 0.5379934906959534, "rewards/reward_func/mean": 0.4137499928474426, "rewards/reward_func/std": 0.5837303400039673, "sampling/importance_sampling_ratio/max": 1.4571188688278198, "sampling/importance_sampling_ratio/mean": 0.7654373645782471, "sampling/importance_sampling_ratio/min": 0.521602988243103, "sampling/sampling_logp_difference/max": 0.4150291681289673, "sampling/sampling_logp_difference/mean": 0.024981288239359856, "step": 415, "step_time": 38.211065392009914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 57.625, "completions/mean_terminated_length": 57.625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3610576391220093, "epoch": 0.832, "frac_reward_zero_std": 0.0, "grad_norm": 1.0423651933670044, "kl": 0.014878635294735432, "learning_rate": 3.2956074515686105e-06, "loss": 0.1218, "num_tokens": 2324058.0, "reward": 0.1875, "reward_std": 0.5408138036727905, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.5013624429702759, "sampling/importance_sampling_ratio/max": 1.7271286249160767, "sampling/importance_sampling_ratio/mean": 0.9100079536437988, "sampling/importance_sampling_ratio/min": 0.40812426805496216, "sampling/sampling_logp_difference/max": 0.3937739133834839, "sampling/sampling_logp_difference/mean": 0.02358619123697281, "step": 416, "step_time": 41.3646322049899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.3718615770339966, "epoch": 0.834, "frac_reward_zero_std": 0.0, "grad_norm": 0.7263632416725159, "kl": 0.02011047676205635, "learning_rate": 3.2879273677943972e-06, "loss": 0.0355, "num_tokens": 2329647.0, "reward": 0.17874999344348907, "reward_std": 0.32287517189979553, "rewards/reward_func/mean": 0.17874999344348907, "rewards/reward_func/std": 0.49820929765701294, "sampling/importance_sampling_ratio/max": 1.2329703569412231, "sampling/importance_sampling_ratio/mean": 0.6824724674224854, "sampling/importance_sampling_ratio/min": 0.3652595281600952, "sampling/sampling_logp_difference/max": 0.40844106674194336, "sampling/sampling_logp_difference/mean": 0.027271784842014313, "step": 417, "step_time": 43.37971075499081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.360298216342926, "epoch": 0.836, "frac_reward_zero_std": 0.0, "grad_norm": 1.1417040824890137, "kl": 0.017974980175495148, "learning_rate": 3.2802390190349364e-06, "loss": 0.2216, "num_tokens": 2336044.0, "reward": 0.10249999910593033, "reward_std": 0.26619309186935425, "rewards/reward_func/mean": 0.10249999910593033, "rewards/reward_func/std": 0.363426148891449, "sampling/importance_sampling_ratio/max": 1.966254472732544, "sampling/importance_sampling_ratio/mean": 0.8495919108390808, "sampling/importance_sampling_ratio/min": 0.25239256024360657, "sampling/sampling_logp_difference/max": 1.2355303764343262, "sampling/sampling_logp_difference/mean": 0.029449839144945145, "step": 418, "step_time": 45.005784367007436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.35880693793296814, "epoch": 0.838, "frac_reward_zero_std": 0.0, "grad_norm": 0.6445991396903992, "kl": 0.02371111512184143, "learning_rate": 3.272542485937369e-06, "loss": -0.1228, "num_tokens": 2341625.0, "reward": -0.04999999701976776, "reward_std": 0.04591917246580124, "rewards/reward_func/mean": -0.04999999701976776, "rewards/reward_func/std": 0.04309457913041115, "sampling/importance_sampling_ratio/max": 1.6012232303619385, "sampling/importance_sampling_ratio/mean": 0.7206702828407288, "sampling/importance_sampling_ratio/min": 0.2793715298175812, "sampling/sampling_logp_difference/max": 0.6976406574249268, "sampling/sampling_logp_difference/mean": 0.02574693039059639, "step": 419, "step_time": 40.036054764001165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.38778284192085266, "epoch": 0.84, "frac_reward_zero_std": 0.0, "grad_norm": 0.8026552796363831, "kl": 0.021173562854528427, "learning_rate": 3.264837849234685e-06, "loss": -0.0381, "num_tokens": 2348306.0, "reward": -0.0625, "reward_std": 0.05533730238676071, "rewards/reward_func/mean": -0.0625, "rewards/reward_func/std": 0.054967522621154785, "sampling/importance_sampling_ratio/max": 2.244253396987915, "sampling/importance_sampling_ratio/mean": 1.0164496898651123, "sampling/importance_sampling_ratio/min": 0.2759284973144531, "sampling/sampling_logp_difference/max": 0.572641134262085, "sampling/sampling_logp_difference/mean": 0.02503090165555477, "step": 420, "step_time": 48.014933019003365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.3811195492744446, "epoch": 0.842, "frac_reward_zero_std": 0.0, "grad_norm": 0.8657627701759338, "kl": 0.011216644197702408, "learning_rate": 3.257125189744877e-06, "loss": -0.1309, "num_tokens": 2353901.0, "reward": 0.3362500071525574, "reward_std": 0.565845787525177, "rewards/reward_func/mean": 0.3362500071525574, "rewards/reward_func/std": 0.5429532527923584, "sampling/importance_sampling_ratio/max": 1.4307345151901245, "sampling/importance_sampling_ratio/mean": 0.824535608291626, "sampling/importance_sampling_ratio/min": 0.5268330574035645, "sampling/sampling_logp_difference/max": 0.5823192000389099, "sampling/sampling_logp_difference/mean": 0.021394170820713043, "step": 421, "step_time": 35.67191677300434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.32794663310050964, "epoch": 0.844, "frac_reward_zero_std": 0.0, "grad_norm": 3.6034460067749023, "kl": 0.026636671274900436, "learning_rate": 3.249404588370095e-06, "loss": 0.2555, "num_tokens": 2358707.0, "reward": 0.4362500309944153, "reward_std": 0.5625, "rewards/reward_func/mean": 0.4362500309944153, "rewards/reward_func/std": 0.6054971218109131, "sampling/importance_sampling_ratio/max": 1.9666376113891602, "sampling/importance_sampling_ratio/mean": 1.0515501499176025, "sampling/importance_sampling_ratio/min": 0.25995469093322754, "sampling/sampling_logp_difference/max": 0.5747603178024292, "sampling/sampling_logp_difference/mean": 0.02525373175740242, "step": 422, "step_time": 33.49426256099832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 62.125, "completions/mean_terminated_length": 62.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.32681041955947876, "epoch": 0.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.7744244337081909, "kl": 0.007102203089743853, "learning_rate": 3.2416761260957925e-06, "loss": 0.043, "num_tokens": 2364549.0, "reward": -0.06749999523162842, "reward_std": 0.03864005580544472, "rewards/reward_func/mean": -0.06749999523162842, "rewards/reward_func/std": 0.04131758585572243, "sampling/importance_sampling_ratio/max": 2.1614413261413574, "sampling/importance_sampling_ratio/mean": 1.109354019165039, "sampling/importance_sampling_ratio/min": 0.5632383823394775, "sampling/sampling_logp_difference/max": 0.3571450710296631, "sampling/sampling_logp_difference/mean": 0.020130092278122902, "step": 423, "step_time": 40.15369539499807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 57.375, "completions/mean_terminated_length": 57.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.31878265738487244, "epoch": 0.848, "frac_reward_zero_std": 0.0, "grad_norm": 1.3974499702453613, "kl": 0.032708972692489624, "learning_rate": 3.233939883989882e-06, "loss": -0.0634, "num_tokens": 2370417.0, "reward": 0.20625001192092896, "reward_std": 0.48608773946762085, "rewards/reward_func/mean": 0.20625001192092896, "rewards/reward_func/std": 0.45102858543395996, "sampling/importance_sampling_ratio/max": 2.078763246536255, "sampling/importance_sampling_ratio/mean": 1.177141547203064, "sampling/importance_sampling_ratio/min": 0.7995861768722534, "sampling/sampling_logp_difference/max": 0.32161664962768555, "sampling/sampling_logp_difference/mean": 0.016767999157309532, "step": 424, "step_time": 39.82506796899543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.38849684596061707, "epoch": 0.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.8255501389503479, "kl": 0.013021498918533325, "learning_rate": 3.2261959432018834e-06, "loss": -0.1225, "num_tokens": 2375900.0, "reward": 0.4375, "reward_std": 0.5317496061325073, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.5842394232749939, "sampling/importance_sampling_ratio/max": 2.1501243114471436, "sampling/importance_sampling_ratio/mean": 0.9571436643600464, "sampling/importance_sampling_ratio/min": 0.4914774000644684, "sampling/sampling_logp_difference/max": 0.5219483375549316, "sampling/sampling_logp_difference/mean": 0.025387398898601532, "step": 425, "step_time": 39.655509564006934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 50.375, "completions/mean_terminated_length": 50.375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.33532607555389404, "epoch": 0.852, "frac_reward_zero_std": 0.0, "grad_norm": 0.8230807781219482, "kl": 0.01512499526143074, "learning_rate": 3.218444384962071e-06, "loss": -0.054, "num_tokens": 2381685.0, "reward": 0.3175000250339508, "reward_std": 0.2747558355331421, "rewards/reward_func/mean": 0.3175000250339508, "rewards/reward_func/std": 0.5468807816505432, "sampling/importance_sampling_ratio/max": 1.088066816329956, "sampling/importance_sampling_ratio/mean": 0.7654009461402893, "sampling/importance_sampling_ratio/min": 0.43912702798843384, "sampling/sampling_logp_difference/max": 0.6466556787490845, "sampling/sampling_logp_difference/mean": 0.025085650384426117, "step": 426, "step_time": 41.43392355799733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.875, "completions/mean_terminated_length": 53.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.33623552322387695, "epoch": 0.854, "frac_reward_zero_std": 0.0, "grad_norm": 1.1959271430969238, "kl": 0.015206445939838886, "learning_rate": 3.210685290580622e-06, "loss": 0.0497, "num_tokens": 2386757.0, "reward": 0.20374999940395355, "reward_std": 0.5192359089851379, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.4818398654460907, "sampling/importance_sampling_ratio/max": 1.3981304168701172, "sampling/importance_sampling_ratio/mean": 0.8693457245826721, "sampling/importance_sampling_ratio/min": 0.3326525390148163, "sampling/sampling_logp_difference/max": 0.46013569831848145, "sampling/sampling_logp_difference/mean": 0.02562933787703514, "step": 427, "step_time": 32.31069487700006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.35203301906585693, "epoch": 0.856, "frac_reward_zero_std": 0.0, "grad_norm": 0.9189528226852417, "kl": 0.015850629657506943, "learning_rate": 3.2029187414467645e-06, "loss": -0.017, "num_tokens": 2392704.0, "reward": 0.3450000286102295, "reward_std": 0.5642583966255188, "rewards/reward_func/mean": 0.3450000286102295, "rewards/reward_func/std": 0.5430338382720947, "sampling/importance_sampling_ratio/max": 1.4162225723266602, "sampling/importance_sampling_ratio/mean": 0.7990955114364624, "sampling/importance_sampling_ratio/min": 0.42359423637390137, "sampling/sampling_logp_difference/max": 0.6078430414199829, "sampling/sampling_logp_difference/mean": 0.02648048661649227, "step": 428, "step_time": 33.69458929898974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3973723351955414, "epoch": 0.858, "frac_reward_zero_std": 0.0, "grad_norm": 0.6990527510643005, "kl": 0.012423260137438774, "learning_rate": 3.1951448190279256e-06, "loss": 0.0692, "num_tokens": 2398486.0, "reward": 0.0949999988079071, "reward_std": 0.26899394392967224, "rewards/reward_func/mean": 0.0949999988079071, "rewards/reward_func/std": 0.36621618270874023, "sampling/importance_sampling_ratio/max": 1.2071622610092163, "sampling/importance_sampling_ratio/mean": 0.8269263505935669, "sampling/importance_sampling_ratio/min": 0.41573214530944824, "sampling/sampling_logp_difference/max": 0.488450288772583, "sampling/sampling_logp_difference/mean": 0.023476937785744667, "step": 429, "step_time": 35.47933077499329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.36242973804473877, "epoch": 0.86, "frac_reward_zero_std": 0.0, "grad_norm": 0.9864727258682251, "kl": 0.010072307661175728, "learning_rate": 3.1873636048688714e-06, "loss": -0.1138, "num_tokens": 2403892.0, "reward": 0.33000001311302185, "reward_std": 0.5705651044845581, "rewards/reward_func/mean": 0.33000001311302185, "rewards/reward_func/std": 0.5465214252471924, "sampling/importance_sampling_ratio/max": 1.5361573696136475, "sampling/importance_sampling_ratio/mean": 1.0155696868896484, "sampling/importance_sampling_ratio/min": 0.7467938661575317, "sampling/sampling_logp_difference/max": 0.23247402906417847, "sampling/sampling_logp_difference/mean": 0.01987922564148903, "step": 430, "step_time": 29.146404054001323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.35852545499801636, "epoch": 0.862, "frac_reward_zero_std": 0.0, "grad_norm": 1.280659556388855, "kl": 0.02400248870253563, "learning_rate": 3.1795751805908578e-06, "loss": -0.0427, "num_tokens": 2409261.0, "reward": 0.20374999940395355, "reward_std": 0.5336059331893921, "rewards/reward_func/mean": 0.20374999940395355, "rewards/reward_func/std": 0.4941641688346863, "sampling/importance_sampling_ratio/max": 1.468123197555542, "sampling/importance_sampling_ratio/mean": 0.9840533137321472, "sampling/importance_sampling_ratio/min": 0.4497135579586029, "sampling/sampling_logp_difference/max": 0.39689433574676514, "sampling/sampling_logp_difference/mean": 0.021220847964286804, "step": 431, "step_time": 33.816670406013145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3159053921699524, "epoch": 0.864, "frac_reward_zero_std": 0.0, "grad_norm": 2.1149163246154785, "kl": 0.01616514101624489, "learning_rate": 3.171779627890769e-06, "loss": -0.2228, "num_tokens": 2413916.0, "reward": 0.09000000357627869, "reward_std": 0.25971826910972595, "rewards/reward_func/mean": 0.09000000357627869, "rewards/reward_func/std": 0.3446737825870514, "sampling/importance_sampling_ratio/max": 1.8468085527420044, "sampling/importance_sampling_ratio/mean": 1.1968073844909668, "sampling/importance_sampling_ratio/min": 0.751221776008606, "sampling/sampling_logp_difference/max": 0.3156614303588867, "sampling/sampling_logp_difference/mean": 0.02191685512661934, "step": 432, "step_time": 33.72210700699361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 50.125, "completions/mean_terminated_length": 50.125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.31895625591278076, "epoch": 0.866, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546246767044067, "kl": 0.020721694454550743, "learning_rate": 3.1639770285402632e-06, "loss": 0.0157, "num_tokens": 2419091.0, "reward": 0.1862500011920929, "reward_std": 0.5399943590164185, "rewards/reward_func/mean": 0.1862500011920929, "rewards/reward_func/std": 0.5013107657432556, "sampling/importance_sampling_ratio/max": 2.4649336338043213, "sampling/importance_sampling_ratio/mean": 0.9663840532302856, "sampling/importance_sampling_ratio/min": 0.36142030358314514, "sampling/sampling_logp_difference/max": 0.4676198959350586, "sampling/sampling_logp_difference/mean": 0.024064481258392334, "step": 433, "step_time": 30.73073644300166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 55.125, "completions/mean_terminated_length": 55.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.39400357007980347, "epoch": 0.868, "frac_reward_zero_std": 0.0, "grad_norm": 1.1748162508010864, "kl": 0.023680763319134712, "learning_rate": 3.1561674643849173e-06, "loss": 0.2299, "num_tokens": 2424823.0, "reward": -0.04749999940395355, "reward_std": 0.035702817142009735, "rewards/reward_func/mean": -0.04749999940395355, "rewards/reward_func/std": 0.03845219686627388, "sampling/importance_sampling_ratio/max": 2.1316819190979004, "sampling/importance_sampling_ratio/mean": 1.2642168998718262, "sampling/importance_sampling_ratio/min": 0.5596238374710083, "sampling/sampling_logp_difference/max": 0.49070703983306885, "sampling/sampling_logp_difference/mean": 0.026236988604068756, "step": 434, "step_time": 46.65107938699657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.381875604391098, "epoch": 0.87, "frac_reward_zero_std": 0.0, "grad_norm": 1.038621187210083, "kl": 0.016581958159804344, "learning_rate": 3.148351017343363e-06, "loss": 0.0706, "num_tokens": 2431458.0, "reward": 0.3075000047683716, "reward_std": 0.28910407423973083, "rewards/reward_func/mean": 0.3075000047683716, "rewards/reward_func/std": 0.5689526796340942, "sampling/importance_sampling_ratio/max": 1.7760720252990723, "sampling/importance_sampling_ratio/mean": 1.0361217260360718, "sampling/importance_sampling_ratio/min": 0.5515484809875488, "sampling/sampling_logp_difference/max": 0.5586767196655273, "sampling/sampling_logp_difference/mean": 0.025216208770871162, "step": 435, "step_time": 51.65877502699732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.33468616008758545, "epoch": 0.872, "frac_reward_zero_std": 0.0, "grad_norm": 0.9653282761573792, "kl": 0.013657507486641407, "learning_rate": 3.1405277694064306e-06, "loss": 0.1378, "num_tokens": 2437082.0, "reward": 0.5924999713897705, "reward_std": 0.5440975427627563, "rewards/reward_func/mean": 0.5924999713897705, "rewards/reward_func/std": 0.5312720537185669, "sampling/importance_sampling_ratio/max": 1.5817586183547974, "sampling/importance_sampling_ratio/mean": 0.9748501777648926, "sampling/importance_sampling_ratio/min": 0.4865840971469879, "sampling/sampling_logp_difference/max": 0.43617498874664307, "sampling/sampling_logp_difference/mean": 0.022868365049362183, "step": 436, "step_time": 33.30673982600274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.375, "completions/mean_terminated_length": 51.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3179436922073364, "epoch": 0.874, "frac_reward_zero_std": 0.0, "grad_norm": 1.7460964918136597, "kl": 0.009489338845014572, "learning_rate": 3.1326978026362907e-06, "loss": 0.2029, "num_tokens": 2443073.0, "reward": 0.22499999403953552, "reward_std": 0.3186895549297333, "rewards/reward_func/mean": 0.22499999403953552, "rewards/reward_func/std": 0.4789273738861084, "sampling/importance_sampling_ratio/max": 2.7308499813079834, "sampling/importance_sampling_ratio/mean": 1.1698341369628906, "sampling/importance_sampling_ratio/min": 0.4456416368484497, "sampling/sampling_logp_difference/max": 0.751288652420044, "sampling/sampling_logp_difference/mean": 0.02368471771478653, "step": 437, "step_time": 56.14041445599287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.375, "completions/mean_terminated_length": 49.375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.33749181032180786, "epoch": 0.876, "frac_reward_zero_std": 0.0, "grad_norm": 1.662105679512024, "kl": 0.013462947681546211, "learning_rate": 3.1248611991655885e-06, "loss": -0.0522, "num_tokens": 2448957.0, "reward": 0.32500001788139343, "reward_std": 0.5697466135025024, "rewards/reward_func/mean": 0.32500001788139343, "rewards/reward_func/std": 0.554307758808136, "sampling/importance_sampling_ratio/max": 2.088134288787842, "sampling/importance_sampling_ratio/mean": 1.2728486061096191, "sampling/importance_sampling_ratio/min": 0.7122271656990051, "sampling/sampling_logp_difference/max": 1.059885025024414, "sampling/sampling_logp_difference/mean": 0.023971613496541977, "step": 438, "step_time": 40.55881189700449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.39506107568740845, "epoch": 0.878, "frac_reward_zero_std": 0.0, "grad_norm": 0.9108272194862366, "kl": 0.012316450476646423, "learning_rate": 3.1170180411965854e-06, "loss": -0.0778, "num_tokens": 2455782.0, "reward": 0.05125000327825546, "reward_std": 0.2902706265449524, "rewards/reward_func/mean": 0.05125000327825546, "rewards/reward_func/std": 0.386502206325531, "sampling/importance_sampling_ratio/max": 1.1981909275054932, "sampling/importance_sampling_ratio/mean": 0.7162038087844849, "sampling/importance_sampling_ratio/min": 0.44226300716400146, "sampling/sampling_logp_difference/max": 0.5306471586227417, "sampling/sampling_logp_difference/mean": 0.023498659953475, "step": 439, "step_time": 52.448768703994574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.37565258145332336, "epoch": 0.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.8730466365814209, "kl": 0.11407122761011124, "learning_rate": 3.109168411000299e-06, "loss": 0.0533, "num_tokens": 2460686.0, "reward": 0.20750001072883606, "reward_std": 0.5263463258743286, "rewards/reward_func/mean": 0.20750001072883606, "rewards/reward_func/std": 0.4876108169555664, "sampling/importance_sampling_ratio/max": 1.0784844160079956, "sampling/importance_sampling_ratio/mean": 0.675607442855835, "sampling/importance_sampling_ratio/min": 0.0301599632948637, "sampling/sampling_logp_difference/max": 1.5674490928649902, "sampling/sampling_logp_difference/mean": 0.030301496386528015, "step": 440, "step_time": 40.517616327008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 57.125, "completions/mean_terminated_length": 57.125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.34737882018089294, "epoch": 0.882, "frac_reward_zero_std": 0.0, "grad_norm": 0.7645708918571472, "kl": 0.01040720660239458, "learning_rate": 3.1013123909156347e-06, "loss": 0.2671, "num_tokens": 2465679.0, "reward": 0.0637499988079071, "reward_std": 0.2704658508300781, "rewards/reward_func/mean": 0.0637499988079071, "rewards/reward_func/std": 0.3515653908252716, "sampling/importance_sampling_ratio/max": 1.5223848819732666, "sampling/importance_sampling_ratio/mean": 1.0293443202972412, "sampling/importance_sampling_ratio/min": 0.5342814326286316, "sampling/sampling_logp_difference/max": 0.3400125503540039, "sampling/sampling_logp_difference/mean": 0.019414888694882393, "step": 441, "step_time": 32.846883123987936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3078349232673645, "epoch": 0.884, "frac_reward_zero_std": 0.0, "grad_norm": 1.4805446863174438, "kl": 0.011887718923389912, "learning_rate": 3.093450063348525e-06, "loss": 0.193, "num_tokens": 2471241.0, "reward": 0.34375, "reward_std": 0.28617510199546814, "rewards/reward_func/mean": 0.34375, "rewards/reward_func/std": 0.5389391183853149, "sampling/importance_sampling_ratio/max": 2.141223192214966, "sampling/importance_sampling_ratio/mean": 1.1272714138031006, "sampling/importance_sampling_ratio/min": 0.6114374995231628, "sampling/sampling_logp_difference/max": 0.6033744812011719, "sampling/sampling_logp_difference/mean": 0.01826009526848793, "step": 442, "step_time": 36.22709542399389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3010959029197693, "epoch": 0.886, "frac_reward_zero_std": 0.0, "grad_norm": 1.4849302768707275, "kl": 0.009227529168128967, "learning_rate": 3.085581510771067e-06, "loss": -0.141, "num_tokens": 2476093.0, "reward": 0.3187499940395355, "reward_std": 0.2881261706352234, "rewards/reward_func/mean": 0.3187499940395355, "rewards/reward_func/std": 0.5388993620872498, "sampling/importance_sampling_ratio/max": 1.947962760925293, "sampling/importance_sampling_ratio/mean": 1.3393418788909912, "sampling/importance_sampling_ratio/min": 0.5013647675514221, "sampling/sampling_logp_difference/max": 0.4786471128463745, "sampling/sampling_logp_difference/mean": 0.01934540644288063, "step": 443, "step_time": 34.268270688000484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.33479422330856323, "epoch": 0.888, "frac_reward_zero_std": 0.0, "grad_norm": 0.9843921065330505, "kl": 0.008889885619282722, "learning_rate": 3.0777068157206535e-06, "loss": -0.1042, "num_tokens": 2481969.0, "reward": 0.3225000202655792, "reward_std": 0.2926766574382782, "rewards/reward_func/mean": 0.3225000202655792, "rewards/reward_func/std": 0.5274940729141235, "sampling/importance_sampling_ratio/max": 1.5019242763519287, "sampling/importance_sampling_ratio/mean": 0.9874775409698486, "sampling/importance_sampling_ratio/min": 0.6516150236129761, "sampling/sampling_logp_difference/max": 0.47698545455932617, "sampling/sampling_logp_difference/mean": 0.022217225283384323, "step": 444, "step_time": 37.88683218799997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.347456157207489, "epoch": 0.89, "frac_reward_zero_std": 0.0, "grad_norm": 1.2607645988464355, "kl": 0.011377193965017796, "learning_rate": 3.0698260607991094e-06, "loss": -0.1942, "num_tokens": 2486972.0, "reward": 0.061250001192092896, "reward_std": 0.2965749502182007, "rewards/reward_func/mean": 0.061250001192092896, "rewards/reward_func/std": 0.3812175393104553, "sampling/importance_sampling_ratio/max": 1.8228760957717896, "sampling/importance_sampling_ratio/mean": 0.942634105682373, "sampling/importance_sampling_ratio/min": 0.46362996101379395, "sampling/sampling_logp_difference/max": 0.3296375274658203, "sampling/sampling_logp_difference/mean": 0.01900862343609333, "step": 445, "step_time": 36.2604122460034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 46.625, "completions/mean_terminated_length": 46.625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.37372976541519165, "epoch": 0.892, "frac_reward_zero_std": 0.0, "grad_norm": 1.0763218402862549, "kl": 0.01523488201200962, "learning_rate": 3.061939328671824e-06, "loss": 0.2346, "num_tokens": 2492375.0, "reward": 0.45375001430511475, "reward_std": 0.6083322763442993, "rewards/reward_func/mean": 0.45375001430511475, "rewards/reward_func/std": 0.5633050203323364, "sampling/importance_sampling_ratio/max": 1.4935482740402222, "sampling/importance_sampling_ratio/mean": 0.9692554473876953, "sampling/importance_sampling_ratio/min": 0.3600374758243561, "sampling/sampling_logp_difference/max": 0.45327699184417725, "sampling/sampling_logp_difference/mean": 0.023247534409165382, "step": 446, "step_time": 31.12013271600881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3365999460220337, "epoch": 0.894, "frac_reward_zero_std": 0.0, "grad_norm": 0.8528043031692505, "kl": 0.014648355543613434, "learning_rate": 3.054046702066886e-06, "loss": -0.0107, "num_tokens": 2498064.0, "reward": 0.45374998450279236, "reward_std": 0.5031688809394836, "rewards/reward_func/mean": 0.45374998450279236, "rewards/reward_func/std": 0.5408442616462708, "sampling/importance_sampling_ratio/max": 1.301316499710083, "sampling/importance_sampling_ratio/mean": 0.8368029594421387, "sampling/importance_sampling_ratio/min": 0.5832191109657288, "sampling/sampling_logp_difference/max": 0.5037274360656738, "sampling/sampling_logp_difference/mean": 0.02292271889746189, "step": 447, "step_time": 38.94344817800447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3609512448310852, "epoch": 0.896, "frac_reward_zero_std": 0.0, "grad_norm": 1.116279125213623, "kl": 0.00902464147657156, "learning_rate": 3.0461482637742133e-06, "loss": 0.3113, "num_tokens": 2504084.0, "reward": 0.20375001430511475, "reward_std": 0.32907286286354065, "rewards/reward_func/mean": 0.20375001430511475, "rewards/reward_func/std": 0.483822226524353, "sampling/importance_sampling_ratio/max": 2.460287570953369, "sampling/importance_sampling_ratio/mean": 1.2798258066177368, "sampling/importance_sampling_ratio/min": 0.588743269443512, "sampling/sampling_logp_difference/max": 0.512649416923523, "sampling/sampling_logp_difference/mean": 0.021559733897447586, "step": 448, "step_time": 42.009523353000986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.350589394569397, "epoch": 0.898, "frac_reward_zero_std": 0.0, "grad_norm": 0.955585241317749, "kl": 0.0077981045469641685, "learning_rate": 3.0382440966446876e-06, "loss": -0.048, "num_tokens": 2509876.0, "reward": 0.4662500023841858, "reward_std": 0.5135143995285034, "rewards/reward_func/mean": 0.4662500023841858, "rewards/reward_func/std": 0.5347346067428589, "sampling/importance_sampling_ratio/max": 1.597688913345337, "sampling/importance_sampling_ratio/mean": 1.0106717348098755, "sampling/importance_sampling_ratio/min": 0.5116551518440247, "sampling/sampling_logp_difference/max": 0.40577125549316406, "sampling/sampling_logp_difference/mean": 0.026103615760803223, "step": 449, "step_time": 38.17177925199212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.34553277492523193, "epoch": 0.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.7108585238456726, "kl": 0.010503964498639107, "learning_rate": 3.0303342835892804e-06, "loss": -0.1033, "num_tokens": 2516447.0, "reward": 0.20999999344348907, "reward_std": 0.32175832986831665, "rewards/reward_func/mean": 0.20999999344348907, "rewards/reward_func/std": 0.4881744682788849, "sampling/importance_sampling_ratio/max": 1.5398190021514893, "sampling/importance_sampling_ratio/mean": 0.9388370513916016, "sampling/importance_sampling_ratio/min": 0.38391628861427307, "sampling/sampling_logp_difference/max": 0.45900917053222656, "sampling/sampling_logp_difference/mean": 0.020982615649700165, "step": 450, "step_time": 43.6907909319998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.32224124670028687, "epoch": 0.902, "frac_reward_zero_std": 0.0, "grad_norm": 1.2028799057006836, "kl": 0.016997255384922028, "learning_rate": 3.0224189075781886e-06, "loss": 0.0639, "num_tokens": 2522872.0, "reward": 0.10750000178813934, "reward_std": 0.2686707377433777, "rewards/reward_func/mean": 0.10750000178813934, "rewards/reward_func/std": 0.3612181842327118, "sampling/importance_sampling_ratio/max": 1.7533460855484009, "sampling/importance_sampling_ratio/mean": 1.1469990015029907, "sampling/importance_sampling_ratio/min": 0.3751453459262848, "sampling/sampling_logp_difference/max": 0.5441827774047852, "sampling/sampling_logp_difference/mean": 0.020926637575030327, "step": 451, "step_time": 44.405825520996586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.39713117480278015, "epoch": 0.904, "frac_reward_zero_std": 0.0, "grad_norm": 1.0681439638137817, "kl": 0.01726282574236393, "learning_rate": 3.014498051639959e-06, "loss": -0.1024, "num_tokens": 2528540.0, "reward": -0.027499999850988388, "reward_std": 0.024149831384420395, "rewards/reward_func/mean": -0.027499999850988388, "rewards/reward_func/std": 0.022519832476973534, "sampling/importance_sampling_ratio/max": 1.3811919689178467, "sampling/importance_sampling_ratio/mean": 0.9916707277297974, "sampling/importance_sampling_ratio/min": 0.5636754631996155, "sampling/sampling_logp_difference/max": 0.34890270233154297, "sampling/sampling_logp_difference/mean": 0.025962986052036285, "step": 452, "step_time": 39.69970705700689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.125, "completions/mean_terminated_length": 56.125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.36409592628479004, "epoch": 0.906, "frac_reward_zero_std": 0.0, "grad_norm": 0.9367609024047852, "kl": 0.008488480933010578, "learning_rate": 3.006571798860626e-06, "loss": 0.0109, "num_tokens": 2533758.0, "reward": 0.4700000286102295, "reward_std": 0.4986118674278259, "rewards/reward_func/mean": 0.4700000286102295, "rewards/reward_func/std": 0.5269860625267029, "sampling/importance_sampling_ratio/max": 1.6330208778381348, "sampling/importance_sampling_ratio/mean": 1.038517951965332, "sampling/importance_sampling_ratio/min": 0.5776368975639343, "sampling/sampling_logp_difference/max": 0.4428684711456299, "sampling/sampling_logp_difference/mean": 0.02270490676164627, "step": 453, "step_time": 31.74909894198936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.3147682547569275, "epoch": 0.908, "frac_reward_zero_std": 0.0, "grad_norm": 0.5675761103630066, "kl": 0.007853170856833458, "learning_rate": 2.9986402323828274e-06, "loss": -0.1266, "num_tokens": 2539130.0, "reward": 0.18000000715255737, "reward_std": 0.35241687297821045, "rewards/reward_func/mean": 0.18000000715255737, "rewards/reward_func/std": 0.5112450122833252, "sampling/importance_sampling_ratio/max": 0.7927283644676208, "sampling/importance_sampling_ratio/mean": 0.6180651187896729, "sampling/importance_sampling_ratio/min": 0.16511119902133942, "sampling/sampling_logp_difference/max": 0.6556259393692017, "sampling/sampling_logp_difference/mean": 0.022515466436743736, "step": 454, "step_time": 38.15574240600108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3250408172607422, "epoch": 0.91, "frac_reward_zero_std": 0.0, "grad_norm": 0.9744404554367065, "kl": 0.008251778781414032, "learning_rate": 2.9907034354049443e-06, "loss": 0.1804, "num_tokens": 2544571.0, "reward": 0.44749999046325684, "reward_std": 0.5988933444023132, "rewards/reward_func/mean": 0.44749999046325684, "rewards/reward_func/std": 0.5548165440559387, "sampling/importance_sampling_ratio/max": 1.5385011434555054, "sampling/importance_sampling_ratio/mean": 0.9114477634429932, "sampling/importance_sampling_ratio/min": 0.29668596386909485, "sampling/sampling_logp_difference/max": 0.35155487060546875, "sampling/sampling_logp_difference/mean": 0.02304825559258461, "step": 455, "step_time": 33.35669154900825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.875, "completions/mean_terminated_length": 47.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3169974982738495, "epoch": 0.912, "frac_reward_zero_std": 0.0, "grad_norm": 0.806413471698761, "kl": 0.013721331022679806, "learning_rate": 2.9827614911802205e-06, "loss": -0.0326, "num_tokens": 2550599.0, "reward": 0.5987499952316284, "reward_std": 0.5605678558349609, "rewards/reward_func/mean": 0.5987499952316284, "rewards/reward_func/std": 0.543413519859314, "sampling/importance_sampling_ratio/max": 1.1396749019622803, "sampling/importance_sampling_ratio/mean": 0.8662426471710205, "sampling/importance_sampling_ratio/min": 0.5802029371261597, "sampling/sampling_logp_difference/max": 0.41710424423217773, "sampling/sampling_logp_difference/mean": 0.018221460282802582, "step": 456, "step_time": 28.43808863000595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3788241744041443, "epoch": 0.914, "frac_reward_zero_std": 0.0, "grad_norm": 1.3950449228286743, "kl": 0.009939423762261868, "learning_rate": 2.9748144830158925e-06, "loss": -0.1656, "num_tokens": 2555995.0, "reward": 0.32499998807907104, "reward_std": 0.5697804093360901, "rewards/reward_func/mean": 0.32499998807907104, "rewards/reward_func/std": 0.5482960343360901, "sampling/importance_sampling_ratio/max": 1.5532302856445312, "sampling/importance_sampling_ratio/mean": 1.2069408893585205, "sampling/importance_sampling_ratio/min": 0.7212671041488647, "sampling/sampling_logp_difference/max": 0.6211786270141602, "sampling/sampling_logp_difference/mean": 0.020913410931825638, "step": 457, "step_time": 32.56511296798999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 48.375, "completions/mean_terminated_length": 48.375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.3860653042793274, "epoch": 0.916, "frac_reward_zero_std": 0.0, "grad_norm": 0.918066680431366, "kl": 0.0109860859811306, "learning_rate": 2.966862494272316e-06, "loss": 0.0279, "num_tokens": 2561552.0, "reward": 0.33000001311302185, "reward_std": 0.5475718975067139, "rewards/reward_func/mean": 0.33000001311302185, "rewards/reward_func/std": 0.5318431854248047, "sampling/importance_sampling_ratio/max": 1.018545389175415, "sampling/importance_sampling_ratio/mean": 0.7079232931137085, "sampling/importance_sampling_ratio/min": 0.3387696444988251, "sampling/sampling_logp_difference/max": 0.3644402027130127, "sampling/sampling_logp_difference/mean": 0.028172709047794342, "step": 458, "step_time": 43.0941359270073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.34271055459976196, "epoch": 0.918, "frac_reward_zero_std": 0.0, "grad_norm": 0.9301976561546326, "kl": 0.03799736499786377, "learning_rate": 2.9589056083620902e-06, "loss": 0.1021, "num_tokens": 2566731.0, "reward": 0.1875, "reward_std": 0.531709611415863, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.49430328607559204, "sampling/importance_sampling_ratio/max": 1.698185920715332, "sampling/importance_sampling_ratio/mean": 0.8187704682350159, "sampling/importance_sampling_ratio/min": 0.2524738907814026, "sampling/sampling_logp_difference/max": 0.7073307037353516, "sampling/sampling_logp_difference/mean": 0.029650993645191193, "step": 459, "step_time": 36.267561529995874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3156815767288208, "epoch": 0.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.8615498542785645, "kl": 0.014610467478632927, "learning_rate": 2.9509439087491837e-06, "loss": 0.071, "num_tokens": 2572071.0, "reward": 0.33250001072883606, "reward_std": 0.5657204389572144, "rewards/reward_func/mean": 0.33250001072883606, "rewards/reward_func/std": 0.546776294708252, "sampling/importance_sampling_ratio/max": 1.6013303995132446, "sampling/importance_sampling_ratio/mean": 0.9272360801696777, "sampling/importance_sampling_ratio/min": 0.5939593315124512, "sampling/sampling_logp_difference/max": 0.4442490339279175, "sampling/sampling_logp_difference/mean": 0.0204640980809927, "step": 460, "step_time": 38.44636067999818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.375, "completions/mean_terminated_length": 47.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.33302855491638184, "epoch": 0.922, "frac_reward_zero_std": 0.0, "grad_norm": 1.6637190580368042, "kl": 0.05872795730829239, "learning_rate": 2.9429774789480576e-06, "loss": 0.0747, "num_tokens": 2577004.0, "reward": 0.33500000834465027, "reward_std": 0.5313875675201416, "rewards/reward_func/mean": 0.33500000834465027, "rewards/reward_func/std": 0.5113846659660339, "sampling/importance_sampling_ratio/max": 1.9112398624420166, "sampling/importance_sampling_ratio/mean": 0.9847674369812012, "sampling/importance_sampling_ratio/min": 0.5912163853645325, "sampling/sampling_logp_difference/max": 0.649017333984375, "sampling/sampling_logp_difference/mean": 0.025793246924877167, "step": 461, "step_time": 27.65348793999874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3585735261440277, "epoch": 0.924, "frac_reward_zero_std": 0.0, "grad_norm": 1.5941725969314575, "kl": 0.00782017782330513, "learning_rate": 2.93500640252279e-06, "loss": -0.4099, "num_tokens": 2582404.0, "reward": 0.07625000178813934, "reward_std": 0.2834315598011017, "rewards/reward_func/mean": 0.07625000178813934, "rewards/reward_func/std": 0.3762194514274597, "sampling/importance_sampling_ratio/max": 2.3709073066711426, "sampling/importance_sampling_ratio/mean": 1.1157333850860596, "sampling/importance_sampling_ratio/min": 0.6296651363372803, "sampling/sampling_logp_difference/max": 0.43739819526672363, "sampling/sampling_logp_difference/mean": 0.02625291794538498, "step": 462, "step_time": 41.56208431599953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 56.625, "completions/mean_terminated_length": 56.625, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.3185637891292572, "epoch": 0.926, "frac_reward_zero_std": 0.0, "grad_norm": 0.8803660273551941, "kl": 0.006397469900548458, "learning_rate": 2.927030763086201e-06, "loss": 0.0036, "num_tokens": 2587423.0, "reward": 0.4750000238418579, "reward_std": 0.5177191495895386, "rewards/reward_func/mean": 0.4750000238418579, "rewards/reward_func/std": 0.5487908720970154, "sampling/importance_sampling_ratio/max": 1.4098753929138184, "sampling/importance_sampling_ratio/mean": 0.9890186190605164, "sampling/importance_sampling_ratio/min": 0.6722173690795898, "sampling/sampling_logp_difference/max": 0.31026315689086914, "sampling/sampling_logp_difference/mean": 0.01704924926161766, "step": 463, "step_time": 27.707606516996748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 50.375, "completions/mean_terminated_length": 50.375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.341752827167511, "epoch": 0.928, "frac_reward_zero_std": 0.0, "grad_norm": 1.072123646736145, "kl": 0.00813986174762249, "learning_rate": 2.9190506442989753e-06, "loss": 0.0383, "num_tokens": 2592652.0, "reward": -0.01874999888241291, "reward_std": 0.01129152812063694, "rewards/reward_func/mean": -0.01874999888241291, "rewards/reward_func/std": 0.011259916238486767, "sampling/importance_sampling_ratio/max": 1.9521905183792114, "sampling/importance_sampling_ratio/mean": 1.0056908130645752, "sampling/importance_sampling_ratio/min": 0.4280635416507721, "sampling/sampling_logp_difference/max": 0.29516351222991943, "sampling/sampling_logp_difference/mean": 0.020366854965686798, "step": 464, "step_time": 41.832906912997714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 56.875, "completions/mean_terminated_length": 56.875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.33210664987564087, "epoch": 0.93, "frac_reward_zero_std": 0.0, "grad_norm": 1.4356821775436401, "kl": 0.010456573218107224, "learning_rate": 2.9110661298687824e-06, "loss": 0.1115, "num_tokens": 2597994.0, "reward": 0.4599999785423279, "reward_std": 0.6102153062820435, "rewards/reward_func/mean": 0.4599999785423279, "rewards/reward_func/std": 0.5649778842926025, "sampling/importance_sampling_ratio/max": 1.6905672550201416, "sampling/importance_sampling_ratio/mean": 0.9656480550765991, "sampling/importance_sampling_ratio/min": 0.4363991916179657, "sampling/sampling_logp_difference/max": 0.4984140396118164, "sampling/sampling_logp_difference/mean": 0.01931898109614849, "step": 465, "step_time": 38.68506367498776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3974885940551758, "epoch": 0.932, "frac_reward_zero_std": 0.0, "grad_norm": 1.112714171409607, "kl": 0.007472895085811615, "learning_rate": 2.9030773035493997e-06, "loss": 0.1579, "num_tokens": 2603516.0, "reward": 0.45875000953674316, "reward_std": 0.6050867438316345, "rewards/reward_func/mean": 0.45875000953674316, "rewards/reward_func/std": 0.5605975985527039, "sampling/importance_sampling_ratio/max": 1.5251615047454834, "sampling/importance_sampling_ratio/mean": 1.0866496562957764, "sampling/importance_sampling_ratio/min": 0.5772762894630432, "sampling/sampling_logp_difference/max": 0.33701562881469727, "sampling/sampling_logp_difference/mean": 0.02309374138712883, "step": 466, "step_time": 30.25664325800608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3211430311203003, "epoch": 0.934, "frac_reward_zero_std": 0.0, "grad_norm": 1.1987115144729614, "kl": 0.006518337409943342, "learning_rate": 2.8950842491398358e-06, "loss": 0.2122, "num_tokens": 2609600.0, "reward": 0.3187499940395355, "reward_std": 0.5568737983703613, "rewards/reward_func/mean": 0.3187499940395355, "rewards/reward_func/std": 0.5387137532234192, "sampling/importance_sampling_ratio/max": 1.8600863218307495, "sampling/importance_sampling_ratio/mean": 1.2007018327713013, "sampling/importance_sampling_ratio/min": 0.5022804141044617, "sampling/sampling_logp_difference/max": 0.36564385890960693, "sampling/sampling_logp_difference/mean": 0.01762022264301777, "step": 467, "step_time": 37.397101591996034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.34476569294929504, "epoch": 0.936, "frac_reward_zero_std": 0.0, "grad_norm": 1.1792073249816895, "kl": 0.008605368435382843, "learning_rate": 2.8870870504834497e-06, "loss": -0.2764, "num_tokens": 2615093.0, "reward": 0.3112500011920929, "reward_std": 0.5650759339332581, "rewards/reward_func/mean": 0.3112500011920929, "rewards/reward_func/std": 0.5401173830032349, "sampling/importance_sampling_ratio/max": 1.6590116024017334, "sampling/importance_sampling_ratio/mean": 0.9717092514038086, "sampling/importance_sampling_ratio/min": 0.4958648681640625, "sampling/sampling_logp_difference/max": 0.3287513256072998, "sampling/sampling_logp_difference/mean": 0.018109293654561043, "step": 468, "step_time": 38.83392319300037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 55.875, "completions/mean_terminated_length": 55.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.36447829008102417, "epoch": 0.938, "frac_reward_zero_std": 0.0, "grad_norm": 1.4201726913452148, "kl": 0.006331034004688263, "learning_rate": 2.87908579146707e-06, "loss": -0.1119, "num_tokens": 2620537.0, "reward": 0.08874999731779099, "reward_std": 0.2768441140651703, "rewards/reward_func/mean": 0.08874999731779099, "rewards/reward_func/std": 0.37169256806373596, "sampling/importance_sampling_ratio/max": 1.8191684484481812, "sampling/importance_sampling_ratio/mean": 1.2801835536956787, "sampling/importance_sampling_ratio/min": 0.562208890914917, "sampling/sampling_logp_difference/max": 0.34848713874816895, "sampling/sampling_logp_difference/mean": 0.021121980622410774, "step": 469, "step_time": 33.988550513007795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 48.125, "completions/mean_terminated_length": 48.125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.34533941745758057, "epoch": 0.94, "frac_reward_zero_std": 0.0, "grad_norm": 1.524201512336731, "kl": 0.01087350957095623, "learning_rate": 2.8710805560201184e-06, "loss": 0.1765, "num_tokens": 2626187.0, "reward": 0.47749999165534973, "reward_std": 0.5234072804450989, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.5489405989646912, "sampling/importance_sampling_ratio/max": 1.834068775177002, "sampling/importance_sampling_ratio/mean": 0.9888299703598022, "sampling/importance_sampling_ratio/min": 0.19429278373718262, "sampling/sampling_logp_difference/max": 0.3557319641113281, "sampling/sampling_logp_difference/mean": 0.02235410362482071, "step": 470, "step_time": 42.12490658200113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.3179677724838257, "epoch": 0.942, "frac_reward_zero_std": 0.0, "grad_norm": 2.6474220752716064, "kl": 0.008520292118191719, "learning_rate": 2.8630714281137263e-06, "loss": 0.4652, "num_tokens": 2632352.0, "reward": 0.3037499785423279, "reward_std": 0.2923581600189209, "rewards/reward_func/mean": 0.3037499785423279, "rewards/reward_func/std": 0.5454732775688171, "sampling/importance_sampling_ratio/max": 2.8816580772399902, "sampling/importance_sampling_ratio/mean": 1.172086238861084, "sampling/importance_sampling_ratio/min": 0.4828004837036133, "sampling/sampling_logp_difference/max": 0.3818354606628418, "sampling/sampling_logp_difference/mean": 0.026590559631586075, "step": 471, "step_time": 44.42633706900233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 53.125, "completions/mean_terminated_length": 53.125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.35466456413269043, "epoch": 0.944, "frac_reward_zero_std": 0.0, "grad_norm": 1.0142226219177246, "kl": 0.006118321791291237, "learning_rate": 2.8550584917598558e-06, "loss": -0.1626, "num_tokens": 2638591.0, "reward": 0.08874999731779099, "reward_std": 0.28442591428756714, "rewards/reward_func/mean": 0.08874999731779099, "rewards/reward_func/std": 0.3698045015335083, "sampling/importance_sampling_ratio/max": 1.8325932025909424, "sampling/importance_sampling_ratio/mean": 1.0203707218170166, "sampling/importance_sampling_ratio/min": 0.5196402668952942, "sampling/sampling_logp_difference/max": 0.3019367456436157, "sampling/sampling_logp_difference/mean": 0.020309556275606155, "step": 472, "step_time": 35.4328885779978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.33365398645401, "epoch": 0.946, "frac_reward_zero_std": 0.0, "grad_norm": 0.9207823276519775, "kl": 0.007777152117341757, "learning_rate": 2.8470418310104175e-06, "loss": 0.0079, "num_tokens": 2644340.0, "reward": -0.0625, "reward_std": 0.046318307518959045, "rewards/reward_func/mean": -0.0625, "rewards/reward_func/std": 0.04334248974919319, "sampling/importance_sampling_ratio/max": 1.521376609802246, "sampling/importance_sampling_ratio/mean": 0.9468077421188354, "sampling/importance_sampling_ratio/min": 0.7389934659004211, "sampling/sampling_logp_difference/max": 0.35080957412719727, "sampling/sampling_logp_difference/mean": 0.024117249995470047, "step": 473, "step_time": 40.55488790899108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.38789480924606323, "epoch": 0.948, "frac_reward_zero_std": 0.0, "grad_norm": 1.0140682458877563, "kl": 0.00960574857890606, "learning_rate": 2.839021529956388e-06, "loss": 0.2122, "num_tokens": 2650117.0, "reward": 0.20624999701976776, "reward_std": 0.3101291358470917, "rewards/reward_func/mean": 0.20624999701976776, "rewards/reward_func/std": 0.47850772738456726, "sampling/importance_sampling_ratio/max": 1.7706210613250732, "sampling/importance_sampling_ratio/mean": 0.8846872448921204, "sampling/importance_sampling_ratio/min": 0.47399798035621643, "sampling/sampling_logp_difference/max": 0.4928736686706543, "sampling/sampling_logp_difference/mean": 0.02432020753622055, "step": 474, "step_time": 47.36277217399038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.3783317804336548, "epoch": 0.95, "frac_reward_zero_std": 0.0, "grad_norm": 1.0589309930801392, "kl": 0.00788644514977932, "learning_rate": 2.8309976727269335e-06, "loss": 0.2562, "num_tokens": 2655768.0, "reward": 0.057499997317790985, "reward_std": 0.2759822607040405, "rewards/reward_func/mean": 0.057499997317790985, "rewards/reward_func/std": 0.3830609917640686, "sampling/importance_sampling_ratio/max": 1.4918638467788696, "sampling/importance_sampling_ratio/mean": 0.9599424600601196, "sampling/importance_sampling_ratio/min": 0.4261733293533325, "sampling/sampling_logp_difference/max": 0.47033143043518066, "sampling/sampling_logp_difference/mean": 0.021787922829389572, "step": 475, "step_time": 40.27885495001101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.339898943901062, "epoch": 0.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.971348226070404, "kl": 0.00876643043011427, "learning_rate": 2.8229703434885165e-06, "loss": -0.0228, "num_tokens": 2661491.0, "reward": 0.44875001907348633, "reward_std": 0.6022260189056396, "rewards/reward_func/mean": 0.44875001907348633, "rewards/reward_func/std": 0.5575568675994873, "sampling/importance_sampling_ratio/max": 1.1516081094741821, "sampling/importance_sampling_ratio/mean": 0.9316661357879639, "sampling/importance_sampling_ratio/min": 0.5835716724395752, "sampling/sampling_logp_difference/max": 0.4196091890335083, "sampling/sampling_logp_difference/mean": 0.021482866257429123, "step": 476, "step_time": 32.179777643003035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.33622926473617554, "epoch": 0.954, "frac_reward_zero_std": 0.0, "grad_norm": 0.885502815246582, "kl": 0.006615090649574995, "learning_rate": 2.814939626444023e-06, "loss": 0.072, "num_tokens": 2666869.0, "reward": 0.07125000655651093, "reward_std": 0.28238025307655334, "rewards/reward_func/mean": 0.07125000655651093, "rewards/reward_func/std": 0.36906978487968445, "sampling/importance_sampling_ratio/max": 1.387723684310913, "sampling/importance_sampling_ratio/mean": 0.8648278117179871, "sampling/importance_sampling_ratio/min": 0.5371811985969543, "sampling/sampling_logp_difference/max": 0.3530764579772949, "sampling/sampling_logp_difference/mean": 0.025837857276201248, "step": 477, "step_time": 35.64491564700438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3508720397949219, "epoch": 0.956, "frac_reward_zero_std": 0.0, "grad_norm": 1.0213587284088135, "kl": 0.006008337251842022, "learning_rate": 2.8069056058318754e-06, "loss": -0.0749, "num_tokens": 2672914.0, "reward": 0.19750000536441803, "reward_std": 0.33999553322792053, "rewards/reward_func/mean": 0.19750000536441803, "rewards/reward_func/std": 0.49813222885131836, "sampling/importance_sampling_ratio/max": 1.4768438339233398, "sampling/importance_sampling_ratio/mean": 0.9648414850234985, "sampling/importance_sampling_ratio/min": 0.6780772805213928, "sampling/sampling_logp_difference/max": 0.6728348731994629, "sampling/sampling_logp_difference/mean": 0.021531637758016586, "step": 478, "step_time": 44.70471254599397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 45.625, "completions/mean_terminated_length": 45.625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.332109659910202, "epoch": 0.958, "frac_reward_zero_std": 0.0, "grad_norm": 2.0038514137268066, "kl": 0.00846945308148861, "learning_rate": 2.7988683659251475e-06, "loss": 0.1922, "num_tokens": 2678156.0, "reward": -0.04124999791383743, "reward_std": 0.022555213421583176, "rewards/reward_func/mean": -0.04124999791383743, "rewards/reward_func/std": 0.024164613336324692, "sampling/importance_sampling_ratio/max": 1.750364065170288, "sampling/importance_sampling_ratio/mean": 1.0368093252182007, "sampling/importance_sampling_ratio/min": 0.6043331027030945, "sampling/sampling_logp_difference/max": 0.3436398506164551, "sampling/sampling_logp_difference/mean": 0.02278842404484749, "step": 479, "step_time": 35.97302589699393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.3682764768600464, "epoch": 0.96, "frac_reward_zero_std": 0.0, "grad_norm": 1.2075387239456177, "kl": 0.005664853844791651, "learning_rate": 2.7908279910306834e-06, "loss": -0.0507, "num_tokens": 2684319.0, "reward": -0.03999999910593033, "reward_std": 0.03759898990392685, "rewards/reward_func/mean": -0.03999999910593033, "rewards/reward_func/std": 0.03545621037483215, "sampling/importance_sampling_ratio/max": 1.4793400764465332, "sampling/importance_sampling_ratio/mean": 0.949470043182373, "sampling/importance_sampling_ratio/min": 0.7095850110054016, "sampling/sampling_logp_difference/max": 0.7661590576171875, "sampling/sampling_logp_difference/mean": 0.022031132131814957, "step": 480, "step_time": 48.977692065993324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3368332087993622, "epoch": 0.962, "frac_reward_zero_std": 0.0, "grad_norm": 1.0318005084991455, "kl": 0.011773078702390194, "learning_rate": 2.7827845654882112e-06, "loss": -0.211, "num_tokens": 2689871.0, "reward": 0.03875000774860382, "reward_std": 0.3096115291118622, "rewards/reward_func/mean": 0.03875000774860382, "rewards/reward_func/std": 0.39116814732551575, "sampling/importance_sampling_ratio/max": 1.4918369054794312, "sampling/importance_sampling_ratio/mean": 0.9156639575958252, "sampling/importance_sampling_ratio/min": 0.4481692910194397, "sampling/sampling_logp_difference/max": 0.3614964485168457, "sampling/sampling_logp_difference/mean": 0.020262327045202255, "step": 481, "step_time": 47.9836272290122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.3230361342430115, "epoch": 0.964, "frac_reward_zero_std": 0.0, "grad_norm": 1.032042384147644, "kl": 0.0072667477652430534, "learning_rate": 2.7747381736694573e-06, "loss": 0.133, "num_tokens": 2694882.0, "reward": 0.45750004053115845, "reward_std": 0.5040745139122009, "rewards/reward_func/mean": 0.45750004053115845, "rewards/reward_func/std": 0.5405222773551941, "sampling/importance_sampling_ratio/max": 2.2135207653045654, "sampling/importance_sampling_ratio/mean": 0.9904155135154724, "sampling/importance_sampling_ratio/min": 0.5763868689537048, "sampling/sampling_logp_difference/max": 0.2665048837661743, "sampling/sampling_logp_difference/mean": 0.018102280795574188, "step": 482, "step_time": 34.09285955999803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.375, "completions/mean_terminated_length": 53.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.3808034658432007, "epoch": 0.966, "frac_reward_zero_std": 0.0, "grad_norm": 1.3194098472595215, "kl": 0.006681244820356369, "learning_rate": 2.766688899977266e-06, "loss": 0.0493, "num_tokens": 2700228.0, "reward": -0.06499999761581421, "reward_std": 0.03923674300312996, "rewards/reward_func/mean": -0.06499999761581421, "rewards/reward_func/std": 0.03927921876311302, "sampling/importance_sampling_ratio/max": 1.665113925933838, "sampling/importance_sampling_ratio/mean": 1.090261459350586, "sampling/importance_sampling_ratio/min": 0.6695654392242432, "sampling/sampling_logp_difference/max": 0.3100537061691284, "sampling/sampling_logp_difference/mean": 0.02159346640110016, "step": 483, "step_time": 43.91810172899568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3179956078529358, "epoch": 0.968, "frac_reward_zero_std": 0.0, "grad_norm": 0.6254982352256775, "kl": 0.007635701447725296, "learning_rate": 2.7586368288447094e-06, "loss": -0.1305, "num_tokens": 2705767.0, "reward": 0.19750000536441803, "reward_std": 0.350005179643631, "rewards/reward_func/mean": 0.19750000536441803, "rewards/reward_func/std": 0.4900656044483185, "sampling/importance_sampling_ratio/max": 1.0408016443252563, "sampling/importance_sampling_ratio/mean": 0.8733296990394592, "sampling/importance_sampling_ratio/min": 0.4849831461906433, "sampling/sampling_logp_difference/max": 0.32492589950561523, "sampling/sampling_logp_difference/mean": 0.01731083169579506, "step": 484, "step_time": 43.16735061899817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 49.625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.32685625553131104, "epoch": 0.97, "frac_reward_zero_std": 0.0, "grad_norm": 0.9807037115097046, "kl": 0.009544767439365387, "learning_rate": 2.750582044734203e-06, "loss": 0.1304, "num_tokens": 2711361.0, "reward": 0.05624999478459358, "reward_std": 0.28626444935798645, "rewards/reward_func/mean": 0.05624999478459358, "rewards/reward_func/std": 0.38250818848609924, "sampling/importance_sampling_ratio/max": 1.4526336193084717, "sampling/importance_sampling_ratio/mean": 1.1034681797027588, "sampling/importance_sampling_ratio/min": 0.822212815284729, "sampling/sampling_logp_difference/max": 0.3479280471801758, "sampling/sampling_logp_difference/mean": 0.019866865128278732, "step": 485, "step_time": 44.08477608699468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 48.625, "completions/mean_terminated_length": 48.625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.34071382880210876, "epoch": 0.972, "frac_reward_zero_std": 0.0, "grad_norm": 1.1414618492126465, "kl": 0.008361655287444592, "learning_rate": 2.7425246321366205e-06, "loss": -0.0366, "num_tokens": 2717584.0, "reward": 0.1925000101327896, "reward_std": 0.32843348383903503, "rewards/reward_func/mean": 0.1925000101327896, "rewards/reward_func/std": 0.49268218874931335, "sampling/importance_sampling_ratio/max": 1.5116984844207764, "sampling/importance_sampling_ratio/mean": 1.003057599067688, "sampling/importance_sampling_ratio/min": 0.5172320604324341, "sampling/sampling_logp_difference/max": 0.426800012588501, "sampling/sampling_logp_difference/mean": 0.019641123712062836, "step": 486, "step_time": 47.363303733000066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.625, "completions/mean_terminated_length": 52.625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.33557814359664917, "epoch": 0.974, "frac_reward_zero_std": 0.0, "grad_norm": 1.0366374254226685, "kl": 0.006426879204809666, "learning_rate": 2.7344646755704078e-06, "loss": 0.0267, "num_tokens": 2723150.0, "reward": 0.23000001907348633, "reward_std": 0.5069743394851685, "rewards/reward_func/mean": 0.23000001907348633, "rewards/reward_func/std": 0.46940696239471436, "sampling/importance_sampling_ratio/max": 1.188085675239563, "sampling/importance_sampling_ratio/mean": 0.9919371604919434, "sampling/importance_sampling_ratio/min": 0.6119289994239807, "sampling/sampling_logp_difference/max": 0.3435969352722168, "sampling/sampling_logp_difference/mean": 0.021727345883846283, "step": 487, "step_time": 35.52652668301016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.41280990839004517, "epoch": 0.976, "frac_reward_zero_std": 0.0, "grad_norm": 1.0274547338485718, "kl": 0.008126229047775269, "learning_rate": 2.726402259580695e-06, "loss": -0.1581, "num_tokens": 2728807.0, "reward": 0.1850000023841858, "reward_std": 0.32565683126449585, "rewards/reward_func/mean": 0.1850000023841858, "rewards/reward_func/std": 0.47698459029197693, "sampling/importance_sampling_ratio/max": 1.506007194519043, "sampling/importance_sampling_ratio/mean": 1.029561996459961, "sampling/importance_sampling_ratio/min": 0.42956024408340454, "sampling/sampling_logp_difference/max": 1.2129210233688354, "sampling/sampling_logp_difference/mean": 0.024751894176006317, "step": 488, "step_time": 47.60045914600778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.3394836187362671, "epoch": 0.978, "frac_reward_zero_std": 0.0, "grad_norm": 1.06610107421875, "kl": 0.008799143135547638, "learning_rate": 2.71833746873841e-06, "loss": 0.077, "num_tokens": 2733871.0, "reward": 0.14749997854232788, "reward_std": 0.35974743962287903, "rewards/reward_func/mean": 0.14749997854232788, "rewards/reward_func/std": 0.4975583255290985, "sampling/importance_sampling_ratio/max": 1.507678747177124, "sampling/importance_sampling_ratio/mean": 0.9904993772506714, "sampling/importance_sampling_ratio/min": 0.5135411024093628, "sampling/sampling_logp_difference/max": 0.5306490659713745, "sampling/sampling_logp_difference/mean": 0.02448309026658535, "step": 489, "step_time": 48.45194135600468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.33108896017074585, "epoch": 0.98, "frac_reward_zero_std": 0.0, "grad_norm": 1.3293511867523193, "kl": 0.009081996977329254, "learning_rate": 2.7102703876393942e-06, "loss": 0.2769, "num_tokens": 2738881.0, "reward": 0.33375000953674316, "reward_std": 0.5720977783203125, "rewards/reward_func/mean": 0.33375000953674316, "rewards/reward_func/std": 0.5522664189338684, "sampling/importance_sampling_ratio/max": 1.976300835609436, "sampling/importance_sampling_ratio/mean": 1.1032793521881104, "sampling/importance_sampling_ratio/min": 0.6139009594917297, "sampling/sampling_logp_difference/max": 0.35714292526245117, "sampling/sampling_logp_difference/mean": 0.02184763178229332, "step": 490, "step_time": 31.03724041300302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 54.125, "completions/mean_terminated_length": 54.125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3556636571884155, "epoch": 0.982, "frac_reward_zero_std": 0.0, "grad_norm": 1.3523696660995483, "kl": 0.006467908620834351, "learning_rate": 2.702201100903511e-06, "loss": -0.1537, "num_tokens": 2745030.0, "reward": 0.35624998807907104, "reward_std": 0.5549872517585754, "rewards/reward_func/mean": 0.35624998807907104, "rewards/reward_func/std": 0.5310620069503784, "sampling/importance_sampling_ratio/max": 2.1493470668792725, "sampling/importance_sampling_ratio/mean": 1.0585157871246338, "sampling/importance_sampling_ratio/min": 0.4366893470287323, "sampling/sampling_logp_difference/max": 0.32509946823120117, "sampling/sampling_logp_difference/mean": 0.020754611119627953, "step": 491, "step_time": 38.667722318990855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 41.875, "completions/mean_terminated_length": 41.875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.32912951707839966, "epoch": 0.984, "frac_reward_zero_std": 0.0, "grad_norm": 1.4372652769088745, "kl": 0.010952703654766083, "learning_rate": 2.694129693173759e-06, "loss": 0.2391, "num_tokens": 2750228.0, "reward": 0.4750000238418579, "reward_std": 0.5136302709579468, "rewards/reward_func/mean": 0.4750000238418579, "rewards/reward_func/std": 0.5536631345748901, "sampling/importance_sampling_ratio/max": 2.09639310836792, "sampling/importance_sampling_ratio/mean": 1.0164697170257568, "sampling/importance_sampling_ratio/min": 0.6715074181556702, "sampling/sampling_logp_difference/max": 0.34882307052612305, "sampling/sampling_logp_difference/mean": 0.021729137748479843, "step": 492, "step_time": 27.27960569599236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.3119977116584778, "epoch": 0.986, "frac_reward_zero_std": 0.0, "grad_norm": 1.0905507802963257, "kl": 0.006891919299960136, "learning_rate": 2.6860562491153854e-06, "loss": 0.0112, "num_tokens": 2755740.0, "reward": 0.3087500035762787, "reward_std": 0.25716888904571533, "rewards/reward_func/mean": 0.3087500035762787, "rewards/reward_func/std": 0.4853404462337494, "sampling/importance_sampling_ratio/max": 1.7588069438934326, "sampling/importance_sampling_ratio/mean": 1.2490370273590088, "sampling/importance_sampling_ratio/min": 0.9198641180992126, "sampling/sampling_logp_difference/max": 0.29522716999053955, "sampling/sampling_logp_difference/mean": 0.01567711867392063, "step": 493, "step_time": 37.00472452600661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 47.625, "completions/mean_terminated_length": 47.625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.3635869324207306, "epoch": 0.988, "frac_reward_zero_std": 0.0, "grad_norm": 1.052552342414856, "kl": 0.013993756845593452, "learning_rate": 2.6779808534149986e-06, "loss": -0.154, "num_tokens": 2761592.0, "reward": 0.32875001430511475, "reward_std": 0.5612891316413879, "rewards/reward_func/mean": 0.32875001430511475, "rewards/reward_func/std": 0.5381035208702087, "sampling/importance_sampling_ratio/max": 1.603551983833313, "sampling/importance_sampling_ratio/mean": 0.9373511075973511, "sampling/importance_sampling_ratio/min": 0.4633772671222687, "sampling/sampling_logp_difference/max": 0.5257512331008911, "sampling/sampling_logp_difference/mean": 0.02118011564016342, "step": 494, "step_time": 39.871601100996486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.36310869455337524, "epoch": 0.99, "frac_reward_zero_std": 0.0, "grad_norm": 2.6600749492645264, "kl": 0.015285097993910313, "learning_rate": 2.6699035907796796e-06, "loss": 0.1971, "num_tokens": 2767247.0, "reward": 0.08749999850988388, "reward_std": 0.2818909287452698, "rewards/reward_func/mean": 0.08749999850988388, "rewards/reward_func/std": 0.3707424998283386, "sampling/importance_sampling_ratio/max": 2.1048922538757324, "sampling/importance_sampling_ratio/mean": 0.9266895055770874, "sampling/importance_sampling_ratio/min": 0.27929919958114624, "sampling/sampling_logp_difference/max": 1.3129551410675049, "sampling/sampling_logp_difference/mean": 0.03234206885099411, "step": 495, "step_time": 41.51073724999151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.3388064503669739, "epoch": 0.992, "frac_reward_zero_std": 0.0, "grad_norm": 1.291845679283142, "kl": 0.00741872563958168, "learning_rate": 2.6618245459360896e-06, "loss": -0.0175, "num_tokens": 2772579.0, "reward": 0.46875, "reward_std": 0.5083686113357544, "rewards/reward_func/mean": 0.46875, "rewards/reward_func/std": 0.5500503182411194, "sampling/importance_sampling_ratio/max": 2.055424451828003, "sampling/importance_sampling_ratio/mean": 1.1403716802597046, "sampling/importance_sampling_ratio/min": 0.5961279273033142, "sampling/sampling_logp_difference/max": 0.36696600914001465, "sampling/sampling_logp_difference/mean": 0.02003820426762104, "step": 496, "step_time": 29.742875528987497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.3439745306968689, "epoch": 0.994, "frac_reward_zero_std": 0.0, "grad_norm": 0.8372857570648193, "kl": 0.004724285565316677, "learning_rate": 2.6537438036295876e-06, "loss": -0.1035, "num_tokens": 2778589.0, "reward": 0.5837500095367432, "reward_std": 0.5680229663848877, "rewards/reward_func/mean": 0.5837500095367432, "rewards/reward_func/std": 0.5513085126876831, "sampling/importance_sampling_ratio/max": 1.4481302499771118, "sampling/importance_sampling_ratio/mean": 0.8684442043304443, "sampling/importance_sampling_ratio/min": 0.5821914672851562, "sampling/sampling_logp_difference/max": 0.3571445345878601, "sampling/sampling_logp_difference/mean": 0.019016366451978683, "step": 497, "step_time": 33.911930337999365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 49.625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.3264671564102173, "epoch": 0.996, "frac_reward_zero_std": 0.0, "grad_norm": 3.3532819747924805, "kl": 0.009917155839502811, "learning_rate": 2.6456614486233344e-06, "loss": 0.0878, "num_tokens": 2783355.0, "reward": 0.3425000309944153, "reward_std": 0.5395705103874207, "rewards/reward_func/mean": 0.3425000309944153, "rewards/reward_func/std": 0.5232794880867004, "sampling/importance_sampling_ratio/max": 1.4326062202453613, "sampling/importance_sampling_ratio/mean": 0.7578821182250977, "sampling/importance_sampling_ratio/min": 0.3315609395503998, "sampling/sampling_logp_difference/max": 0.3385963439941406, "sampling/sampling_logp_difference/mean": 0.0261401254683733, "step": 498, "step_time": 29.913790080012404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 56.375, "completions/mean_terminated_length": 56.375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.33865243196487427, "epoch": 0.998, "frac_reward_zero_std": 0.0, "grad_norm": 0.7589961886405945, "kl": 0.008822058327496052, "learning_rate": 2.6375775656974124e-06, "loss": 0.093, "num_tokens": 2788575.0, "reward": 0.027500003576278687, "reward_std": 0.31530144810676575, "rewards/reward_func/mean": 0.027500003576278687, "rewards/reward_func/std": 0.3917998969554901, "sampling/importance_sampling_ratio/max": 1.506130337715149, "sampling/importance_sampling_ratio/mean": 0.8838839530944824, "sampling/importance_sampling_ratio/min": 0.4121124744415283, "sampling/sampling_logp_difference/max": 0.433666467666626, "sampling/sampling_logp_difference/mean": 0.02140984870493412, "step": 499, "step_time": 41.98196549799468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.33830559253692627, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 1.053360939025879, "kl": 0.007612613961100578, "learning_rate": 2.6294922396479263e-06, "loss": -0.0695, "num_tokens": 2794184.0, "reward": 0.33249998092651367, "reward_std": 0.5619164705276489, "rewards/reward_func/mean": 0.33249998092651367, "rewards/reward_func/std": 0.5373945832252502, "sampling/importance_sampling_ratio/max": 1.1783527135849, "sampling/importance_sampling_ratio/mean": 0.699951171875, "sampling/importance_sampling_ratio/min": 0.35024145245552063, "sampling/sampling_logp_difference/max": 0.5603160858154297, "sampling/sampling_logp_difference/mean": 0.027539845556020737, "step": 500, "step_time": 38.46154696500162 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 2794184, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }