{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.018, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 437.21875, "completions/mean_terminated_length": 437.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.4114690534770489, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.1938793659210205, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0263, "num_tokens": 42719.0, "reward": -0.3412362039089203, "reward_std": 0.9188774228096008, "rewards/rollout_reward_func/mean": -0.3412362039089203, "rewards/rollout_reward_func/std": 0.932715654373169, "sampling/importance_sampling_ratio/max": 1.6458008289337158, "sampling/importance_sampling_ratio/mean": 1.016780972480774, "sampling/importance_sampling_ratio/min": 0.12228654325008392, "sampling/sampling_logp_difference/max": 1.0894057750701904, "sampling/sampling_logp_difference/mean": 0.06957238912582397, "step": 1, "step_time": 11.261524181000027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1678.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 472.09375, "completions/mean_terminated_length": 472.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.36337339878082275, "epoch": 4e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.1965783834457397, "kl": 0.0, "learning_rate": 1e-06, "loss": -0.0304, "num_tokens": 89457.0, "reward": -0.15283264219760895, "reward_std": 0.9929161667823792, "rewards/rollout_reward_func/mean": -0.15283264219760895, "rewards/rollout_reward_func/std": 0.982594907283783, "sampling/importance_sampling_ratio/max": 1.6136984825134277, "sampling/importance_sampling_ratio/mean": 0.952701985836029, "sampling/importance_sampling_ratio/min": 0.2759953737258911, "sampling/sampling_logp_difference/max": 1.3739876747131348, "sampling/sampling_logp_difference/mean": 0.05666077136993408, "step": 2, "step_time": 10.465716911999948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 355.40625, "completions/mean_terminated_length": 355.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.32282101921737194, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.053934097290039, "kl": 0.01393460670078639, "learning_rate": 2e-06, "loss": -0.0365, "num_tokens": 129582.0, "reward": -0.12245626747608185, "reward_std": 0.9689830541610718, "rewards/rollout_reward_func/mean": -0.12245626747608185, "rewards/rollout_reward_func/std": 1.0057955980300903, "sampling/importance_sampling_ratio/max": 2.9928359985351562, "sampling/importance_sampling_ratio/mean": 1.0525281429290771, "sampling/importance_sampling_ratio/min": 0.3391573131084442, "sampling/sampling_logp_difference/max": 1.0849307775497437, "sampling/sampling_logp_difference/mean": 0.04470321908593178, "step": 3, "step_time": 9.528464489999806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 372.4375, "completions/mean_terminated_length": 372.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.43315502256155014, "epoch": 8e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.7640054821968079, "kl": 0.009229308197973296, "learning_rate": 2.9999999999999997e-06, "loss": -0.0074, "num_tokens": 171862.0, "reward": -0.42852091789245605, "reward_std": 0.9473088979721069, "rewards/rollout_reward_func/mean": -0.42852091789245605, "rewards/rollout_reward_func/std": 0.908109724521637, "sampling/importance_sampling_ratio/max": 2.000516414642334, "sampling/importance_sampling_ratio/mean": 1.03951096534729, "sampling/importance_sampling_ratio/min": 0.5997233390808105, "sampling/sampling_logp_difference/max": 0.6001281142234802, "sampling/sampling_logp_difference/mean": 0.05822616070508957, "step": 4, "step_time": 10.755143347000057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 429.8125, "completions/mean_terminated_length": 429.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.37449060939252377, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 1.9644466638565063, "kl": 0.007258172292495146, "learning_rate": 4e-06, "loss": -0.0437, "num_tokens": 215816.0, "reward": -0.3692479133605957, "reward_std": 0.9574089646339417, "rewards/rollout_reward_func/mean": -0.3692479133605957, "rewards/rollout_reward_func/std": 0.9379967451095581, "sampling/importance_sampling_ratio/max": 1.9159421920776367, "sampling/importance_sampling_ratio/mean": 0.9078373908996582, "sampling/importance_sampling_ratio/min": 0.4329147934913635, "sampling/sampling_logp_difference/max": 0.819692850112915, "sampling/sampling_logp_difference/mean": 0.062477223575115204, "step": 5, "step_time": 12.814494780999894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 460.375, "completions/mean_terminated_length": 460.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.45950571075081825, "epoch": 0.00012, "frac_reward_zero_std": 0.0, "grad_norm": 1.122826099395752, "kl": 0.011083001125371084, "learning_rate": 4.9999999999999996e-06, "loss": 0.0069, "num_tokens": 261152.0, "reward": -0.4564189612865448, "reward_std": 0.8680033683776855, "rewards/rollout_reward_func/mean": -0.4564189612865448, "rewards/rollout_reward_func/std": 0.8680322170257568, "sampling/importance_sampling_ratio/max": 1.5604106187820435, "sampling/importance_sampling_ratio/mean": 0.9349099397659302, "sampling/importance_sampling_ratio/min": 0.4502843916416168, "sampling/sampling_logp_difference/max": 0.7954697608947754, "sampling/sampling_logp_difference/mean": 0.06650318205356598, "step": 6, "step_time": 10.648359982999864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 423.46875, "completions/mean_terminated_length": 423.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.4260463248938322, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 1.0482391119003296, "kl": 0.013913503331423271, "learning_rate": 5.999999999999999e-06, "loss": -0.0241, "num_tokens": 304500.0, "reward": -0.6236814260482788, "reward_std": 0.7732925415039062, "rewards/rollout_reward_func/mean": -0.6236814260482788, "rewards/rollout_reward_func/std": 0.745121419429779, "sampling/importance_sampling_ratio/max": 1.4707562923431396, "sampling/importance_sampling_ratio/mean": 0.9661546349525452, "sampling/importance_sampling_ratio/min": 0.41699379682540894, "sampling/sampling_logp_difference/max": 0.8802392482757568, "sampling/sampling_logp_difference/mean": 0.06739896535873413, "step": 7, "step_time": 10.25277962899986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 426.40625, "completions/mean_terminated_length": 426.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.37137986719608307, "epoch": 0.00016, "frac_reward_zero_std": 0.0, "grad_norm": 0.8304545283317566, "kl": 0.011032106373022543, "learning_rate": 7e-06, "loss": -0.0162, "num_tokens": 349113.0, "reward": -0.24587996304035187, "reward_std": 0.9935742616653442, "rewards/rollout_reward_func/mean": -0.24587996304035187, "rewards/rollout_reward_func/std": 0.9805474877357483, "sampling/importance_sampling_ratio/max": 1.6930813789367676, "sampling/importance_sampling_ratio/mean": 0.9787839651107788, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8561748266220093, "sampling/sampling_logp_difference/mean": 0.0570240244269371, "step": 8, "step_time": 12.6045493419997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.31529812701046467, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 0.9217535853385925, "kl": 0.012996508652577177, "learning_rate": 8e-06, "loss": -0.0285, "num_tokens": 384354.0, "reward": -0.3458622694015503, "reward_std": 0.749879002571106, "rewards/rollout_reward_func/mean": -0.3458622694015503, "rewards/rollout_reward_func/std": 0.9346888661384583, "sampling/importance_sampling_ratio/max": 1.4870073795318604, "sampling/importance_sampling_ratio/mean": 0.9612379670143127, "sampling/importance_sampling_ratio/min": 0.19580239057540894, "sampling/sampling_logp_difference/max": 1.6306716203689575, "sampling/sampling_logp_difference/mean": 0.05743882805109024, "step": 9, "step_time": 9.882065605000207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 358.09375, "completions/mean_terminated_length": 358.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.32163661159574986, "epoch": 0.0002, "frac_reward_zero_std": 0.0, "grad_norm": 1.1543048620224, "kl": 0.028548938949825242, "learning_rate": 8.999999999999999e-06, "loss": -0.025, "num_tokens": 425132.0, "reward": 0.05986635386943817, "reward_std": 0.958491325378418, "rewards/rollout_reward_func/mean": 0.05986635386943817, "rewards/rollout_reward_func/std": 0.9791619181632996, "sampling/importance_sampling_ratio/max": 1.758381962776184, "sampling/importance_sampling_ratio/mean": 1.0709290504455566, "sampling/importance_sampling_ratio/min": 0.7742878198623657, "sampling/sampling_logp_difference/max": 0.4754340648651123, "sampling/sampling_logp_difference/mean": 0.040690332651138306, "step": 10, "step_time": 10.256709248999869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2244.0, "completions/max_terminated_length": 2244.0, "completions/mean_length": 611.375, "completions/mean_terminated_length": 611.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3070964999496937, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 3.7030222415924072, "kl": 2.9806297700852156, "learning_rate": 9.999999999999999e-06, "loss": -0.0144, "num_tokens": 475609.0, "reward": 0.12757781147956848, "reward_std": 0.8701133728027344, "rewards/rollout_reward_func/mean": 0.12757781147956848, "rewards/rollout_reward_func/std": 1.0051034688949585, "sampling/importance_sampling_ratio/max": 2.501253604888916, "sampling/importance_sampling_ratio/mean": 1.0324102640151978, "sampling/importance_sampling_ratio/min": 0.41789400577545166, "sampling/sampling_logp_difference/max": 0.951789379119873, "sampling/sampling_logp_difference/mean": 0.0470191165804863, "step": 11, "step_time": 12.220465992000754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2606.0, "completions/max_terminated_length": 2606.0, "completions/mean_length": 378.1875, "completions/mean_terminated_length": 378.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.31014328822493553, "epoch": 0.00024, "frac_reward_zero_std": 0.0, "grad_norm": 0.9326220154762268, "kl": 0.06310290546389297, "learning_rate": 1.0999999999999998e-05, "loss": -0.0474, "num_tokens": 516457.0, "reward": -0.18268364667892456, "reward_std": 1.0392582416534424, "rewards/rollout_reward_func/mean": -0.18268364667892456, "rewards/rollout_reward_func/std": 0.9940189719200134, "sampling/importance_sampling_ratio/max": 2.069570541381836, "sampling/importance_sampling_ratio/mean": 0.9633287191390991, "sampling/importance_sampling_ratio/min": 0.2533480226993561, "sampling/sampling_logp_difference/max": 1.0506527423858643, "sampling/sampling_logp_difference/mean": 0.06972773373126984, "step": 12, "step_time": 13.174191238000276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 465.0, "completions/mean_terminated_length": 465.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2819689605385065, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 0.8611482977867126, "kl": 0.08543028263375163, "learning_rate": 1.1999999999999999e-05, "loss": -0.0268, "num_tokens": 560711.0, "reward": -0.14805778861045837, "reward_std": 1.0241137742996216, "rewards/rollout_reward_func/mean": -0.14805778861045837, "rewards/rollout_reward_func/std": 0.9799343347549438, "sampling/importance_sampling_ratio/max": 1.6586482524871826, "sampling/importance_sampling_ratio/mean": 0.9434561133384705, "sampling/importance_sampling_ratio/min": 0.4459451735019684, "sampling/sampling_logp_difference/max": 0.7238948345184326, "sampling/sampling_logp_difference/mean": 0.043918900191783905, "step": 13, "step_time": 11.182554404000257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 299.8125, "completions/mean_terminated_length": 299.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2716455264016986, "epoch": 0.00028, "frac_reward_zero_std": 0.0, "grad_norm": 0.6645467281341553, "kl": 0.08498917146062013, "learning_rate": 1.3e-05, "loss": -0.0287, "num_tokens": 598780.0, "reward": -0.19310343265533447, "reward_std": 0.8885855674743652, "rewards/rollout_reward_func/mean": -0.19310343265533447, "rewards/rollout_reward_func/std": 0.9636799097061157, "sampling/importance_sampling_ratio/max": 1.2432705163955688, "sampling/importance_sampling_ratio/mean": 0.9620025157928467, "sampling/importance_sampling_ratio/min": 0.17014716565608978, "sampling/sampling_logp_difference/max": 1.7711846828460693, "sampling/sampling_logp_difference/mean": 0.051839202642440796, "step": 14, "step_time": 9.325893286999872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2530.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 428.15625, "completions/mean_terminated_length": 428.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1865299199707806, "epoch": 0.0003, "frac_reward_zero_std": 0.0, "grad_norm": 1.390516996383667, "kl": 0.08118426031433046, "learning_rate": 1.4e-05, "loss": -0.0252, "num_tokens": 641121.0, "reward": 0.005183085799217224, "reward_std": 1.0140929222106934, "rewards/rollout_reward_func/mean": 0.005183085799217224, "rewards/rollout_reward_func/std": 0.9768301844596863, "sampling/importance_sampling_ratio/max": 2.3301897048950195, "sampling/importance_sampling_ratio/mean": 1.103709101676941, "sampling/importance_sampling_ratio/min": 0.5039141774177551, "sampling/sampling_logp_difference/max": 0.8443331718444824, "sampling/sampling_logp_difference/mean": 0.03696654736995697, "step": 15, "step_time": 12.787788593999949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 547.3125, "completions/mean_terminated_length": 547.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2326231673359871, "epoch": 0.00032, "frac_reward_zero_std": 0.0, "grad_norm": 0.7268860340118408, "kl": 0.11162168486043811, "learning_rate": 1.4999999999999999e-05, "loss": -0.0618, "num_tokens": 688683.0, "reward": -0.029961444437503815, "reward_std": 0.9671661257743835, "rewards/rollout_reward_func/mean": -0.029961444437503815, "rewards/rollout_reward_func/std": 0.9953790903091431, "sampling/importance_sampling_ratio/max": 1.393131971359253, "sampling/importance_sampling_ratio/mean": 0.9585981369018555, "sampling/importance_sampling_ratio/min": 0.1027221754193306, "sampling/sampling_logp_difference/max": 2.340406894683838, "sampling/sampling_logp_difference/mean": 0.04949421435594559, "step": 16, "step_time": 12.859808910999618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 576.59375, "completions/mean_terminated_length": 576.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.16899114195257425, "epoch": 0.00034, "frac_reward_zero_std": 0.0, "grad_norm": 0.8959416151046753, "kl": 0.20661810354795307, "learning_rate": 1.6e-05, "loss": -0.0037, "num_tokens": 738207.0, "reward": 0.3202114701271057, "reward_std": 0.9959360361099243, "rewards/rollout_reward_func/mean": 0.3202114701271057, "rewards/rollout_reward_func/std": 0.9544781446456909, "sampling/importance_sampling_ratio/max": 1.5079377889633179, "sampling/importance_sampling_ratio/mean": 0.9559791088104248, "sampling/importance_sampling_ratio/min": 0.03567842021584511, "sampling/sampling_logp_difference/max": 3.299084186553955, "sampling/sampling_logp_difference/mean": 0.047464240342378616, "step": 17, "step_time": 10.977360289000217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1809.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 541.21875, "completions/mean_terminated_length": 541.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2226830404251814, "epoch": 0.00036, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171250343322754, "kl": 0.35516780242323875, "learning_rate": 1.7e-05, "loss": -0.0316, "num_tokens": 785241.0, "reward": -0.09258918464183807, "reward_std": 0.9349028468132019, "rewards/rollout_reward_func/mean": -0.09258918464183807, "rewards/rollout_reward_func/std": 0.954598605632782, "sampling/importance_sampling_ratio/max": 1.223116397857666, "sampling/importance_sampling_ratio/mean": 0.82972651720047, "sampling/importance_sampling_ratio/min": 0.1076008528470993, "sampling/sampling_logp_difference/max": 2.222273349761963, "sampling/sampling_logp_difference/mean": 0.07205961644649506, "step": 18, "step_time": 11.482884725999838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 451.03125, "completions/mean_terminated_length": 451.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.22411950305104256, "epoch": 0.00038, "frac_reward_zero_std": 0.0, "grad_norm": 3.3903536796569824, "kl": 1.9247599435038865, "learning_rate": 1.7999999999999997e-05, "loss": -0.0229, "num_tokens": 829127.0, "reward": -0.02561771869659424, "reward_std": 1.0086638927459717, "rewards/rollout_reward_func/mean": -0.02561771869659424, "rewards/rollout_reward_func/std": 0.9920539259910583, "sampling/importance_sampling_ratio/max": 2.795459032058716, "sampling/importance_sampling_ratio/mean": 1.0214674472808838, "sampling/importance_sampling_ratio/min": 0.07295823097229004, "sampling/sampling_logp_difference/max": 2.6179604530334473, "sampling/sampling_logp_difference/mean": 0.09476105868816376, "step": 19, "step_time": 10.109739205000096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 347.59375, "completions/mean_terminated_length": 347.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2989676259458065, "epoch": 0.0004, "frac_reward_zero_std": 0.0, "grad_norm": 0.5463613867759705, "kl": 0.4960377588868141, "learning_rate": 1.8999999999999998e-05, "loss": -0.0335, "num_tokens": 870036.0, "reward": -0.03720325231552124, "reward_std": 0.9819276928901672, "rewards/rollout_reward_func/mean": -0.03720325231552124, "rewards/rollout_reward_func/std": 0.9634839296340942, "sampling/importance_sampling_ratio/max": 2.4953343868255615, "sampling/importance_sampling_ratio/mean": 0.9398384094238281, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.82833731174469, "sampling/sampling_logp_difference/mean": 0.13301682472229004, "step": 20, "step_time": 9.1843802770004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 489.75, "completions/mean_terminated_length": 489.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.20260196272283792, "epoch": 0.00042, "frac_reward_zero_std": 0.0, "grad_norm": 0.9198043346405029, "kl": 0.3196151494048536, "learning_rate": 1.9999999999999998e-05, "loss": -0.0291, "num_tokens": 916118.0, "reward": 0.15434634685516357, "reward_std": 0.9565210342407227, "rewards/rollout_reward_func/mean": 0.15434634685516357, "rewards/rollout_reward_func/std": 0.9480637907981873, "sampling/importance_sampling_ratio/max": 2.011781930923462, "sampling/importance_sampling_ratio/mean": 0.9602175951004028, "sampling/importance_sampling_ratio/min": 0.1728551834821701, "sampling/sampling_logp_difference/max": 1.7481689453125, "sampling/sampling_logp_difference/mean": 0.0657820999622345, "step": 21, "step_time": 10.722510443999681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 400.46875, "completions/mean_terminated_length": 400.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.251527389511466, "epoch": 0.00044, "frac_reward_zero_std": 0.0, "grad_norm": 1.137460470199585, "kl": 0.4977840855717659, "learning_rate": 2.1e-05, "loss": -0.0335, "num_tokens": 958938.0, "reward": 0.062270864844322205, "reward_std": 0.9989160299301147, "rewards/rollout_reward_func/mean": 0.062270864844322205, "rewards/rollout_reward_func/std": 0.9765338897705078, "sampling/importance_sampling_ratio/max": 2.575089693069458, "sampling/importance_sampling_ratio/mean": 0.9019770622253418, "sampling/importance_sampling_ratio/min": 0.15158407390117645, "sampling/sampling_logp_difference/max": 1.8857437372207642, "sampling/sampling_logp_difference/mean": 0.11307962983846664, "step": 22, "step_time": 9.972082896000074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 357.875, "completions/mean_terminated_length": 357.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.27321060933172703, "epoch": 0.00046, "frac_reward_zero_std": 0.0, "grad_norm": 0.9042027592658997, "kl": 1.7196196634322405, "learning_rate": 2.1999999999999996e-05, "loss": -0.0319, "num_tokens": 1000891.0, "reward": -0.15283489227294922, "reward_std": 0.8902783989906311, "rewards/rollout_reward_func/mean": -0.15283489227294922, "rewards/rollout_reward_func/std": 0.944137692451477, "sampling/importance_sampling_ratio/max": 2.7453203201293945, "sampling/importance_sampling_ratio/mean": 0.8568977117538452, "sampling/importance_sampling_ratio/min": 0.07576071470975876, "sampling/sampling_logp_difference/max": 2.5627493858337402, "sampling/sampling_logp_difference/mean": 0.1815863698720932, "step": 23, "step_time": 8.937115395000092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 376.28125, "completions/mean_terminated_length": 376.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.23230331670492887, "epoch": 0.00048, "frac_reward_zero_std": 0.0, "grad_norm": 1.042927622795105, "kl": 0.96097163669765, "learning_rate": 2.2999999999999997e-05, "loss": -0.025, "num_tokens": 1042105.0, "reward": 0.4298139214515686, "reward_std": 0.8571569323539734, "rewards/rollout_reward_func/mean": 0.4298139214515686, "rewards/rollout_reward_func/std": 0.840574324131012, "sampling/importance_sampling_ratio/max": 1.6321258544921875, "sampling/importance_sampling_ratio/mean": 0.9084681868553162, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5537645816802979, "sampling/sampling_logp_difference/mean": 0.13259707391262054, "step": 24, "step_time": 9.625167302999898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 402.34375, "completions/mean_terminated_length": 402.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2493597762659192, "epoch": 0.0005, "frac_reward_zero_std": 0.0, "grad_norm": 96.83624267578125, "kl": 7.249650102108717, "learning_rate": 2.3999999999999997e-05, "loss": -0.0204, "num_tokens": 1084834.0, "reward": 0.28377825021743774, "reward_std": 0.9180938601493835, "rewards/rollout_reward_func/mean": 0.28377825021743774, "rewards/rollout_reward_func/std": 0.9525761008262634, "sampling/importance_sampling_ratio/max": 1.4567694664001465, "sampling/importance_sampling_ratio/mean": 0.7921868562698364, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.399549961090088, "sampling/sampling_logp_difference/mean": 0.17179027199745178, "step": 25, "step_time": 9.551806454000143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 401.84375, "completions/mean_terminated_length": 401.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22400148585438728, "epoch": 0.00052, "frac_reward_zero_std": 0.0, "grad_norm": 47.83807373046875, "kl": 8.523684550076723, "learning_rate": 2.4999999999999998e-05, "loss": 0.0017, "num_tokens": 1127906.0, "reward": 0.342040091753006, "reward_std": 0.8707850575447083, "rewards/rollout_reward_func/mean": 0.342040091753006, "rewards/rollout_reward_func/std": 0.8955131769180298, "sampling/importance_sampling_ratio/max": 2.4132585525512695, "sampling/importance_sampling_ratio/mean": 0.8823522329330444, "sampling/importance_sampling_ratio/min": 0.004202887881547213, "sampling/sampling_logp_difference/max": 5.50736141204834, "sampling/sampling_logp_difference/mean": 0.16027212142944336, "step": 26, "step_time": 9.221728538999969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1646.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 455.65625, "completions/mean_terminated_length": 455.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2418615110218525, "epoch": 0.00054, "frac_reward_zero_std": 0.0, "grad_norm": 2.2051784992218018, "kl": 2.8916810490190983, "learning_rate": 2.6e-05, "loss": -0.0086, "num_tokens": 1172366.0, "reward": 0.48706817626953125, "reward_std": 0.7855890989303589, "rewards/rollout_reward_func/mean": 0.48706817626953125, "rewards/rollout_reward_func/std": 0.7683048248291016, "sampling/importance_sampling_ratio/max": 1.8762000799179077, "sampling/importance_sampling_ratio/mean": 0.8224948644638062, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.8920795917510986, "sampling/sampling_logp_difference/mean": 0.13326886296272278, "step": 27, "step_time": 10.550995148999618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1631.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 446.21875, "completions/mean_terminated_length": 446.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.23661512695252895, "epoch": 0.00056, "frac_reward_zero_std": 0.25, "grad_norm": 1.1998122930526733, "kl": 1.1530585922300816, "learning_rate": 2.7e-05, "loss": -0.0226, "num_tokens": 1215836.0, "reward": 0.5608255863189697, "reward_std": 0.6881622076034546, "rewards/rollout_reward_func/mean": 0.5608255863189697, "rewards/rollout_reward_func/std": 0.7997443079948425, "sampling/importance_sampling_ratio/max": 1.8536982536315918, "sampling/importance_sampling_ratio/mean": 0.8735762238502502, "sampling/importance_sampling_ratio/min": 0.14520956575870514, "sampling/sampling_logp_difference/max": 1.9934189319610596, "sampling/sampling_logp_difference/mean": 0.1453382968902588, "step": 28, "step_time": 11.007205905000319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2295.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 655.625, "completions/mean_terminated_length": 655.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3026051614433527, "epoch": 0.00058, "frac_reward_zero_std": 0.0, "grad_norm": 1.4750059843063354, "kl": 0.6000408753752708, "learning_rate": 2.8e-05, "loss": -0.0058, "num_tokens": 1267210.0, "reward": 0.39779043197631836, "reward_std": 0.8791565895080566, "rewards/rollout_reward_func/mean": 0.39779043197631836, "rewards/rollout_reward_func/std": 0.8394127488136292, "sampling/importance_sampling_ratio/max": 2.159236431121826, "sampling/importance_sampling_ratio/mean": 1.0731801986694336, "sampling/importance_sampling_ratio/min": 0.43263739347457886, "sampling/sampling_logp_difference/max": 1.5503740310668945, "sampling/sampling_logp_difference/mean": 0.09001783281564713, "step": 29, "step_time": 13.023432098000285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 430.0625, "completions/mean_terminated_length": 430.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.23896725289523602, "epoch": 0.0006, "frac_reward_zero_std": 0.0, "grad_norm": 1.1569913625717163, "kl": 1.5764386430382729, "learning_rate": 2.9e-05, "loss": -0.0225, "num_tokens": 1310600.0, "reward": 0.5232760906219482, "reward_std": 0.8015413284301758, "rewards/rollout_reward_func/mean": 0.5232760906219482, "rewards/rollout_reward_func/std": 0.8084567785263062, "sampling/importance_sampling_ratio/max": 1.6486430168151855, "sampling/importance_sampling_ratio/mean": 0.9057324528694153, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8287765979766846, "sampling/sampling_logp_difference/mean": 0.11667509377002716, "step": 30, "step_time": 12.724996239999882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 555.71875, "completions/mean_terminated_length": 555.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3168955370783806, "epoch": 0.00062, "frac_reward_zero_std": 0.0, "grad_norm": 0.6839341521263123, "kl": 0.6111752949655056, "learning_rate": 2.9999999999999997e-05, "loss": -0.0254, "num_tokens": 1358696.0, "reward": 0.36887240409851074, "reward_std": 0.8790722489356995, "rewards/rollout_reward_func/mean": 0.36887240409851074, "rewards/rollout_reward_func/std": 0.8677482604980469, "sampling/importance_sampling_ratio/max": 1.741344928741455, "sampling/importance_sampling_ratio/mean": 0.8799818754196167, "sampling/importance_sampling_ratio/min": 0.13823118805885315, "sampling/sampling_logp_difference/max": 1.6196327209472656, "sampling/sampling_logp_difference/mean": 0.11652074009180069, "step": 31, "step_time": 13.467314088999956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 408.4375, "completions/mean_terminated_length": 408.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2844387497752905, "epoch": 0.00064, "frac_reward_zero_std": 0.0, "grad_norm": 14.46434497833252, "kl": 36.2515846719034, "learning_rate": 3.0999999999999995e-05, "loss": 0.0942, "num_tokens": 1401847.0, "reward": 0.4030369222164154, "reward_std": 0.8557415008544922, "rewards/rollout_reward_func/mean": 0.4030369222164154, "rewards/rollout_reward_func/std": 0.9120143055915833, "sampling/importance_sampling_ratio/max": 2.70402193069458, "sampling/importance_sampling_ratio/mean": 0.9604042768478394, "sampling/importance_sampling_ratio/min": 0.1921328604221344, "sampling/sampling_logp_difference/max": 1.624659538269043, "sampling/sampling_logp_difference/mean": 0.09359941631555557, "step": 32, "step_time": 10.173114688000169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 337.28125, "completions/mean_terminated_length": 337.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.33246706426143646, "epoch": 0.00066, "frac_reward_zero_std": 0.0, "grad_norm": 0.464267373085022, "kl": 0.8447359204292297, "learning_rate": 3.2e-05, "loss": -0.0299, "num_tokens": 1440776.0, "reward": 0.37142911553382874, "reward_std": 0.8298568725585938, "rewards/rollout_reward_func/mean": 0.37142911553382874, "rewards/rollout_reward_func/std": 0.9086017608642578, "sampling/importance_sampling_ratio/max": 2.523827314376831, "sampling/importance_sampling_ratio/mean": 0.8613070249557495, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3120331764221191, "sampling/sampling_logp_difference/mean": 0.11714231967926025, "step": 33, "step_time": 9.986435650999965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 525.03125, "completions/mean_terminated_length": 525.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2587994560599327, "epoch": 0.00068, "frac_reward_zero_std": 0.0, "grad_norm": 1.3060849905014038, "kl": 1.1883731111884117, "learning_rate": 3.2999999999999996e-05, "loss": 0.0011, "num_tokens": 1488790.0, "reward": 0.6841987371444702, "reward_std": 0.6876415014266968, "rewards/rollout_reward_func/mean": 0.6841987371444702, "rewards/rollout_reward_func/std": 0.6906946897506714, "sampling/importance_sampling_ratio/max": 2.4355316162109375, "sampling/importance_sampling_ratio/mean": 0.989209771156311, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.318927764892578, "sampling/sampling_logp_difference/mean": 0.10495365411043167, "step": 34, "step_time": 10.200931758999786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 433.84375, "completions/mean_terminated_length": 433.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3632338233292103, "epoch": 0.0007, "frac_reward_zero_std": 0.0, "grad_norm": 1.0153075456619263, "kl": 1.1626598238945007, "learning_rate": 3.4e-05, "loss": -0.0159, "num_tokens": 1533286.0, "reward": 0.24946264922618866, "reward_std": 0.9093924760818481, "rewards/rollout_reward_func/mean": 0.24946264922618866, "rewards/rollout_reward_func/std": 0.945837140083313, "sampling/importance_sampling_ratio/max": 1.8296226263046265, "sampling/importance_sampling_ratio/mean": 0.8834986090660095, "sampling/importance_sampling_ratio/min": 0.08122767508029938, "sampling/sampling_logp_difference/max": 2.218510627746582, "sampling/sampling_logp_difference/mean": 0.11007274687290192, "step": 35, "step_time": 9.73354207899979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3160.0, "completions/max_terminated_length": 3160.0, "completions/mean_length": 601.0, "completions/mean_terminated_length": 601.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2976596225053072, "epoch": 0.00072, "frac_reward_zero_std": 0.0, "grad_norm": 0.8215146064758301, "kl": 1.4974541738629341, "learning_rate": 3.5e-05, "loss": -0.0395, "num_tokens": 1582682.0, "reward": 0.1957320272922516, "reward_std": 0.8224852085113525, "rewards/rollout_reward_func/mean": 0.1957320272922516, "rewards/rollout_reward_func/std": 0.8337967991828918, "sampling/importance_sampling_ratio/max": 1.8884732723236084, "sampling/importance_sampling_ratio/mean": 0.8597487807273865, "sampling/importance_sampling_ratio/min": 0.11200515180826187, "sampling/sampling_logp_difference/max": 1.9908053874969482, "sampling/sampling_logp_difference/mean": 0.0979558527469635, "step": 36, "step_time": 14.591012650999573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1762.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 410.75, "completions/mean_terminated_length": 410.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.27584014274179935, "epoch": 0.00074, "frac_reward_zero_std": 0.0, "grad_norm": 0.7159145474433899, "kl": 1.4426019564270973, "learning_rate": 3.5999999999999994e-05, "loss": -0.0103, "num_tokens": 1626093.0, "reward": 0.398786723613739, "reward_std": 0.840857207775116, "rewards/rollout_reward_func/mean": 0.398786723613739, "rewards/rollout_reward_func/std": 0.831967830657959, "sampling/importance_sampling_ratio/max": 2.4024133682250977, "sampling/importance_sampling_ratio/mean": 0.9687727093696594, "sampling/importance_sampling_ratio/min": 0.12413598597049713, "sampling/sampling_logp_difference/max": 2.002993106842041, "sampling/sampling_logp_difference/mean": 0.11232121288776398, "step": 37, "step_time": 11.253544365999915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 541.28125, "completions/mean_terminated_length": 541.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2844531964510679, "epoch": 0.00076, "frac_reward_zero_std": 0.0, "grad_norm": 0.6665834188461304, "kl": 1.0706911347806454, "learning_rate": 3.7e-05, "loss": -0.0223, "num_tokens": 1673676.0, "reward": 0.07170034199953079, "reward_std": 1.0343921184539795, "rewards/rollout_reward_func/mean": 0.07170034199953079, "rewards/rollout_reward_func/std": 1.004249095916748, "sampling/importance_sampling_ratio/max": 1.8028594255447388, "sampling/importance_sampling_ratio/mean": 0.879553496837616, "sampling/importance_sampling_ratio/min": 0.1998407244682312, "sampling/sampling_logp_difference/max": 1.2881124019622803, "sampling/sampling_logp_difference/mean": 0.09821625053882599, "step": 38, "step_time": 10.510983592999992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2280.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 621.09375, "completions/mean_terminated_length": 621.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.23057018592953682, "epoch": 0.00078, "frac_reward_zero_std": 0.0, "grad_norm": 0.43681204319000244, "kl": 0.8114980757236481, "learning_rate": 3.7999999999999995e-05, "loss": -0.0087, "num_tokens": 1724229.0, "reward": 0.31776782870292664, "reward_std": 0.9051707983016968, "rewards/rollout_reward_func/mean": 0.31776782870292664, "rewards/rollout_reward_func/std": 0.9178617596626282, "sampling/importance_sampling_ratio/max": 2.5716254711151123, "sampling/importance_sampling_ratio/mean": 1.005103349685669, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0591217279434204, "sampling/sampling_logp_difference/mean": 0.06464052200317383, "step": 39, "step_time": 12.099574670000266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1407.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 366.90625, "completions/mean_terminated_length": 366.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.21629616618156433, "epoch": 0.0008, "frac_reward_zero_std": 0.0, "grad_norm": 0.5254016518592834, "kl": 1.0138556025922298, "learning_rate": 3.9e-05, "loss": -0.0056, "num_tokens": 1765385.0, "reward": 0.45431849360466003, "reward_std": 0.7685567736625671, "rewards/rollout_reward_func/mean": 0.45431849360466003, "rewards/rollout_reward_func/std": 0.8085847496986389, "sampling/importance_sampling_ratio/max": 2.0527050495147705, "sampling/importance_sampling_ratio/mean": 1.0394235849380493, "sampling/importance_sampling_ratio/min": 0.09251963347196579, "sampling/sampling_logp_difference/max": 1.8371855020523071, "sampling/sampling_logp_difference/mean": 0.06612863391637802, "step": 40, "step_time": 9.397912222000286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 491.90625, "completions/mean_terminated_length": 491.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22974738478660583, "epoch": 0.00082, "frac_reward_zero_std": 0.0, "grad_norm": 0.611748456954956, "kl": 0.7851045057177544, "learning_rate": 3.9999999999999996e-05, "loss": -0.0029, "num_tokens": 1810170.0, "reward": 0.12092258036136627, "reward_std": 0.9245712757110596, "rewards/rollout_reward_func/mean": 0.12092258036136627, "rewards/rollout_reward_func/std": 0.9339917898178101, "sampling/importance_sampling_ratio/max": 1.5789703130722046, "sampling/importance_sampling_ratio/mean": 0.9680348634719849, "sampling/importance_sampling_ratio/min": 0.38823214173316956, "sampling/sampling_logp_difference/max": 1.128779411315918, "sampling/sampling_logp_difference/mean": 0.05330125615000725, "step": 41, "step_time": 11.23466578200032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 397.8125, "completions/mean_terminated_length": 397.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2839460838586092, "epoch": 0.00084, "frac_reward_zero_std": 0.0, "grad_norm": 0.5547649264335632, "kl": 1.1773334182798862, "learning_rate": 4.1e-05, "loss": -0.0274, "num_tokens": 1851660.0, "reward": 0.09382157027721405, "reward_std": 0.9577667713165283, "rewards/rollout_reward_func/mean": 0.09382157027721405, "rewards/rollout_reward_func/std": 0.9533609747886658, "sampling/importance_sampling_ratio/max": 2.0156667232513428, "sampling/importance_sampling_ratio/mean": 0.9031965732574463, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7616057395935059, "sampling/sampling_logp_difference/mean": 0.07802588492631912, "step": 42, "step_time": 10.500591049999684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 329.75, "completions/mean_terminated_length": 329.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.23919850960373878, "epoch": 0.00086, "frac_reward_zero_std": 0.0, "grad_norm": 0.5154358148574829, "kl": 0.919975645840168, "learning_rate": 4.2e-05, "loss": -0.0114, "num_tokens": 1890997.0, "reward": 0.527825117111206, "reward_std": 0.7903493642807007, "rewards/rollout_reward_func/mean": 0.527825117111206, "rewards/rollout_reward_func/std": 0.7973954081535339, "sampling/importance_sampling_ratio/max": 1.6170350313186646, "sampling/importance_sampling_ratio/mean": 0.9944158792495728, "sampling/importance_sampling_ratio/min": 0.5560446381568909, "sampling/sampling_logp_difference/max": 0.8630304336547852, "sampling/sampling_logp_difference/mean": 0.055822648108005524, "step": 43, "step_time": 9.091097353999885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 467.40625, "completions/mean_terminated_length": 467.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22037405520677567, "epoch": 0.00088, "frac_reward_zero_std": 0.0, "grad_norm": 0.8348227143287659, "kl": 0.7930199503898621, "learning_rate": 4.3e-05, "loss": 0.005, "num_tokens": 1936865.0, "reward": 0.6463057994842529, "reward_std": 0.6354831457138062, "rewards/rollout_reward_func/mean": 0.6463057994842529, "rewards/rollout_reward_func/std": 0.6565618515014648, "sampling/importance_sampling_ratio/max": 1.4494444131851196, "sampling/importance_sampling_ratio/mean": 0.9751706123352051, "sampling/importance_sampling_ratio/min": 0.35253632068634033, "sampling/sampling_logp_difference/max": 0.8710594177246094, "sampling/sampling_logp_difference/mean": 0.04521109163761139, "step": 44, "step_time": 9.730483430999811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 532.5625, "completions/mean_terminated_length": 532.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.16123176738619804, "epoch": 0.0009, "frac_reward_zero_std": 0.0, "grad_norm": 0.2624436914920807, "kl": 0.7007593922317028, "learning_rate": 4.399999999999999e-05, "loss": -0.0093, "num_tokens": 1984421.0, "reward": 0.6494507789611816, "reward_std": 0.6563788652420044, "rewards/rollout_reward_func/mean": 0.6494507789611816, "rewards/rollout_reward_func/std": 0.7020707130432129, "sampling/importance_sampling_ratio/max": 1.2357373237609863, "sampling/importance_sampling_ratio/mean": 0.9721324443817139, "sampling/importance_sampling_ratio/min": 0.4628504812717438, "sampling/sampling_logp_difference/max": 0.7627899646759033, "sampling/sampling_logp_difference/mean": 0.022538864985108376, "step": 45, "step_time": 9.816673296000317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2532.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 433.15625, "completions/mean_terminated_length": 433.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2130075814202428, "epoch": 0.00092, "frac_reward_zero_std": 0.25, "grad_norm": 0.871058464050293, "kl": 2.0737335681915283, "learning_rate": 4.4999999999999996e-05, "loss": -0.0106, "num_tokens": 2028235.0, "reward": 0.46100178360939026, "reward_std": 0.6548740863800049, "rewards/rollout_reward_func/mean": 0.46100178360939026, "rewards/rollout_reward_func/std": 0.7991276979446411, "sampling/importance_sampling_ratio/max": 1.229529857635498, "sampling/importance_sampling_ratio/mean": 0.9466047883033752, "sampling/importance_sampling_ratio/min": 0.20644398033618927, "sampling/sampling_logp_difference/max": 1.3534173965454102, "sampling/sampling_logp_difference/mean": 0.041823968291282654, "step": 46, "step_time": 13.191369012999758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 368.78125, "completions/mean_terminated_length": 368.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1936363112181425, "epoch": 0.00094, "frac_reward_zero_std": 0.0, "grad_norm": 0.36885741353034973, "kl": 1.0751465633511543, "learning_rate": 4.599999999999999e-05, "loss": 0.0034, "num_tokens": 2068419.0, "reward": 0.6205021142959595, "reward_std": 0.7318580746650696, "rewards/rollout_reward_func/mean": 0.6205021142959595, "rewards/rollout_reward_func/std": 0.7517509460449219, "sampling/importance_sampling_ratio/max": 1.8882579803466797, "sampling/importance_sampling_ratio/mean": 1.023834466934204, "sampling/importance_sampling_ratio/min": 0.579017698764801, "sampling/sampling_logp_difference/max": 0.6615636944770813, "sampling/sampling_logp_difference/mean": 0.032872386276721954, "step": 47, "step_time": 12.633017148000363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 404.09375, "completions/mean_terminated_length": 404.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.21343508455902338, "epoch": 0.00096, "frac_reward_zero_std": 0.0, "grad_norm": 0.8466521501541138, "kl": 1.088382013142109, "learning_rate": 4.699999999999999e-05, "loss": 0.0063, "num_tokens": 2110058.0, "reward": 0.6447704434394836, "reward_std": 0.6749294996261597, "rewards/rollout_reward_func/mean": 0.6447704434394836, "rewards/rollout_reward_func/std": 0.6571642160415649, "sampling/importance_sampling_ratio/max": 2.2685439586639404, "sampling/importance_sampling_ratio/mean": 1.0864028930664062, "sampling/importance_sampling_ratio/min": 0.618796706199646, "sampling/sampling_logp_difference/max": 0.819129228591919, "sampling/sampling_logp_difference/mean": 0.0409696027636528, "step": 48, "step_time": 10.539570211000637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 394.0, "completions/mean_terminated_length": 394.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22381674125790596, "epoch": 0.00098, "frac_reward_zero_std": 0.0, "grad_norm": 0.3326900005340576, "kl": 1.424051508307457, "learning_rate": 4.7999999999999994e-05, "loss": -0.0062, "num_tokens": 2153067.0, "reward": 0.4617741107940674, "reward_std": 0.8026109933853149, "rewards/rollout_reward_func/mean": 0.4617741107940674, "rewards/rollout_reward_func/std": 0.8009026646614075, "sampling/importance_sampling_ratio/max": 2.5391650199890137, "sampling/importance_sampling_ratio/mean": 1.0073416233062744, "sampling/importance_sampling_ratio/min": 0.347463995218277, "sampling/sampling_logp_difference/max": 0.9631484150886536, "sampling/sampling_logp_difference/mean": 0.05753588676452637, "step": 49, "step_time": 9.822351044000015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 488.46875, "completions/mean_terminated_length": 488.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.29982032626867294, "epoch": 0.001, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671094179153442, "kl": 0.9727141261100769, "learning_rate": 4.899999999999999e-05, "loss": -0.007, "num_tokens": 2199138.0, "reward": 0.37679898738861084, "reward_std": 0.9403157234191895, "rewards/rollout_reward_func/mean": 0.37679898738861084, "rewards/rollout_reward_func/std": 0.8992018699645996, "sampling/importance_sampling_ratio/max": 1.8259638547897339, "sampling/importance_sampling_ratio/mean": 0.9858899116516113, "sampling/importance_sampling_ratio/min": 0.3982914090156555, "sampling/sampling_logp_difference/max": 0.7273249626159668, "sampling/sampling_logp_difference/mean": 0.05463321506977081, "step": 50, "step_time": 10.738488725999332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1585.0, "completions/max_terminated_length": 1585.0, "completions/mean_length": 333.1875, "completions/mean_terminated_length": 333.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22391923377290368, "epoch": 0.00102, "frac_reward_zero_std": 0.0, "grad_norm": 0.47821611166000366, "kl": 1.0646247491240501, "learning_rate": 4.9999999999999996e-05, "loss": 0.0046, "num_tokens": 2238337.0, "reward": 0.5152149796485901, "reward_std": 0.7291878461837769, "rewards/rollout_reward_func/mean": 0.5152149796485901, "rewards/rollout_reward_func/std": 0.7714893817901611, "sampling/importance_sampling_ratio/max": 1.2874901294708252, "sampling/importance_sampling_ratio/mean": 1.0237869024276733, "sampling/importance_sampling_ratio/min": 0.649499237537384, "sampling/sampling_logp_difference/max": 0.43135809898376465, "sampling/sampling_logp_difference/mean": 0.025746671482920647, "step": 51, "step_time": 10.30634023100015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2574.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 549.15625, "completions/mean_terminated_length": 549.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3197893872857094, "epoch": 0.00104, "frac_reward_zero_std": 0.0, "grad_norm": 0.5121764540672302, "kl": 1.0992940813302994, "learning_rate": 5.099999999999999e-05, "loss": -0.0104, "num_tokens": 2286993.0, "reward": 0.41164565086364746, "reward_std": 0.905423641204834, "rewards/rollout_reward_func/mean": 0.41164565086364746, "rewards/rollout_reward_func/std": 0.8994181752204895, "sampling/importance_sampling_ratio/max": 1.3189126253128052, "sampling/importance_sampling_ratio/mean": 0.954028844833374, "sampling/importance_sampling_ratio/min": 0.5851255655288696, "sampling/sampling_logp_difference/max": 0.5522274971008301, "sampling/sampling_logp_difference/mean": 0.03515256568789482, "step": 52, "step_time": 12.987638004999553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 446.78125, "completions/mean_terminated_length": 446.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.30747243389487267, "epoch": 0.00106, "frac_reward_zero_std": 0.0, "grad_norm": 0.19903643429279327, "kl": 1.1317579578608274, "learning_rate": 5.2e-05, "loss": -0.012, "num_tokens": 2331144.0, "reward": 0.30379337072372437, "reward_std": 0.7730056643486023, "rewards/rollout_reward_func/mean": 0.30379337072372437, "rewards/rollout_reward_func/std": 0.8563342094421387, "sampling/importance_sampling_ratio/max": 1.3316208124160767, "sampling/importance_sampling_ratio/mean": 0.9596077799797058, "sampling/importance_sampling_ratio/min": 0.4228017032146454, "sampling/sampling_logp_difference/max": 0.8354442119598389, "sampling/sampling_logp_difference/mean": 0.03748326003551483, "step": 53, "step_time": 10.517057877999832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1719.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 447.6875, "completions/mean_terminated_length": 447.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2822277210652828, "epoch": 0.00108, "frac_reward_zero_std": 0.0, "grad_norm": 0.2580128312110901, "kl": 1.6629524379968643, "learning_rate": 5.2999999999999994e-05, "loss": -0.0103, "num_tokens": 2375206.0, "reward": 0.4977615475654602, "reward_std": 0.7644492387771606, "rewards/rollout_reward_func/mean": 0.4977615475654602, "rewards/rollout_reward_func/std": 0.7955817580223083, "sampling/importance_sampling_ratio/max": 1.7196698188781738, "sampling/importance_sampling_ratio/mean": 0.955186128616333, "sampling/importance_sampling_ratio/min": 0.4854273498058319, "sampling/sampling_logp_difference/max": 0.7344328165054321, "sampling/sampling_logp_difference/mean": 0.04283210635185242, "step": 54, "step_time": 10.419993891999866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 322.875, "completions/mean_terminated_length": 322.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.18024862371385098, "epoch": 0.0011, "frac_reward_zero_std": 0.0, "grad_norm": 0.4570525586605072, "kl": 1.3060022667050362, "learning_rate": 5.4e-05, "loss": -0.0019, "num_tokens": 2414542.0, "reward": 0.6771817207336426, "reward_std": 0.6104331016540527, "rewards/rollout_reward_func/mean": 0.6771817207336426, "rewards/rollout_reward_func/std": 0.6540780663490295, "sampling/importance_sampling_ratio/max": 2.619563341140747, "sampling/importance_sampling_ratio/mean": 1.0797266960144043, "sampling/importance_sampling_ratio/min": 0.6771005392074585, "sampling/sampling_logp_difference/max": 0.9633440971374512, "sampling/sampling_logp_difference/mean": 0.031133729964494705, "step": 55, "step_time": 8.295717008999645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 526.40625, "completions/mean_terminated_length": 526.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.31278855353593826, "epoch": 0.00112, "frac_reward_zero_std": 0.0, "grad_norm": 0.7560754418373108, "kl": 1.4882714450359344, "learning_rate": 5.4999999999999995e-05, "loss": 0.0112, "num_tokens": 2462074.0, "reward": 0.5130623579025269, "reward_std": 0.7361263036727905, "rewards/rollout_reward_func/mean": 0.5130623579025269, "rewards/rollout_reward_func/std": 0.7266902923583984, "sampling/importance_sampling_ratio/max": 2.5340633392333984, "sampling/importance_sampling_ratio/mean": 1.0857641696929932, "sampling/importance_sampling_ratio/min": 0.45280569791793823, "sampling/sampling_logp_difference/max": 1.3817520141601562, "sampling/sampling_logp_difference/mean": 0.05469951778650284, "step": 56, "step_time": 13.353751793999663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 625.65625, "completions/mean_terminated_length": 625.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.31969811394810677, "epoch": 0.00114, "frac_reward_zero_std": 0.0, "grad_norm": 1.1105835437774658, "kl": 1.1555701866745949, "learning_rate": 5.6e-05, "loss": 0.0399, "num_tokens": 2512982.0, "reward": 0.5882573127746582, "reward_std": 0.7410321235656738, "rewards/rollout_reward_func/mean": 0.5882573127746582, "rewards/rollout_reward_func/std": 0.754264771938324, "sampling/importance_sampling_ratio/max": 2.5503058433532715, "sampling/importance_sampling_ratio/mean": 1.0013823509216309, "sampling/importance_sampling_ratio/min": 0.46001890301704407, "sampling/sampling_logp_difference/max": 0.6079883575439453, "sampling/sampling_logp_difference/mean": 0.04742881655693054, "step": 57, "step_time": 12.121361051999884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1724.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 446.75, "completions/mean_terminated_length": 446.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2895616143941879, "epoch": 0.00116, "frac_reward_zero_std": 0.0, "grad_norm": 0.35624849796295166, "kl": 1.355555072426796, "learning_rate": 5.6999999999999996e-05, "loss": -0.0073, "num_tokens": 2557881.0, "reward": 0.611238956451416, "reward_std": 0.6273282766342163, "rewards/rollout_reward_func/mean": 0.611238956451416, "rewards/rollout_reward_func/std": 0.6687383055686951, "sampling/importance_sampling_ratio/max": 2.491894483566284, "sampling/importance_sampling_ratio/mean": 1.0659096240997314, "sampling/importance_sampling_ratio/min": 0.6563628911972046, "sampling/sampling_logp_difference/max": 0.9111161231994629, "sampling/sampling_logp_difference/mean": 0.04719571769237518, "step": 58, "step_time": 10.401329957999678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 446.84375, "completions/mean_terminated_length": 446.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2944948635995388, "epoch": 0.00118, "frac_reward_zero_std": 0.0, "grad_norm": 0.37593191862106323, "kl": 1.6436612010002136, "learning_rate": 5.8e-05, "loss": -0.0094, "num_tokens": 2603018.0, "reward": 0.5244594216346741, "reward_std": 0.8012816905975342, "rewards/rollout_reward_func/mean": 0.5244594216346741, "rewards/rollout_reward_func/std": 0.8057574033737183, "sampling/importance_sampling_ratio/max": 1.7599656581878662, "sampling/importance_sampling_ratio/mean": 0.9491475820541382, "sampling/importance_sampling_ratio/min": 0.36143070459365845, "sampling/sampling_logp_difference/max": 0.7137837409973145, "sampling/sampling_logp_difference/mean": 0.055331528186798096, "step": 59, "step_time": 10.497084323999843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 335.96875, "completions/mean_terminated_length": 335.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2053281757980585, "epoch": 0.0012, "frac_reward_zero_std": 0.25, "grad_norm": 0.22069185972213745, "kl": 1.7469080090522766, "learning_rate": 5.9e-05, "loss": 0.005, "num_tokens": 2643826.0, "reward": 0.7654061317443848, "reward_std": 0.4450979232788086, "rewards/rollout_reward_func/mean": 0.7654061317443848, "rewards/rollout_reward_func/std": 0.5156545639038086, "sampling/importance_sampling_ratio/max": 1.9665604829788208, "sampling/importance_sampling_ratio/mean": 0.9830435514450073, "sampling/importance_sampling_ratio/min": 0.40057483315467834, "sampling/sampling_logp_difference/max": 0.8576858043670654, "sampling/sampling_logp_difference/mean": 0.03817757964134216, "step": 60, "step_time": 8.97697070599952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 458.21875, "completions/mean_terminated_length": 458.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.18468773923814297, "epoch": 0.00122, "frac_reward_zero_std": 0.0, "grad_norm": 0.22015832364559174, "kl": 1.7126774489879608, "learning_rate": 5.9999999999999995e-05, "loss": 0.0016, "num_tokens": 2689419.0, "reward": 0.7375465631484985, "reward_std": 0.5294548273086548, "rewards/rollout_reward_func/mean": 0.7375465631484985, "rewards/rollout_reward_func/std": 0.5243182182312012, "sampling/importance_sampling_ratio/max": 1.9091094732284546, "sampling/importance_sampling_ratio/mean": 1.002228021621704, "sampling/importance_sampling_ratio/min": 0.761676013469696, "sampling/sampling_logp_difference/max": 0.6463425159454346, "sampling/sampling_logp_difference/mean": 0.03149639070034027, "step": 61, "step_time": 10.940608026000064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 347.59375, "completions/mean_terminated_length": 347.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17673727683722973, "epoch": 0.00124, "frac_reward_zero_std": 0.25, "grad_norm": 0.2487325668334961, "kl": 1.6599693447351456, "learning_rate": 6.1e-05, "loss": 0.0003, "num_tokens": 2729861.0, "reward": 0.8267905116081238, "reward_std": 0.35488075017929077, "rewards/rollout_reward_func/mean": 0.8267905116081238, "rewards/rollout_reward_func/std": 0.4089851677417755, "sampling/importance_sampling_ratio/max": 1.8078171014785767, "sampling/importance_sampling_ratio/mean": 0.9810857772827148, "sampling/importance_sampling_ratio/min": 0.49386802315711975, "sampling/sampling_logp_difference/max": 0.5919884443283081, "sampling/sampling_logp_difference/mean": 0.03359387442469597, "step": 62, "step_time": 10.220663090999778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 402.3125, "completions/mean_terminated_length": 402.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.18029231019318104, "epoch": 0.00126, "frac_reward_zero_std": 0.0, "grad_norm": 0.21009044349193573, "kl": 2.4260742217302322, "learning_rate": 6.199999999999999e-05, "loss": 0.008, "num_tokens": 2773007.0, "reward": 0.6391865015029907, "reward_std": 0.604887068271637, "rewards/rollout_reward_func/mean": 0.6391865015029907, "rewards/rollout_reward_func/std": 0.6198267340660095, "sampling/importance_sampling_ratio/max": 1.690628170967102, "sampling/importance_sampling_ratio/mean": 0.9645527005195618, "sampling/importance_sampling_ratio/min": 0.45240405201911926, "sampling/sampling_logp_difference/max": 0.5249950885772705, "sampling/sampling_logp_difference/mean": 0.050745680928230286, "step": 63, "step_time": 8.380111027000112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 363.59375, "completions/mean_terminated_length": 363.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.19281298108398914, "epoch": 0.00128, "frac_reward_zero_std": 0.25, "grad_norm": 0.6816410422325134, "kl": 3.6316768378019333, "learning_rate": 6.3e-05, "loss": 0.0003, "num_tokens": 2814330.0, "reward": 0.6769261360168457, "reward_std": 0.5691558122634888, "rewards/rollout_reward_func/mean": 0.6769261360168457, "rewards/rollout_reward_func/std": 0.6552136540412903, "sampling/importance_sampling_ratio/max": 1.6253528594970703, "sampling/importance_sampling_ratio/mean": 0.9704272150993347, "sampling/importance_sampling_ratio/min": 0.29559749364852905, "sampling/sampling_logp_difference/max": 1.3066649436950684, "sampling/sampling_logp_difference/mean": 0.05995550751686096, "step": 64, "step_time": 8.942913078000174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 488.0, "completions/mean_terminated_length": 488.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.22785130888223648, "epoch": 0.0013, "frac_reward_zero_std": 0.0, "grad_norm": 0.38275572657585144, "kl": 2.1096755117177963, "learning_rate": 6.4e-05, "loss": 0.0074, "num_tokens": 2862315.0, "reward": 0.5419843792915344, "reward_std": 0.6987729668617249, "rewards/rollout_reward_func/mean": 0.5419843792915344, "rewards/rollout_reward_func/std": 0.6862879395484924, "sampling/importance_sampling_ratio/max": 1.329522728919983, "sampling/importance_sampling_ratio/mean": 0.9374920725822449, "sampling/importance_sampling_ratio/min": 0.3278305232524872, "sampling/sampling_logp_difference/max": 0.737900972366333, "sampling/sampling_logp_difference/mean": 0.04596731811761856, "step": 65, "step_time": 8.95573355700003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 434.34375, "completions/mean_terminated_length": 434.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.20254813320934772, "epoch": 0.00132, "frac_reward_zero_std": 0.0, "grad_norm": 0.564424991607666, "kl": 1.8671841323375702, "learning_rate": 6.5e-05, "loss": 0.0309, "num_tokens": 2906613.0, "reward": 0.6349352598190308, "reward_std": 0.5667197704315186, "rewards/rollout_reward_func/mean": 0.6349352598190308, "rewards/rollout_reward_func/std": 0.5723178386688232, "sampling/importance_sampling_ratio/max": 2.082275390625, "sampling/importance_sampling_ratio/mean": 0.9734312891960144, "sampling/importance_sampling_ratio/min": 0.6364663243293762, "sampling/sampling_logp_difference/max": 0.6452445983886719, "sampling/sampling_logp_difference/mean": 0.04617973417043686, "step": 66, "step_time": 8.046829737999815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 329.21875, "completions/mean_terminated_length": 329.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22479398176074028, "epoch": 0.00134, "frac_reward_zero_std": 0.0, "grad_norm": 0.5483220219612122, "kl": 2.4359081089496613, "learning_rate": 6.599999999999999e-05, "loss": 0.0241, "num_tokens": 2946739.0, "reward": 0.6691075563430786, "reward_std": 0.6089736819267273, "rewards/rollout_reward_func/mean": 0.6691075563430786, "rewards/rollout_reward_func/std": 0.6150685548782349, "sampling/importance_sampling_ratio/max": 1.8554893732070923, "sampling/importance_sampling_ratio/mean": 0.9979631304740906, "sampling/importance_sampling_ratio/min": 0.48545849323272705, "sampling/sampling_logp_difference/max": 0.6605856418609619, "sampling/sampling_logp_difference/mean": 0.0380132757127285, "step": 67, "step_time": 7.742515937000235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 276.28125, "completions/mean_terminated_length": 276.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1596527025103569, "epoch": 0.00136, "frac_reward_zero_std": 0.25, "grad_norm": 0.2421184778213501, "kl": 1.676218792796135, "learning_rate": 6.7e-05, "loss": 0.0072, "num_tokens": 2984806.0, "reward": 0.7747413516044617, "reward_std": 0.47069045901298523, "rewards/rollout_reward_func/mean": 0.7747413516044617, "rewards/rollout_reward_func/std": 0.5620537996292114, "sampling/importance_sampling_ratio/max": 1.1180931329727173, "sampling/importance_sampling_ratio/mean": 0.9611015915870667, "sampling/importance_sampling_ratio/min": 0.5154492855072021, "sampling/sampling_logp_difference/max": 0.6626415848731995, "sampling/sampling_logp_difference/mean": 0.018015112727880478, "step": 68, "step_time": 7.425554623999915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 380.65625, "completions/mean_terminated_length": 380.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.16528514586389065, "epoch": 0.00138, "frac_reward_zero_std": 0.0, "grad_norm": 0.18221475183963776, "kl": 1.9254668653011322, "learning_rate": 6.8e-05, "loss": -0.0009, "num_tokens": 3027044.0, "reward": 0.5216482877731323, "reward_std": 0.7680641412734985, "rewards/rollout_reward_func/mean": 0.5216482877731323, "rewards/rollout_reward_func/std": 0.7652435898780823, "sampling/importance_sampling_ratio/max": 1.167378544807434, "sampling/importance_sampling_ratio/mean": 0.9800703525543213, "sampling/importance_sampling_ratio/min": 0.7217369675636292, "sampling/sampling_logp_difference/max": 0.22176837921142578, "sampling/sampling_logp_difference/mean": 0.013853147625923157, "step": 69, "step_time": 8.444997737999984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 456.1875, "completions/mean_terminated_length": 456.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11435590591281652, "epoch": 0.0014, "frac_reward_zero_std": 0.25, "grad_norm": 4.698845863342285, "kl": 13.575719520449638, "learning_rate": 6.9e-05, "loss": 0.0361, "num_tokens": 3072182.0, "reward": 0.7682366371154785, "reward_std": 0.4131924510002136, "rewards/rollout_reward_func/mean": 0.7682366371154785, "rewards/rollout_reward_func/std": 0.5127333998680115, "sampling/importance_sampling_ratio/max": 1.3921582698822021, "sampling/importance_sampling_ratio/mean": 0.9815487861633301, "sampling/importance_sampling_ratio/min": 0.4221239387989044, "sampling/sampling_logp_difference/max": 0.8747730255126953, "sampling/sampling_logp_difference/mean": 0.016982851549983025, "step": 70, "step_time": 8.509082285000204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 374.5, "completions/mean_terminated_length": 374.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09522162470966578, "epoch": 0.00142, "frac_reward_zero_std": 0.0, "grad_norm": 0.19268281757831573, "kl": 1.9637327939271927, "learning_rate": 7e-05, "loss": 0.009, "num_tokens": 3113336.0, "reward": 0.6718176603317261, "reward_std": 0.6276779174804688, "rewards/rollout_reward_func/mean": 0.6718176603317261, "rewards/rollout_reward_func/std": 0.6117041707038879, "sampling/importance_sampling_ratio/max": 1.4406239986419678, "sampling/importance_sampling_ratio/mean": 1.0005383491516113, "sampling/importance_sampling_ratio/min": 0.5267597436904907, "sampling/sampling_logp_difference/max": 0.7007496356964111, "sampling/sampling_logp_difference/mean": 0.01753494329750538, "step": 71, "step_time": 7.984549703000084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 438.4375, "completions/mean_terminated_length": 438.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.13882869575172663, "epoch": 0.00144, "frac_reward_zero_std": 0.0, "grad_norm": 0.21752215921878815, "kl": 1.723952278494835, "learning_rate": 6.999979942764313e-05, "loss": 0.0075, "num_tokens": 3157846.0, "reward": 0.608008623123169, "reward_std": 0.6281692981719971, "rewards/rollout_reward_func/mean": 0.608008623123169, "rewards/rollout_reward_func/std": 0.6276559829711914, "sampling/importance_sampling_ratio/max": 1.112539291381836, "sampling/importance_sampling_ratio/mean": 0.9731200933456421, "sampling/importance_sampling_ratio/min": 0.7152884602546692, "sampling/sampling_logp_difference/max": 0.5473935008049011, "sampling/sampling_logp_difference/mean": 0.017287615686655045, "step": 72, "step_time": 8.151087586000358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 389.75, "completions/mean_terminated_length": 389.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09043915569782257, "epoch": 0.00146, "frac_reward_zero_std": 0.0, "grad_norm": 0.12930338084697723, "kl": 1.7410562485456467, "learning_rate": 6.999919771344602e-05, "loss": 0.0178, "num_tokens": 3199527.0, "reward": 0.6024843454360962, "reward_std": 0.5768618583679199, "rewards/rollout_reward_func/mean": 0.6024843454360962, "rewards/rollout_reward_func/std": 0.5787659287452698, "sampling/importance_sampling_ratio/max": 1.4299548864364624, "sampling/importance_sampling_ratio/mean": 1.0022668838500977, "sampling/importance_sampling_ratio/min": 0.921169638633728, "sampling/sampling_logp_difference/max": 0.35825181007385254, "sampling/sampling_logp_difference/mean": 0.008778933435678482, "step": 73, "step_time": 8.037616143999685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 480.34375, "completions/mean_terminated_length": 480.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.13736943248659372, "epoch": 0.00148, "frac_reward_zero_std": 0.25, "grad_norm": 0.45593345165252686, "kl": 4.621501669287682, "learning_rate": 6.99981948660292e-05, "loss": 0.0137, "num_tokens": 3245988.0, "reward": 0.700668215751648, "reward_std": 0.44149094820022583, "rewards/rollout_reward_func/mean": 0.700668215751648, "rewards/rollout_reward_func/std": 0.5473355054855347, "sampling/importance_sampling_ratio/max": 1.367963194847107, "sampling/importance_sampling_ratio/mean": 0.9624881744384766, "sampling/importance_sampling_ratio/min": 0.7060485482215881, "sampling/sampling_logp_difference/max": 0.3509335517883301, "sampling/sampling_logp_difference/mean": 0.021219635382294655, "step": 74, "step_time": 8.726398431000234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 386.625, "completions/mean_terminated_length": 386.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.15658615063875914, "epoch": 0.0015, "frac_reward_zero_std": 0.0, "grad_norm": 0.2405322641134262, "kl": 2.1263996735215187, "learning_rate": 6.999679089976006e-05, "loss": 0.0142, "num_tokens": 3288300.0, "reward": 0.6719529628753662, "reward_std": 0.6051351428031921, "rewards/rollout_reward_func/mean": 0.6719529628753662, "rewards/rollout_reward_func/std": 0.6115742921829224, "sampling/importance_sampling_ratio/max": 1.3705013990402222, "sampling/importance_sampling_ratio/mean": 1.0109786987304688, "sampling/importance_sampling_ratio/min": 0.9197432398796082, "sampling/sampling_logp_difference/max": 0.23185789585113525, "sampling/sampling_logp_difference/mean": 0.017108675092458725, "step": 75, "step_time": 8.495304994999742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 484.5625, "completions/mean_terminated_length": 484.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.20931780710816383, "epoch": 0.00152, "frac_reward_zero_std": 0.0, "grad_norm": 0.2858348488807678, "kl": 1.8903097808361053, "learning_rate": 6.999498583475265e-05, "loss": 0.0093, "num_tokens": 3334896.0, "reward": 0.6741836071014404, "reward_std": 0.6266090869903564, "rewards/rollout_reward_func/mean": 0.6741836071014404, "rewards/rollout_reward_func/std": 0.6072819828987122, "sampling/importance_sampling_ratio/max": 1.2143765687942505, "sampling/importance_sampling_ratio/mean": 0.9763558506965637, "sampling/importance_sampling_ratio/min": 0.7793107032775879, "sampling/sampling_logp_difference/max": 0.3453660011291504, "sampling/sampling_logp_difference/mean": 0.023211227729916573, "step": 76, "step_time": 8.289157642999953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 514.75, "completions/mean_terminated_length": 514.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2113075777888298, "epoch": 0.00154, "frac_reward_zero_std": 0.0, "grad_norm": 0.13702461123466492, "kl": 1.9775008708238602, "learning_rate": 6.999277969686742e-05, "loss": 0.0112, "num_tokens": 3382423.0, "reward": 0.6680815815925598, "reward_std": 0.561756432056427, "rewards/rollout_reward_func/mean": 0.6680815815925598, "rewards/rollout_reward_func/std": 0.5597373843193054, "sampling/importance_sampling_ratio/max": 1.4485403299331665, "sampling/importance_sampling_ratio/mean": 0.9985830783843994, "sampling/importance_sampling_ratio/min": 0.8130910992622375, "sampling/sampling_logp_difference/max": 0.3705407381057739, "sampling/sampling_logp_difference/mean": 0.02125030755996704, "step": 77, "step_time": 8.254739188999793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 385.21875, "completions/mean_terminated_length": 385.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.18936726078391075, "epoch": 0.00156, "frac_reward_zero_std": 0.25, "grad_norm": 0.10953646153211594, "kl": 1.7601478695869446, "learning_rate": 6.999017251771082e-05, "loss": 0.0057, "num_tokens": 3424651.0, "reward": 0.8278888463973999, "reward_std": 0.3518129587173462, "rewards/rollout_reward_func/mean": 0.8278888463973999, "rewards/rollout_reward_func/std": 0.4064043462276459, "sampling/importance_sampling_ratio/max": 1.2776691913604736, "sampling/importance_sampling_ratio/mean": 1.00196373462677, "sampling/importance_sampling_ratio/min": 0.8784951567649841, "sampling/sampling_logp_difference/max": 0.24513864517211914, "sampling/sampling_logp_difference/mean": 0.015150435268878937, "step": 78, "step_time": 8.344696540000314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 488.40625, "completions/mean_terminated_length": 488.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.255377059802413, "epoch": 0.00158, "frac_reward_zero_std": 0.0, "grad_norm": 0.25969529151916504, "kl": 2.3587107956409454, "learning_rate": 6.998716433463483e-05, "loss": 0.0007, "num_tokens": 3471474.0, "reward": 0.5629860758781433, "reward_std": 0.6000138521194458, "rewards/rollout_reward_func/mean": 0.5629860758781433, "rewards/rollout_reward_func/std": 0.5943225622177124, "sampling/importance_sampling_ratio/max": 1.2025734186172485, "sampling/importance_sampling_ratio/mean": 0.976870059967041, "sampling/importance_sampling_ratio/min": 0.5027166604995728, "sampling/sampling_logp_difference/max": 0.45729804039001465, "sampling/sampling_logp_difference/mean": 0.020079627633094788, "step": 79, "step_time": 8.366506297999877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 430.6875, "completions/mean_terminated_length": 430.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2579723224043846, "epoch": 0.0016, "frac_reward_zero_std": 0.0, "grad_norm": 0.24404332041740417, "kl": 1.7810578346252441, "learning_rate": 6.998375519073651e-05, "loss": 0.0096, "num_tokens": 3515093.0, "reward": 0.7292600870132446, "reward_std": 0.48634880781173706, "rewards/rollout_reward_func/mean": 0.7292600870132446, "rewards/rollout_reward_func/std": 0.4768061637878418, "sampling/importance_sampling_ratio/max": 1.3002490997314453, "sampling/importance_sampling_ratio/mean": 1.0180599689483643, "sampling/importance_sampling_ratio/min": 0.796721339225769, "sampling/sampling_logp_difference/max": 0.26252567768096924, "sampling/sampling_logp_difference/mean": 0.015404840931296349, "step": 80, "step_time": 8.591510155000151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 350.5, "completions/mean_terminated_length": 350.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.30479421745985746, "epoch": 0.00162, "frac_reward_zero_std": 0.0, "grad_norm": 0.15629281103610992, "kl": 1.781613141298294, "learning_rate": 6.997994513485727e-05, "loss": 0.0013, "num_tokens": 3555568.0, "reward": 0.4757760465145111, "reward_std": 0.6741898059844971, "rewards/rollout_reward_func/mean": 0.4757760465145111, "rewards/rollout_reward_func/std": 0.6888821721076965, "sampling/importance_sampling_ratio/max": 1.120108962059021, "sampling/importance_sampling_ratio/mean": 0.9715175628662109, "sampling/importance_sampling_ratio/min": 0.8922353982925415, "sampling/sampling_logp_difference/max": 0.1215214729309082, "sampling/sampling_logp_difference/mean": 0.014093402773141861, "step": 81, "step_time": 8.311515689000544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 339.15625, "completions/mean_terminated_length": 339.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3674714770168066, "epoch": 0.00164, "frac_reward_zero_std": 0.0, "grad_norm": 0.18878492712974548, "kl": 1.7992180287837982, "learning_rate": 6.997573422158226e-05, "loss": -0.0067, "num_tokens": 3596143.0, "reward": 0.5111106634140015, "reward_std": 0.7121702432632446, "rewards/rollout_reward_func/mean": 0.5111106634140015, "rewards/rollout_reward_func/std": 0.7338927984237671, "sampling/importance_sampling_ratio/max": 1.2800871133804321, "sampling/importance_sampling_ratio/mean": 1.00455904006958, "sampling/importance_sampling_ratio/min": 0.7677103281021118, "sampling/sampling_logp_difference/max": 0.2630174160003662, "sampling/sampling_logp_difference/mean": 0.020217955112457275, "step": 82, "step_time": 7.945674317999419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 292.59375, "completions/mean_terminated_length": 292.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.35092878714203835, "epoch": 0.00166, "frac_reward_zero_std": 0.25, "grad_norm": 0.26307934522628784, "kl": 1.5900853723287582, "learning_rate": 6.997112251123953e-05, "loss": 0.0002, "num_tokens": 3633639.0, "reward": 0.6487507224082947, "reward_std": 0.5868535041809082, "rewards/rollout_reward_func/mean": 0.6487507224082947, "rewards/rollout_reward_func/std": 0.7079955339431763, "sampling/importance_sampling_ratio/max": 1.6505076885223389, "sampling/importance_sampling_ratio/mean": 1.0203434228897095, "sampling/importance_sampling_ratio/min": 0.8300421237945557, "sampling/sampling_logp_difference/max": 0.43131542205810547, "sampling/sampling_logp_difference/mean": 0.016862552613019943, "step": 83, "step_time": 8.10490163999998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 381.6875, "completions/mean_terminated_length": 381.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4776056930422783, "epoch": 0.00168, "frac_reward_zero_std": 0.0, "grad_norm": 0.44747674465179443, "kl": 1.9982721656560898, "learning_rate": 6.996611006989916e-05, "loss": -0.0075, "num_tokens": 3677131.0, "reward": 0.3607633113861084, "reward_std": 0.8395156860351562, "rewards/rollout_reward_func/mean": 0.3607633113861084, "rewards/rollout_reward_func/std": 0.8391993641853333, "sampling/importance_sampling_ratio/max": 1.2758227586746216, "sampling/importance_sampling_ratio/mean": 0.9667969942092896, "sampling/importance_sampling_ratio/min": 0.6069893836975098, "sampling/sampling_logp_difference/max": 0.3109464645385742, "sampling/sampling_logp_difference/mean": 0.026365164667367935, "step": 84, "step_time": 8.343225314999927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 445.28125, "completions/mean_terminated_length": 445.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.36424944549798965, "epoch": 0.0017, "frac_reward_zero_std": 0.0, "grad_norm": 0.22888615727424622, "kl": 1.8504460602998734, "learning_rate": 6.996069696937243e-05, "loss": 0.0112, "num_tokens": 3721412.0, "reward": 0.5442749857902527, "reward_std": 0.6806379556655884, "rewards/rollout_reward_func/mean": 0.5442749857902527, "rewards/rollout_reward_func/std": 0.6851301789283752, "sampling/importance_sampling_ratio/max": 1.4193453788757324, "sampling/importance_sampling_ratio/mean": 1.0137629508972168, "sampling/importance_sampling_ratio/min": 0.8926205039024353, "sampling/sampling_logp_difference/max": 0.19996857643127441, "sampling/sampling_logp_difference/mean": 0.014609931968152523, "step": 85, "step_time": 8.325104749000275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.45267584547400475, "epoch": 0.00172, "frac_reward_zero_std": 0.0, "grad_norm": 0.3672713339328766, "kl": 1.7174315750598907, "learning_rate": 6.995488328721063e-05, "loss": 0.0027, "num_tokens": 3761565.0, "reward": 0.4253282845020294, "reward_std": 0.8324552178382874, "rewards/rollout_reward_func/mean": 0.4253282845020294, "rewards/rollout_reward_func/std": 0.8066444396972656, "sampling/importance_sampling_ratio/max": 1.5925917625427246, "sampling/importance_sampling_ratio/mean": 1.0033386945724487, "sampling/importance_sampling_ratio/min": 0.5601366758346558, "sampling/sampling_logp_difference/max": 0.42822766304016113, "sampling/sampling_logp_difference/mean": 0.02108183316886425, "step": 86, "step_time": 8.027882510999916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 365.53125, "completions/mean_terminated_length": 365.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3710917457938194, "epoch": 0.00174, "frac_reward_zero_std": 0.0, "grad_norm": 0.14773647487163544, "kl": 1.4494422674179077, "learning_rate": 6.994866910670403e-05, "loss": -0.0078, "num_tokens": 3803199.0, "reward": 0.5903237462043762, "reward_std": 0.7371278405189514, "rewards/rollout_reward_func/mean": 0.5903237462043762, "rewards/rollout_reward_func/std": 0.7547247409820557, "sampling/importance_sampling_ratio/max": 1.1033929586410522, "sampling/importance_sampling_ratio/mean": 0.9815865755081177, "sampling/importance_sampling_ratio/min": 0.7179847359657288, "sampling/sampling_logp_difference/max": 0.25618791580200195, "sampling/sampling_logp_difference/mean": 0.012633274309337139, "step": 87, "step_time": 8.08130858699974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 448.40625, "completions/mean_terminated_length": 448.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3210822716355324, "epoch": 0.00176, "frac_reward_zero_std": 0.0, "grad_norm": 0.20952939987182617, "kl": 2.2278282195329666, "learning_rate": 6.994205451688072e-05, "loss": -0.0059, "num_tokens": 3848514.0, "reward": 0.5198886394500732, "reward_std": 0.7470307350158691, "rewards/rollout_reward_func/mean": 0.5198886394500732, "rewards/rollout_reward_func/std": 0.769844651222229, "sampling/importance_sampling_ratio/max": 1.1122186183929443, "sampling/importance_sampling_ratio/mean": 0.982390284538269, "sampling/importance_sampling_ratio/min": 0.5455213189125061, "sampling/sampling_logp_difference/max": 0.6039938926696777, "sampling/sampling_logp_difference/mean": 0.01470731757581234, "step": 88, "step_time": 8.817167401000233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 397.21875, "completions/mean_terminated_length": 397.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.25545692071318626, "epoch": 0.00178, "frac_reward_zero_std": 0.0, "grad_norm": 0.24466770887374878, "kl": 1.7027958184480667, "learning_rate": 6.993503961250521e-05, "loss": 0.0025, "num_tokens": 3890798.0, "reward": 0.608557939529419, "reward_std": 0.6426177024841309, "rewards/rollout_reward_func/mean": 0.608557939529419, "rewards/rollout_reward_func/std": 0.6761431097984314, "sampling/importance_sampling_ratio/max": 1.3116307258605957, "sampling/importance_sampling_ratio/mean": 1.0091814994812012, "sampling/importance_sampling_ratio/min": 0.909535825252533, "sampling/sampling_logp_difference/max": 0.22873687744140625, "sampling/sampling_logp_difference/mean": 0.010588819161057472, "step": 89, "step_time": 8.80232782200028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 410.40625, "completions/mean_terminated_length": 410.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17479807510972023, "epoch": 0.0018, "frac_reward_zero_std": 0.0, "grad_norm": 0.0971289575099945, "kl": 1.5888445675373077, "learning_rate": 6.992762449407726e-05, "loss": 0.013, "num_tokens": 3932933.0, "reward": 0.636957585811615, "reward_std": 0.5633026361465454, "rewards/rollout_reward_func/mean": 0.636957585811615, "rewards/rollout_reward_func/std": 0.5680742263793945, "sampling/importance_sampling_ratio/max": 1.3635399341583252, "sampling/importance_sampling_ratio/mean": 1.0050634145736694, "sampling/importance_sampling_ratio/min": 0.89275723695755, "sampling/sampling_logp_difference/max": 0.3100614547729492, "sampling/sampling_logp_difference/mean": 0.007482980843633413, "step": 90, "step_time": 8.184203316000094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 389.4375, "completions/mean_terminated_length": 389.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17599561251699924, "epoch": 0.00182, "frac_reward_zero_std": 0.0, "grad_norm": 0.2065366506576538, "kl": 2.2439624667167664, "learning_rate": 6.991980926783024e-05, "loss": 0.0093, "num_tokens": 3975498.0, "reward": 0.5995465517044067, "reward_std": 0.5976712107658386, "rewards/rollout_reward_func/mean": 0.5995465517044067, "rewards/rollout_reward_func/std": 0.5838564038276672, "sampling/importance_sampling_ratio/max": 1.0866305828094482, "sampling/importance_sampling_ratio/mean": 0.9829515218734741, "sampling/importance_sampling_ratio/min": 0.83978670835495, "sampling/sampling_logp_difference/max": 0.23869013786315918, "sampling/sampling_logp_difference/mean": 0.008459258824586868, "step": 91, "step_time": 7.9459561860003305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 473.09375, "completions/mean_terminated_length": 473.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1822468126192689, "epoch": 0.00184, "frac_reward_zero_std": 0.0, "grad_norm": 0.19154804944992065, "kl": 1.894306719303131, "learning_rate": 6.99115940457298e-05, "loss": -0.0004, "num_tokens": 4022001.0, "reward": 0.6678733825683594, "reward_std": 0.5771548748016357, "rewards/rollout_reward_func/mean": 0.6678733825683594, "rewards/rollout_reward_func/std": 0.5600860118865967, "sampling/importance_sampling_ratio/max": 1.2554646730422974, "sampling/importance_sampling_ratio/mean": 1.0070075988769531, "sampling/importance_sampling_ratio/min": 0.8858056664466858, "sampling/sampling_logp_difference/max": 0.22751092910766602, "sampling/sampling_logp_difference/mean": 0.011102176271378994, "step": 92, "step_time": 8.984215293000034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 327.8125, "completions/mean_terminated_length": 327.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.15431288536638021, "epoch": 0.00186, "frac_reward_zero_std": 0.0, "grad_norm": 0.19767220318317413, "kl": 1.767415851354599, "learning_rate": 6.990297894547204e-05, "loss": 0.0016, "num_tokens": 4062104.0, "reward": 0.6994801163673401, "reward_std": 0.527228057384491, "rewards/rollout_reward_func/mean": 0.6994801163673401, "rewards/rollout_reward_func/std": 0.5491287112236023, "sampling/importance_sampling_ratio/max": 1.5052241086959839, "sampling/importance_sampling_ratio/mean": 1.024430274963379, "sampling/importance_sampling_ratio/min": 0.9644942879676819, "sampling/sampling_logp_difference/max": 0.4088963270187378, "sampling/sampling_logp_difference/mean": 0.007720736786723137, "step": 93, "step_time": 8.283556342000338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 281.46875, "completions/mean_terminated_length": 281.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.149189833085984, "epoch": 0.00188, "frac_reward_zero_std": 0.25, "grad_norm": 0.2802703082561493, "kl": 1.8762592151761055, "learning_rate": 6.989396409048212e-05, "loss": 0.0054, "num_tokens": 4099758.0, "reward": 0.8001121282577515, "reward_std": 0.38774827122688293, "rewards/rollout_reward_func/mean": 0.8001121282577515, "rewards/rollout_reward_func/std": 0.49347132444381714, "sampling/importance_sampling_ratio/max": 1.2361680269241333, "sampling/importance_sampling_ratio/mean": 0.9998959302902222, "sampling/importance_sampling_ratio/min": 0.6868039965629578, "sampling/sampling_logp_difference/max": 0.37544918060302734, "sampling/sampling_logp_difference/mean": 0.01619332656264305, "step": 94, "step_time": 8.360807555000065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 391.8125, "completions/mean_terminated_length": 391.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.18111733347177505, "epoch": 0.0019, "frac_reward_zero_std": 0.0, "grad_norm": 0.35250815749168396, "kl": 1.6701001524925232, "learning_rate": 6.988454960991214e-05, "loss": -0.0037, "num_tokens": 4142195.0, "reward": 0.6763222813606262, "reward_std": 0.6422635316848755, "rewards/rollout_reward_func/mean": 0.6763222813606262, "rewards/rollout_reward_func/std": 0.6572142243385315, "sampling/importance_sampling_ratio/max": 1.323617935180664, "sampling/importance_sampling_ratio/mean": 0.965711236000061, "sampling/importance_sampling_ratio/min": 0.49613961577415466, "sampling/sampling_logp_difference/max": 0.5761501789093018, "sampling/sampling_logp_difference/mean": 0.01674860343337059, "step": 95, "step_time": 8.027815820000114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 443.0625, "completions/mean_terminated_length": 443.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1805575005710125, "epoch": 0.00192, "frac_reward_zero_std": 0.0, "grad_norm": 0.18090590834617615, "kl": 2.0828833878040314, "learning_rate": 6.987473563863965e-05, "loss": 0.0033, "num_tokens": 4186991.0, "reward": 0.6663029789924622, "reward_std": 0.5650061368942261, "rewards/rollout_reward_func/mean": 0.6663029789924622, "rewards/rollout_reward_func/std": 0.5636399388313293, "sampling/importance_sampling_ratio/max": 1.2429559230804443, "sampling/importance_sampling_ratio/mean": 1.0254368782043457, "sampling/importance_sampling_ratio/min": 0.8903895020484924, "sampling/sampling_logp_difference/max": 0.21748554706573486, "sampling/sampling_logp_difference/mean": 0.012768237851560116, "step": 96, "step_time": 8.135349525000265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 361.46875, "completions/mean_terminated_length": 361.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.12273205909878016, "epoch": 0.00194, "frac_reward_zero_std": 0.0, "grad_norm": 0.18846271932125092, "kl": 1.864557921886444, "learning_rate": 6.98645223172654e-05, "loss": -0.0001, "num_tokens": 4228420.0, "reward": 0.6380250453948975, "reward_std": 0.6412568092346191, "rewards/rollout_reward_func/mean": 0.6380250453948975, "rewards/rollout_reward_func/std": 0.620073139667511, "sampling/importance_sampling_ratio/max": 1.4166878461837769, "sampling/importance_sampling_ratio/mean": 1.0161833763122559, "sampling/importance_sampling_ratio/min": 0.9093538522720337, "sampling/sampling_logp_difference/max": 0.3482828140258789, "sampling/sampling_logp_difference/mean": 0.007998542860150337, "step": 97, "step_time": 8.395397782000146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 438.09375, "completions/mean_terminated_length": 438.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.15049820952117443, "epoch": 0.00196, "frac_reward_zero_std": 0.25, "grad_norm": 0.12161560356616974, "kl": 1.7080794423818588, "learning_rate": 6.985390979211156e-05, "loss": 0.0111, "num_tokens": 4272909.0, "reward": 0.7285215854644775, "reward_std": 0.3952997326850891, "rewards/rollout_reward_func/mean": 0.7285215854644775, "rewards/rollout_reward_func/std": 0.4778377115726471, "sampling/importance_sampling_ratio/max": 1.1276673078536987, "sampling/importance_sampling_ratio/mean": 0.9685790538787842, "sampling/importance_sampling_ratio/min": 0.697672426700592, "sampling/sampling_logp_difference/max": 0.36006784439086914, "sampling/sampling_logp_difference/mean": 0.011554844677448273, "step": 98, "step_time": 8.293451802000163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 385.46875, "completions/mean_terminated_length": 385.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.139141661580652, "epoch": 0.00198, "frac_reward_zero_std": 0.0, "grad_norm": 0.1300252377986908, "kl": 1.862698808312416, "learning_rate": 6.98428982152195e-05, "loss": 0.0182, "num_tokens": 4315637.0, "reward": 0.6625325679779053, "reward_std": 0.5050868391990662, "rewards/rollout_reward_func/mean": 0.6625325679779053, "rewards/rollout_reward_func/std": 0.5087482929229736, "sampling/importance_sampling_ratio/max": 1.0950313806533813, "sampling/importance_sampling_ratio/mean": 0.996522068977356, "sampling/importance_sampling_ratio/min": 0.944489598274231, "sampling/sampling_logp_difference/max": 0.09079241752624512, "sampling/sampling_logp_difference/mean": 0.0044177137315273285, "step": 99, "step_time": 8.517537991000154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 462.25, "completions/mean_terminated_length": 462.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1649560034275055, "epoch": 0.002, "frac_reward_zero_std": 0.0, "grad_norm": 0.44814881682395935, "kl": 1.9073071479797363, "learning_rate": 6.983148774434763e-05, "loss": 0.0149, "num_tokens": 4361297.0, "reward": 0.5654847621917725, "reward_std": 0.5679118037223816, "rewards/rollout_reward_func/mean": 0.5654847621917725, "rewards/rollout_reward_func/std": 0.5898941159248352, "sampling/importance_sampling_ratio/max": 1.5087393522262573, "sampling/importance_sampling_ratio/mean": 1.0307037830352783, "sampling/importance_sampling_ratio/min": 0.8818880915641785, "sampling/sampling_logp_difference/max": 0.4227931499481201, "sampling/sampling_logp_difference/mean": 0.01391515601426363, "step": 100, "step_time": 8.106754661000195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 353.625, "completions/mean_terminated_length": 353.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1756619354709983, "epoch": 0.00202, "frac_reward_zero_std": 0.25, "grad_norm": 0.17520330846309662, "kl": 1.7513129711151123, "learning_rate": 6.981967854296917e-05, "loss": 0.0006, "num_tokens": 4402476.0, "reward": 0.7050204277038574, "reward_std": 0.5086288452148438, "rewards/rollout_reward_func/mean": 0.7050204277038574, "rewards/rollout_reward_func/std": 0.5992195010185242, "sampling/importance_sampling_ratio/max": 1.203630805015564, "sampling/importance_sampling_ratio/mean": 1.0026533603668213, "sampling/importance_sampling_ratio/min": 0.8004232048988342, "sampling/sampling_logp_difference/max": 0.222581148147583, "sampling/sampling_logp_difference/mean": 0.009565237909555435, "step": 101, "step_time": 7.943958140000632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 408.78125, "completions/mean_terminated_length": 408.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17221538815647364, "epoch": 0.00204, "frac_reward_zero_std": 0.0, "grad_norm": 0.2141128033399582, "kl": 1.8871168568730354, "learning_rate": 6.980747078026981e-05, "loss": 0.0176, "num_tokens": 4445568.0, "reward": 0.7649700045585632, "reward_std": 0.4514860212802887, "rewards/rollout_reward_func/mean": 0.7649700045585632, "rewards/rollout_reward_func/std": 0.4514249861240387, "sampling/importance_sampling_ratio/max": 1.3050637245178223, "sampling/importance_sampling_ratio/mean": 1.0144734382629395, "sampling/importance_sampling_ratio/min": 0.7950129508972168, "sampling/sampling_logp_difference/max": 0.3070017695426941, "sampling/sampling_logp_difference/mean": 0.017191601917147636, "step": 102, "step_time": 8.938696585000116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 335.71875, "completions/mean_terminated_length": 335.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1572232014441397, "epoch": 0.00206, "frac_reward_zero_std": 0.25, "grad_norm": 0.529062032699585, "kl": 1.8250514715909958, "learning_rate": 6.97948646311452e-05, "loss": 0.0092, "num_tokens": 4485609.0, "reward": 0.6728852391242981, "reward_std": 0.5197975635528564, "rewards/rollout_reward_func/mean": 0.6728852391242981, "rewards/rollout_reward_func/std": 0.6089306473731995, "sampling/importance_sampling_ratio/max": 1.2196383476257324, "sampling/importance_sampling_ratio/mean": 1.0031460523605347, "sampling/importance_sampling_ratio/min": 0.9217978119850159, "sampling/sampling_logp_difference/max": 0.19861054420471191, "sampling/sampling_logp_difference/mean": 0.008735416457057, "step": 103, "step_time": 7.847385109000015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 295.59375, "completions/mean_terminated_length": 295.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.13076199794886634, "epoch": 0.00208, "frac_reward_zero_std": 0.0, "grad_norm": 0.3052496612071991, "kl": 1.8629903495311737, "learning_rate": 6.97818602761986e-05, "loss": 0.0122, "num_tokens": 4523689.0, "reward": 0.7686701416969299, "reward_std": 0.5056204795837402, "rewards/rollout_reward_func/mean": 0.7686701416969299, "rewards/rollout_reward_func/std": 0.510356605052948, "sampling/importance_sampling_ratio/max": 1.2289683818817139, "sampling/importance_sampling_ratio/mean": 1.0025521516799927, "sampling/importance_sampling_ratio/min": 0.7775745987892151, "sampling/sampling_logp_difference/max": 0.25156688690185547, "sampling/sampling_logp_difference/mean": 0.010572392493486404, "step": 104, "step_time": 8.36837475700031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 487.40625, "completions/mean_terminated_length": 487.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1476985989138484, "epoch": 0.0021, "frac_reward_zero_std": 0.0, "grad_norm": 0.2932775616645813, "kl": 2.026362434029579, "learning_rate": 6.976845790173813e-05, "loss": 0.0137, "num_tokens": 4569852.0, "reward": 0.5639874935150146, "reward_std": 0.5734737515449524, "rewards/rollout_reward_func/mean": 0.5639874935150146, "rewards/rollout_reward_func/std": 0.592988908290863, "sampling/importance_sampling_ratio/max": 1.9975080490112305, "sampling/importance_sampling_ratio/mean": 1.0277347564697266, "sampling/importance_sampling_ratio/min": 0.7113316059112549, "sampling/sampling_logp_difference/max": 0.4291799068450928, "sampling/sampling_logp_difference/mean": 0.013813110068440437, "step": 105, "step_time": 8.328415839999934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 409.09375, "completions/mean_terminated_length": 409.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11719931941479445, "epoch": 0.00212, "frac_reward_zero_std": 0.5, "grad_norm": 0.1532392054796219, "kl": 1.7445210963487625, "learning_rate": 6.975465769977424e-05, "loss": 0.0058, "num_tokens": 4613282.0, "reward": 0.8366729021072388, "reward_std": 0.31211966276168823, "rewards/rollout_reward_func/mean": 0.8366729021072388, "rewards/rollout_reward_func/std": 0.4601250886917114, "sampling/importance_sampling_ratio/max": 1.116402506828308, "sampling/importance_sampling_ratio/mean": 0.9955401420593262, "sampling/importance_sampling_ratio/min": 0.6487058401107788, "sampling/sampling_logp_difference/max": 0.4737354516983032, "sampling/sampling_logp_difference/mean": 0.010037299245595932, "step": 106, "step_time": 9.061311543000102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 468.75, "completions/mean_terminated_length": 468.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1071808859705925, "epoch": 0.00214, "frac_reward_zero_std": 0.25, "grad_norm": 0.21042250096797943, "kl": 1.6240965127944946, "learning_rate": 6.974045986801684e-05, "loss": 0.0118, "num_tokens": 4659065.0, "reward": 0.7717170119285583, "reward_std": 0.40683332085609436, "rewards/rollout_reward_func/mean": 0.7717170119285583, "rewards/rollout_reward_func/std": 0.5048468708992004, "sampling/importance_sampling_ratio/max": 1.2543514966964722, "sampling/importance_sampling_ratio/mean": 1.0016342401504517, "sampling/importance_sampling_ratio/min": 0.8529443144798279, "sampling/sampling_logp_difference/max": 0.22654488682746887, "sampling/sampling_logp_difference/mean": 0.008881621062755585, "step": 107, "step_time": 8.70277123299934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 396.6875, "completions/mean_terminated_length": 396.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09364429651759565, "epoch": 0.00216, "frac_reward_zero_std": 0.25, "grad_norm": 0.28397414088249207, "kl": 2.4982657209038734, "learning_rate": 6.972586460987255e-05, "loss": -0.005, "num_tokens": 4701779.0, "reward": 0.6166483163833618, "reward_std": 0.6078213453292847, "rewards/rollout_reward_func/mean": 0.6166483163833618, "rewards/rollout_reward_func/std": 0.7126331329345703, "sampling/importance_sampling_ratio/max": 1.1536800861358643, "sampling/importance_sampling_ratio/mean": 0.996100902557373, "sampling/importance_sampling_ratio/min": 0.7578999400138855, "sampling/sampling_logp_difference/max": 0.1671619415283203, "sampling/sampling_logp_difference/mean": 0.007744008209556341, "step": 108, "step_time": 8.013261022000279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 491.6875, "completions/mean_terminated_length": 491.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06487412983551621, "epoch": 0.00218, "frac_reward_zero_std": 0.0, "grad_norm": 0.1661359965801239, "kl": 2.218500182032585, "learning_rate": 6.971087213444174e-05, "loss": 0.0033, "num_tokens": 4749279.0, "reward": 0.4031241536140442, "reward_std": 0.66425621509552, "rewards/rollout_reward_func/mean": 0.4031241536140442, "rewards/rollout_reward_func/std": 0.6419984102249146, "sampling/importance_sampling_ratio/max": 1.21169114112854, "sampling/importance_sampling_ratio/mean": 0.9936032295227051, "sampling/importance_sampling_ratio/min": 0.780785322189331, "sampling/sampling_logp_difference/max": 0.3973425626754761, "sampling/sampling_logp_difference/mean": 0.011129787191748619, "step": 109, "step_time": 8.572064380999564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 433.5, "completions/mean_terminated_length": 433.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.15283073438331485, "epoch": 0.0022, "frac_reward_zero_std": 0.25, "grad_norm": 0.5786590576171875, "kl": 1.631599947810173, "learning_rate": 6.969548265651556e-05, "loss": 0.0127, "num_tokens": 4792898.0, "reward": 0.7709787487983704, "reward_std": 0.43653905391693115, "rewards/rollout_reward_func/mean": 0.7709787487983704, "rewards/rollout_reward_func/std": 0.5063337683677673, "sampling/importance_sampling_ratio/max": 2.099424362182617, "sampling/importance_sampling_ratio/mean": 0.9183117151260376, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.290827751159668, "sampling/sampling_logp_difference/mean": 0.07958551496267319, "step": 110, "step_time": 8.095649133000279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 390.6875, "completions/mean_terminated_length": 390.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.049667182145640254, "epoch": 0.00222, "frac_reward_zero_std": 0.0, "grad_norm": 0.12858329713344574, "kl": 1.7075619101524353, "learning_rate": 6.967969639657288e-05, "loss": 0.0095, "num_tokens": 4835891.0, "reward": 0.7640657424926758, "reward_std": 0.4550207853317261, "rewards/rollout_reward_func/mean": 0.7640657424926758, "rewards/rollout_reward_func/std": 0.45319947600364685, "sampling/importance_sampling_ratio/max": 1.0411285161972046, "sampling/importance_sampling_ratio/mean": 0.9788383841514587, "sampling/importance_sampling_ratio/min": 0.47018277645111084, "sampling/sampling_logp_difference/max": 0.7546225190162659, "sampling/sampling_logp_difference/mean": 0.00914161466062069, "step": 111, "step_time": 8.40615492599909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 265.90625, "completions/mean_terminated_length": 265.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05648553936043754, "epoch": 0.00224, "frac_reward_zero_std": 0.25, "grad_norm": 0.18855810165405273, "kl": 1.648529276251793, "learning_rate": 6.966351358077707e-05, "loss": 0.0137, "num_tokens": 4873641.0, "reward": 0.7757415771484375, "reward_std": 0.41727834939956665, "rewards/rollout_reward_func/mean": 0.7757415771484375, "rewards/rollout_reward_func/std": 0.5592144727706909, "sampling/importance_sampling_ratio/max": 1.0144052505493164, "sampling/importance_sampling_ratio/mean": 0.936678409576416, "sampling/importance_sampling_ratio/min": 0.3933812379837036, "sampling/sampling_logp_difference/max": 0.9329784512519836, "sampling/sampling_logp_difference/mean": 0.023122316226363182, "step": 112, "step_time": 7.418490388999999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 252.6875, "completions/mean_terminated_length": 252.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07089687278494239, "epoch": 0.00226, "frac_reward_zero_std": 0.25, "grad_norm": 0.5723386406898499, "kl": 1.9006303548812866, "learning_rate": 6.964693444097279e-05, "loss": -0.0057, "num_tokens": 4910852.0, "reward": 0.6603022813796997, "reward_std": 0.48169267177581787, "rewards/rollout_reward_func/mean": 0.6603022813796997, "rewards/rollout_reward_func/std": 0.571934700012207, "sampling/importance_sampling_ratio/max": 1.5453569889068604, "sampling/importance_sampling_ratio/mean": 0.9794349670410156, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.0768280029296875, "sampling/sampling_logp_difference/mean": 0.038831666111946106, "step": 113, "step_time": 7.641681788999904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 395.65625, "completions/mean_terminated_length": 395.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.039838066906668246, "epoch": 0.00228, "frac_reward_zero_std": 0.0, "grad_norm": 0.2050938755273819, "kl": 1.761099100112915, "learning_rate": 6.962995921468272e-05, "loss": 0.0019, "num_tokens": 4953623.0, "reward": 0.46693187952041626, "reward_std": 0.6217513680458069, "rewards/rollout_reward_func/mean": 0.46693187952041626, "rewards/rollout_reward_func/std": 0.6492326259613037, "sampling/importance_sampling_ratio/max": 1.106942057609558, "sampling/importance_sampling_ratio/mean": 0.9798781871795654, "sampling/importance_sampling_ratio/min": 0.6083207726478577, "sampling/sampling_logp_difference/max": 0.4970542788505554, "sampling/sampling_logp_difference/mean": 0.008343815803527832, "step": 114, "step_time": 8.533304958999452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 384.5625, "completions/mean_terminated_length": 384.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.060432382859289646, "epoch": 0.0023, "frac_reward_zero_std": 0.0, "grad_norm": 0.4672287404537201, "kl": 1.6556293070316315, "learning_rate": 6.961258814510408e-05, "loss": 0.0094, "num_tokens": 4996062.0, "reward": 0.6603374481201172, "reward_std": 0.47468507289886475, "rewards/rollout_reward_func/mean": 0.6603374481201172, "rewards/rollout_reward_func/std": 0.5120614171028137, "sampling/importance_sampling_ratio/max": 1.8685352802276611, "sampling/importance_sampling_ratio/mean": 0.9285306930541992, "sampling/importance_sampling_ratio/min": 0.17895597219467163, "sampling/sampling_logp_difference/max": 1.7206169366836548, "sampling/sampling_logp_difference/mean": 0.05124187469482422, "step": 115, "step_time": 8.148025684000459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 256.71875, "completions/mean_terminated_length": 256.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06126073328778148, "epoch": 0.00232, "frac_reward_zero_std": 0.0, "grad_norm": 8.576211929321289, "kl": 8.639516994357109, "learning_rate": 6.959482148110518e-05, "loss": 0.0435, "num_tokens": 5032227.0, "reward": 0.623213529586792, "reward_std": 0.5156008005142212, "rewards/rollout_reward_func/mean": 0.623213529586792, "rewards/rollout_reward_func/std": 0.5290338397026062, "sampling/importance_sampling_ratio/max": 1.8682678937911987, "sampling/importance_sampling_ratio/mean": 0.9609127044677734, "sampling/importance_sampling_ratio/min": 0.1256217062473297, "sampling/sampling_logp_difference/max": 2.074484348297119, "sampling/sampling_logp_difference/mean": 0.06468600034713745, "step": 116, "step_time": 8.27468268299981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 318.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06389371224213392, "epoch": 0.00234, "frac_reward_zero_std": 0.0, "grad_norm": 0.4545961022377014, "kl": 2.2334485054016113, "learning_rate": 6.957665947722184e-05, "loss": 0.0043, "num_tokens": 5072197.0, "reward": 0.5617854595184326, "reward_std": 0.5770033597946167, "rewards/rollout_reward_func/mean": 0.5617854595184326, "rewards/rollout_reward_func/std": 0.5956276059150696, "sampling/importance_sampling_ratio/max": 1.1062841415405273, "sampling/importance_sampling_ratio/mean": 0.976425051689148, "sampling/importance_sampling_ratio/min": 0.7219615578651428, "sampling/sampling_logp_difference/max": 0.24492979049682617, "sampling/sampling_logp_difference/mean": 0.008697876706719398, "step": 117, "step_time": 7.940881702998922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 372.875, "completions/mean_terminated_length": 372.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.037457566475495696, "epoch": 0.00236, "frac_reward_zero_std": 0.0, "grad_norm": 0.07120159268379211, "kl": 1.6351619958877563, "learning_rate": 6.955810239365379e-05, "loss": 0.0119, "num_tokens": 5113945.0, "reward": 0.6960131525993347, "reward_std": 0.45870327949523926, "rewards/rollout_reward_func/mean": 0.6960131525993347, "rewards/rollout_reward_func/std": 0.4939889907836914, "sampling/importance_sampling_ratio/max": 1.189887285232544, "sampling/importance_sampling_ratio/mean": 1.0076897144317627, "sampling/importance_sampling_ratio/min": 0.9666085839271545, "sampling/sampling_logp_difference/max": 0.09520590305328369, "sampling/sampling_logp_difference/mean": 0.0024062409065663815, "step": 118, "step_time": 8.102439531000527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05895731248892844, "epoch": 0.00238, "frac_reward_zero_std": 0.0, "grad_norm": 0.47188901901245117, "kl": 1.7260829508304596, "learning_rate": 6.953915049626087e-05, "loss": 0.0064, "num_tokens": 5152539.0, "reward": 0.6968393921852112, "reward_std": 0.5568441152572632, "rewards/rollout_reward_func/mean": 0.6968393921852112, "rewards/rollout_reward_func/std": 0.5546013116836548, "sampling/importance_sampling_ratio/max": 1.2302684783935547, "sampling/importance_sampling_ratio/mean": 1.0094701051712036, "sampling/importance_sampling_ratio/min": 0.9583143591880798, "sampling/sampling_logp_difference/max": 0.20723068714141846, "sampling/sampling_logp_difference/mean": 0.00409814715385437, "step": 119, "step_time": 8.330380652000258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 411.65625, "completions/mean_terminated_length": 411.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06105515197850764, "epoch": 0.0024, "frac_reward_zero_std": 0.0, "grad_norm": 0.21721751987934113, "kl": 1.6542091220617294, "learning_rate": 6.951980405655927e-05, "loss": 0.0084, "num_tokens": 5196870.0, "reward": 0.5635444521903992, "reward_std": 0.5712343454360962, "rewards/rollout_reward_func/mean": 0.5635444521903992, "rewards/rollout_reward_func/std": 0.5937695503234863, "sampling/importance_sampling_ratio/max": 1.3107467889785767, "sampling/importance_sampling_ratio/mean": 1.0076525211334229, "sampling/importance_sampling_ratio/min": 0.8808911442756653, "sampling/sampling_logp_difference/max": 0.6149251461029053, "sampling/sampling_logp_difference/mean": 0.01265739742666483, "step": 120, "step_time": 8.520250364000276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 381.875, "completions/mean_terminated_length": 381.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.049431102350354195, "epoch": 0.00242, "frac_reward_zero_std": 0.25, "grad_norm": 0.0861010029911995, "kl": 1.6912771165370941, "learning_rate": 6.950006335171767e-05, "loss": -0.0019, "num_tokens": 5239095.0, "reward": 0.7072031497955322, "reward_std": 0.4696626663208008, "rewards/rollout_reward_func/mean": 0.7072031497955322, "rewards/rollout_reward_func/std": 0.5934861302375793, "sampling/importance_sampling_ratio/max": 1.1324269771575928, "sampling/importance_sampling_ratio/mean": 0.9850848913192749, "sampling/importance_sampling_ratio/min": 0.5153018236160278, "sampling/sampling_logp_difference/max": 0.296215295791626, "sampling/sampling_logp_difference/mean": 0.007136152591556311, "step": 121, "step_time": 8.439428004000092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 404.90625, "completions/mean_terminated_length": 404.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04797510977368802, "epoch": 0.00244, "frac_reward_zero_std": 0.0, "grad_norm": 0.4094468951225281, "kl": 1.764967754483223, "learning_rate": 6.947992866455315e-05, "loss": 0.0127, "num_tokens": 5281941.0, "reward": 0.6603405475616455, "reward_std": 0.5064055919647217, "rewards/rollout_reward_func/mean": 0.6603405475616455, "rewards/rollout_reward_func/std": 0.5119606852531433, "sampling/importance_sampling_ratio/max": 1.5873793363571167, "sampling/importance_sampling_ratio/mean": 0.9975490570068359, "sampling/importance_sampling_ratio/min": 0.7251269221305847, "sampling/sampling_logp_difference/max": 0.462078332901001, "sampling/sampling_logp_difference/mean": 0.010421337559819221, "step": 122, "step_time": 8.476206724000349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 336.40625, "completions/mean_terminated_length": 336.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04149223316926509, "epoch": 0.00246, "frac_reward_zero_std": 0.0, "grad_norm": 0.1540147364139557, "kl": 2.0502097755670547, "learning_rate": 6.945940028352729e-05, "loss": 0.0079, "num_tokens": 5322733.0, "reward": 0.521257758140564, "reward_std": 0.5298856496810913, "rewards/rollout_reward_func/mean": 0.521257758140564, "rewards/rollout_reward_func/std": 0.5516647696495056, "sampling/importance_sampling_ratio/max": 1.1865365505218506, "sampling/importance_sampling_ratio/mean": 1.004417896270752, "sampling/importance_sampling_ratio/min": 0.7911506295204163, "sampling/sampling_logp_difference/max": 0.23504924774169922, "sampling/sampling_logp_difference/mean": 0.004832152742892504, "step": 123, "step_time": 8.755636847000915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 352.53125, "completions/mean_terminated_length": 352.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04489810473751277, "epoch": 0.00248, "frac_reward_zero_std": 0.0, "grad_norm": 0.10008637607097626, "kl": 1.729919746518135, "learning_rate": 6.943847850274189e-05, "loss": -0.0002, "num_tokens": 5364045.0, "reward": 0.6558245420455933, "reward_std": 0.5139605402946472, "rewards/rollout_reward_func/mean": 0.6558245420455933, "rewards/rollout_reward_func/std": 0.51876300573349, "sampling/importance_sampling_ratio/max": 1.0309457778930664, "sampling/importance_sampling_ratio/mean": 1.0003764629364014, "sampling/importance_sampling_ratio/min": 0.9871845245361328, "sampling/sampling_logp_difference/max": 0.13472747802734375, "sampling/sampling_logp_difference/mean": 0.003076164284721017, "step": 124, "step_time": 8.377294558999893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0585522479377687, "epoch": 0.0025, "frac_reward_zero_std": 0.0, "grad_norm": 0.1530948430299759, "kl": 1.8962959796190262, "learning_rate": 6.94171636219349e-05, "loss": -0.0008, "num_tokens": 5405215.0, "reward": 0.5011724829673767, "reward_std": 0.6464757919311523, "rewards/rollout_reward_func/mean": 0.5011724829673767, "rewards/rollout_reward_func/std": 0.647203266620636, "sampling/importance_sampling_ratio/max": 1.0165759325027466, "sampling/importance_sampling_ratio/mean": 0.9888937473297119, "sampling/importance_sampling_ratio/min": 0.7571166157722473, "sampling/sampling_logp_difference/max": 0.28024613857269287, "sampling/sampling_logp_difference/mean": 0.0037572914734482765, "step": 125, "step_time": 8.887125216000186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 449.5, "completions/mean_terminated_length": 449.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.036634604912251234, "epoch": 0.00252, "frac_reward_zero_std": 0.0, "grad_norm": 0.12788502871990204, "kl": 1.7886826545000076, "learning_rate": 6.939545594647603e-05, "loss": 0.0096, "num_tokens": 5449989.0, "reward": 0.7271150350570679, "reward_std": 0.47620663046836853, "rewards/rollout_reward_func/mean": 0.7271150350570679, "rewards/rollout_reward_func/std": 0.48027390241622925, "sampling/importance_sampling_ratio/max": 1.1287072896957397, "sampling/importance_sampling_ratio/mean": 0.9990203380584717, "sampling/importance_sampling_ratio/min": 0.8802892565727234, "sampling/sampling_logp_difference/max": 0.127532958984375, "sampling/sampling_logp_difference/mean": 0.002252740552648902, "step": 126, "step_time": 8.549569208999856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 405.28125, "completions/mean_terminated_length": 405.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08991920086555183, "epoch": 0.00254, "frac_reward_zero_std": 0.0, "grad_norm": 0.21976742148399353, "kl": 1.778988093137741, "learning_rate": 6.937335578736238e-05, "loss": -0.002, "num_tokens": 5493345.0, "reward": 0.6024448871612549, "reward_std": 0.6412853002548218, "rewards/rollout_reward_func/mean": 0.6024448871612549, "rewards/rollout_reward_func/std": 0.6338467597961426, "sampling/importance_sampling_ratio/max": 1.0722838640213013, "sampling/importance_sampling_ratio/mean": 0.9917869567871094, "sampling/importance_sampling_ratio/min": 0.7798120975494385, "sampling/sampling_logp_difference/max": 0.24870634078979492, "sampling/sampling_logp_difference/mean": 0.004676412791013718, "step": 127, "step_time": 8.307155995000358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 493.625, "completions/mean_terminated_length": 493.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.061719701858237386, "epoch": 0.00256, "frac_reward_zero_std": 0.0, "grad_norm": 0.1902787685394287, "kl": 1.7388941049575806, "learning_rate": 6.935086346121403e-05, "loss": 0.0163, "num_tokens": 5540166.0, "reward": 0.6273924112319946, "reward_std": 0.474701464176178, "rewards/rollout_reward_func/mean": 0.6273924112319946, "rewards/rollout_reward_func/std": 0.5232400298118591, "sampling/importance_sampling_ratio/max": 1.0163652896881104, "sampling/importance_sampling_ratio/mean": 0.9982985258102417, "sampling/importance_sampling_ratio/min": 0.9351279139518738, "sampling/sampling_logp_difference/max": 0.089058518409729, "sampling/sampling_logp_difference/mean": 0.0022086985409259796, "step": 128, "step_time": 8.55156641099893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 409.09375, "completions/mean_terminated_length": 409.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06503343977965415, "epoch": 0.00258, "frac_reward_zero_std": 0.25, "grad_norm": 0.11314985156059265, "kl": 1.6655625700950623, "learning_rate": 6.932797929026946e-05, "loss": 0.0023, "num_tokens": 5583245.0, "reward": 0.6994686126708984, "reward_std": 0.436296284198761, "rewards/rollout_reward_func/mean": 0.6994686126708984, "rewards/rollout_reward_func/std": 0.5503652691841125, "sampling/importance_sampling_ratio/max": 1.1665102243423462, "sampling/importance_sampling_ratio/mean": 1.001559853553772, "sampling/importance_sampling_ratio/min": 0.8893438577651978, "sampling/sampling_logp_difference/max": 0.14663994312286377, "sampling/sampling_logp_difference/mean": 0.003469959134235978, "step": 129, "step_time": 8.024904284000968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 416.09375, "completions/mean_terminated_length": 416.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06782270362600684, "epoch": 0.0026, "frac_reward_zero_std": 0.25, "grad_norm": 0.22241033613681793, "kl": 1.7370643466711044, "learning_rate": 6.930470360238097e-05, "loss": 0.0064, "num_tokens": 5625592.0, "reward": 0.7618658542633057, "reward_std": 0.3928532302379608, "rewards/rollout_reward_func/mean": 0.7618658542633057, "rewards/rollout_reward_func/std": 0.45730534195899963, "sampling/importance_sampling_ratio/max": 1.0976089239120483, "sampling/importance_sampling_ratio/mean": 1.0038659572601318, "sampling/importance_sampling_ratio/min": 0.9126062393188477, "sampling/sampling_logp_difference/max": 0.09300017356872559, "sampling/sampling_logp_difference/mean": 0.004241408314555883, "step": 130, "step_time": 8.32918149599982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2526.0, "completions/max_terminated_length": 2526.0, "completions/mean_length": 537.0625, "completions/mean_terminated_length": 537.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11232386901974678, "epoch": 0.00262, "frac_reward_zero_std": 0.25, "grad_norm": 0.19914263486862183, "kl": 1.6124058216810226, "learning_rate": 6.928103673100996e-05, "loss": 0.0134, "num_tokens": 5674512.0, "reward": 0.7372391223907471, "reward_std": 0.42708802223205566, "rewards/rollout_reward_func/mean": 0.7372391223907471, "rewards/rollout_reward_func/std": 0.5279713869094849, "sampling/importance_sampling_ratio/max": 1.1086699962615967, "sampling/importance_sampling_ratio/mean": 0.9869132041931152, "sampling/importance_sampling_ratio/min": 0.7251589298248291, "sampling/sampling_logp_difference/max": 0.2999105453491211, "sampling/sampling_logp_difference/mean": 0.00835170317441225, "step": 131, "step_time": 12.70340933099942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 493.15625, "completions/mean_terminated_length": 493.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1036450732499361, "epoch": 0.00264, "frac_reward_zero_std": 0.0, "grad_norm": 0.22418592870235443, "kl": 1.189106211066246, "learning_rate": 6.925697901522216e-05, "loss": 0.0102, "num_tokens": 5720451.0, "reward": 0.7663019895553589, "reward_std": 0.45075100660324097, "rewards/rollout_reward_func/mean": 0.7663019895553589, "rewards/rollout_reward_func/std": 0.44891929626464844, "sampling/importance_sampling_ratio/max": 1.420162320137024, "sampling/importance_sampling_ratio/mean": 0.9603314399719238, "sampling/importance_sampling_ratio/min": 0.45742419362068176, "sampling/sampling_logp_difference/max": 0.7829279899597168, "sampling/sampling_logp_difference/mean": 0.024587398394942284, "step": 132, "step_time": 10.090436069999669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 423.65625, "completions/mean_terminated_length": 423.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1285006799735129, "epoch": 0.00266, "frac_reward_zero_std": 0.25, "grad_norm": 0.19017264246940613, "kl": 1.4345169216394424, "learning_rate": 6.923253079968278e-05, "loss": 0.0193, "num_tokens": 5763838.0, "reward": 0.7736271619796753, "reward_std": 0.4055019021034241, "rewards/rollout_reward_func/mean": 0.7736271619796753, "rewards/rollout_reward_func/std": 0.49772101640701294, "sampling/importance_sampling_ratio/max": 1.762739658355713, "sampling/importance_sampling_ratio/mean": 1.0055733919143677, "sampling/importance_sampling_ratio/min": 0.7579171657562256, "sampling/sampling_logp_difference/max": 0.5607509613037109, "sampling/sampling_logp_difference/mean": 0.015371108427643776, "step": 133, "step_time": 9.161538816999382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 376.84375, "completions/mean_terminated_length": 376.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14960732590407133, "epoch": 0.00268, "frac_reward_zero_std": 0.25, "grad_norm": 0.19790472090244293, "kl": 1.4920354634523392, "learning_rate": 6.92076924346515e-05, "loss": -0.0037, "num_tokens": 5806142.0, "reward": 0.589077353477478, "reward_std": 0.6109552383422852, "rewards/rollout_reward_func/mean": 0.589077353477478, "rewards/rollout_reward_func/std": 0.7544901967048645, "sampling/importance_sampling_ratio/max": 1.3257403373718262, "sampling/importance_sampling_ratio/mean": 0.9894411563873291, "sampling/importance_sampling_ratio/min": 0.7823004722595215, "sampling/sampling_logp_difference/max": 0.28122401237487793, "sampling/sampling_logp_difference/mean": 0.014850204810500145, "step": 134, "step_time": 8.218136736999895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 529.0, "completions/mean_terminated_length": 529.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17403697315603495, "epoch": 0.0027, "frac_reward_zero_std": 0.0, "grad_norm": 0.34759271144866943, "kl": 1.7940292358398438, "learning_rate": 6.918246427597761e-05, "loss": -0.0141, "num_tokens": 5853615.0, "reward": 0.5549989342689514, "reward_std": 0.7460665702819824, "rewards/rollout_reward_func/mean": 0.5549989342689514, "rewards/rollout_reward_func/std": 0.7627648711204529, "sampling/importance_sampling_ratio/max": 1.495866298675537, "sampling/importance_sampling_ratio/mean": 1.0274715423583984, "sampling/importance_sampling_ratio/min": 0.7577843070030212, "sampling/sampling_logp_difference/max": 0.36718177795410156, "sampling/sampling_logp_difference/mean": 0.019773241132497787, "step": 135, "step_time": 11.131566321999799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 419.15625, "completions/mean_terminated_length": 419.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1494435193017125, "epoch": 0.00272, "frac_reward_zero_std": 0.0, "grad_norm": 0.5919251441955566, "kl": 1.6999053806066513, "learning_rate": 6.915684668509476e-05, "loss": -0.0011, "num_tokens": 5896464.0, "reward": 0.5612733364105225, "reward_std": 0.8076068162918091, "rewards/rollout_reward_func/mean": 0.5612733364105225, "rewards/rollout_reward_func/std": 0.7968980669975281, "sampling/importance_sampling_ratio/max": 1.9047027826309204, "sampling/importance_sampling_ratio/mean": 1.0197663307189941, "sampling/importance_sampling_ratio/min": 0.6657741665840149, "sampling/sampling_logp_difference/max": 0.6446270942687988, "sampling/sampling_logp_difference/mean": 0.018227962777018547, "step": 136, "step_time": 10.116442246999668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 369.1875, "completions/mean_terminated_length": 369.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11229250766336918, "epoch": 0.00274, "frac_reward_zero_std": 0.25, "grad_norm": 0.6677427291870117, "kl": 1.9892034083604813, "learning_rate": 6.913084002901584e-05, "loss": 0.0679, "num_tokens": 5937579.0, "reward": 0.7424792051315308, "reward_std": 0.4947810173034668, "rewards/rollout_reward_func/mean": 0.7424792051315308, "rewards/rollout_reward_func/std": 0.5729233026504517, "sampling/importance_sampling_ratio/max": 1.7643640041351318, "sampling/importance_sampling_ratio/mean": 1.0196212530136108, "sampling/importance_sampling_ratio/min": 0.6030710339546204, "sampling/sampling_logp_difference/max": 0.5033688545227051, "sampling/sampling_logp_difference/mean": 0.01576850190758705, "step": 137, "step_time": 10.549261247999766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 410.8125, "completions/mean_terminated_length": 410.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08885523211210966, "epoch": 0.00276, "frac_reward_zero_std": 0.0, "grad_norm": 0.5994117259979248, "kl": 1.891626924276352, "learning_rate": 6.910444468032774e-05, "loss": 0.0672, "num_tokens": 5980290.0, "reward": 0.6409123539924622, "reward_std": 0.4703308939933777, "rewards/rollout_reward_func/mean": 0.6409123539924622, "rewards/rollout_reward_func/std": 0.6158703565597534, "sampling/importance_sampling_ratio/max": 2.3112781047821045, "sampling/importance_sampling_ratio/mean": 1.053478479385376, "sampling/importance_sampling_ratio/min": 0.8892855644226074, "sampling/sampling_logp_difference/max": 0.7605465650558472, "sampling/sampling_logp_difference/mean": 0.016185229644179344, "step": 138, "step_time": 10.743810512000437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 428.34375, "completions/mean_terminated_length": 428.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08729152474552393, "epoch": 0.00278, "frac_reward_zero_std": 0.25, "grad_norm": 0.09601018577814102, "kl": 2.0200949758291245, "learning_rate": 6.907766101718596e-05, "loss": 0.0097, "num_tokens": 6024108.0, "reward": 0.8051615953445435, "reward_std": 0.3803647756576538, "rewards/rollout_reward_func/mean": 0.8051615953445435, "rewards/rollout_reward_func/std": 0.48248007893562317, "sampling/importance_sampling_ratio/max": 1.2999780178070068, "sampling/importance_sampling_ratio/mean": 1.0022921562194824, "sampling/importance_sampling_ratio/min": 0.8671300411224365, "sampling/sampling_logp_difference/max": 0.251309871673584, "sampling/sampling_logp_difference/mean": 0.007208333350718021, "step": 139, "step_time": 8.09924808300002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 347.0625, "completions/mean_terminated_length": 347.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1412167316302657, "epoch": 0.0028, "frac_reward_zero_std": 0.0, "grad_norm": 0.243582621216774, "kl": 2.5252844989299774, "learning_rate": 6.905048942330926e-05, "loss": 0.0279, "num_tokens": 6064761.0, "reward": 0.5078032612800598, "reward_std": 0.6778466105461121, "rewards/rollout_reward_func/mean": 0.5078032612800598, "rewards/rollout_reward_func/std": 0.6905280947685242, "sampling/importance_sampling_ratio/max": 1.1858367919921875, "sampling/importance_sampling_ratio/mean": 0.9849872589111328, "sampling/importance_sampling_ratio/min": 0.7739213109016418, "sampling/sampling_logp_difference/max": 0.25033265352249146, "sampling/sampling_logp_difference/mean": 0.011538430117070675, "step": 140, "step_time": 8.622904346000723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 549.9375, "completions/mean_terminated_length": 549.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.12284017167985439, "epoch": 0.00282, "frac_reward_zero_std": 0.25, "grad_norm": 0.38300821185112, "kl": 2.5409726798534393, "learning_rate": 6.902293028797413e-05, "loss": 0.0117, "num_tokens": 6114066.0, "reward": 0.6743726134300232, "reward_std": 0.506584882736206, "rewards/rollout_reward_func/mean": 0.6743726134300232, "rewards/rollout_reward_func/std": 0.6056241989135742, "sampling/importance_sampling_ratio/max": 2.0709011554718018, "sampling/importance_sampling_ratio/mean": 1.0216888189315796, "sampling/importance_sampling_ratio/min": 0.8089287281036377, "sampling/sampling_logp_difference/max": 0.9610195159912109, "sampling/sampling_logp_difference/mean": 0.013189367949962616, "step": 141, "step_time": 8.909329409000748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 467.625, "completions/mean_terminated_length": 467.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.12430033041164279, "epoch": 0.00284, "frac_reward_zero_std": 0.0, "grad_norm": 0.7742959260940552, "kl": 2.1053836345672607, "learning_rate": 6.899498400600917e-05, "loss": 0.0273, "num_tokens": 6159383.0, "reward": 0.6654000878334045, "reward_std": 0.49736326932907104, "rewards/rollout_reward_func/mean": 0.6654000878334045, "rewards/rollout_reward_func/std": 0.5650418400764465, "sampling/importance_sampling_ratio/max": 2.9543092250823975, "sampling/importance_sampling_ratio/mean": 1.065422534942627, "sampling/importance_sampling_ratio/min": 0.7036713361740112, "sampling/sampling_logp_difference/max": 0.7008264064788818, "sampling/sampling_logp_difference/mean": 0.020889047533273697, "step": 142, "step_time": 8.635623016999944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 432.78125, "completions/mean_terminated_length": 432.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1647682460024953, "epoch": 0.00286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2879779040813446, "kl": 2.4868428111076355, "learning_rate": 6.89666509777895e-05, "loss": 0.0502, "num_tokens": 6202739.0, "reward": 0.6137266159057617, "reward_std": 0.6815221309661865, "rewards/rollout_reward_func/mean": 0.6137266159057617, "rewards/rollout_reward_func/std": 0.6681612133979797, "sampling/importance_sampling_ratio/max": 1.4290919303894043, "sampling/importance_sampling_ratio/mean": 1.0040864944458008, "sampling/importance_sampling_ratio/min": 0.4210340082645416, "sampling/sampling_logp_difference/max": 0.8334367275238037, "sampling/sampling_logp_difference/mean": 0.017581667751073837, "step": 143, "step_time": 8.936409403000198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 340.78125, "completions/mean_terminated_length": 340.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.15722141321748495, "epoch": 0.00288, "frac_reward_zero_std": 0.0, "grad_norm": 0.16277821362018585, "kl": 2.65716552734375, "learning_rate": 6.893793160923101e-05, "loss": 0.0099, "num_tokens": 6244081.0, "reward": 0.4794809818267822, "reward_std": 0.7292118072509766, "rewards/rollout_reward_func/mean": 0.4794809818267822, "rewards/rollout_reward_func/std": 0.731340229511261, "sampling/importance_sampling_ratio/max": 1.1101821660995483, "sampling/importance_sampling_ratio/mean": 0.9790865182876587, "sampling/importance_sampling_ratio/min": 0.481206476688385, "sampling/sampling_logp_difference/max": 0.8111705780029297, "sampling/sampling_logp_difference/mean": 0.013802092522382736, "step": 144, "step_time": 8.491798626999753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 476.59375, "completions/mean_terminated_length": 476.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2349953092634678, "epoch": 0.0029, "frac_reward_zero_std": 0.0, "grad_norm": 0.3454694151878357, "kl": 3.0731483548879623, "learning_rate": 6.89088263117845e-05, "loss": 0.0325, "num_tokens": 6290776.0, "reward": 0.6040980219841003, "reward_std": 0.6291488409042358, "rewards/rollout_reward_func/mean": 0.6040980219841003, "rewards/rollout_reward_func/std": 0.6289886236190796, "sampling/importance_sampling_ratio/max": 1.199967384338379, "sampling/importance_sampling_ratio/mean": 0.986351490020752, "sampling/importance_sampling_ratio/min": 0.43127188086509705, "sampling/sampling_logp_difference/max": 0.5418962240219116, "sampling/sampling_logp_difference/mean": 0.018624454736709595, "step": 145, "step_time": 8.310214066999833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 406.4375, "completions/mean_terminated_length": 406.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11199789587408304, "epoch": 0.00292, "frac_reward_zero_std": 0.25, "grad_norm": 0.05417755991220474, "kl": 2.2099212408065796, "learning_rate": 6.887933550242984e-05, "loss": 0.0173, "num_tokens": 6333554.0, "reward": 0.7309483289718628, "reward_std": 0.37760108709335327, "rewards/rollout_reward_func/mean": 0.7309483289718628, "rewards/rollout_reward_func/std": 0.4735129773616791, "sampling/importance_sampling_ratio/max": 1.1231882572174072, "sampling/importance_sampling_ratio/mean": 0.995548665523529, "sampling/importance_sampling_ratio/min": 0.8350507020950317, "sampling/sampling_logp_difference/max": 0.14621257781982422, "sampling/sampling_logp_difference/mean": 0.007376141846179962, "step": 146, "step_time": 8.084056337999755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 444.8125, "completions/mean_terminated_length": 444.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.109924694057554, "epoch": 0.00294, "frac_reward_zero_std": 0.0, "grad_norm": 0.0942542627453804, "kl": 2.07125623524189, "learning_rate": 6.884945960366996e-05, "loss": 0.0157, "num_tokens": 6378376.0, "reward": 0.6287172436714172, "reward_std": 0.533454418182373, "rewards/rollout_reward_func/mean": 0.6287172436714172, "rewards/rollout_reward_func/std": 0.5213602781295776, "sampling/importance_sampling_ratio/max": 1.108890175819397, "sampling/importance_sampling_ratio/mean": 0.9939414262771606, "sampling/importance_sampling_ratio/min": 0.8660221099853516, "sampling/sampling_logp_difference/max": 0.1435680389404297, "sampling/sampling_logp_difference/mean": 0.004937987308949232, "step": 147, "step_time": 8.512633863999781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 322.375, "completions/mean_terminated_length": 322.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14297241857275367, "epoch": 0.00296, "frac_reward_zero_std": 0.0, "grad_norm": 0.1013653501868248, "kl": 2.6017225682735443, "learning_rate": 6.881919904352485e-05, "loss": 0.004, "num_tokens": 6417930.0, "reward": 0.603690505027771, "reward_std": 0.6464805603027344, "rewards/rollout_reward_func/mean": 0.603690505027771, "rewards/rollout_reward_func/std": 0.6295469999313354, "sampling/importance_sampling_ratio/max": 1.1779271364212036, "sampling/importance_sampling_ratio/mean": 0.9952632188796997, "sampling/importance_sampling_ratio/min": 0.7660879492759705, "sampling/sampling_logp_difference/max": 0.1279224157333374, "sampling/sampling_logp_difference/mean": 0.007327167317271233, "step": 148, "step_time": 8.007958097001392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 365.25, "completions/mean_terminated_length": 365.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.19186076894402504, "epoch": 0.00298, "frac_reward_zero_std": 0.0, "grad_norm": 0.49788305163383484, "kl": 1.9299955815076828, "learning_rate": 6.878855425552531e-05, "loss": 0.0007, "num_tokens": 6460263.0, "reward": 0.5284366607666016, "reward_std": 0.5895458459854126, "rewards/rollout_reward_func/mean": 0.5284366607666016, "rewards/rollout_reward_func/std": 0.5988034605979919, "sampling/importance_sampling_ratio/max": 1.188763976097107, "sampling/importance_sampling_ratio/mean": 0.9809389710426331, "sampling/importance_sampling_ratio/min": 0.39090976119041443, "sampling/sampling_logp_difference/max": 0.34725141525268555, "sampling/sampling_logp_difference/mean": 0.014049873687326908, "step": 149, "step_time": 8.630367029999434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 471.125, "completions/mean_terminated_length": 471.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.15333914011716843, "epoch": 0.003, "frac_reward_zero_std": 0.0, "grad_norm": 0.04265453293919563, "kl": 2.1754404455423355, "learning_rate": 6.875752567870691e-05, "loss": 0.0072, "num_tokens": 6506096.0, "reward": 0.5308393239974976, "reward_std": 0.5865603685379028, "rewards/rollout_reward_func/mean": 0.5308393239974976, "rewards/rollout_reward_func/std": 0.5961723327636719, "sampling/importance_sampling_ratio/max": 1.1724435091018677, "sampling/importance_sampling_ratio/mean": 0.9923365712165833, "sampling/importance_sampling_ratio/min": 0.9485777616500854, "sampling/sampling_logp_difference/max": 0.15984201431274414, "sampling/sampling_logp_difference/mean": 0.006490117870271206, "step": 150, "step_time": 8.198367182000766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 345.6875, "completions/mean_terminated_length": 345.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17767126020044088, "epoch": 0.00302, "frac_reward_zero_std": 0.0, "grad_norm": 0.13997304439544678, "kl": 2.375278726220131, "learning_rate": 6.872611375760355e-05, "loss": 0.0166, "num_tokens": 6547049.0, "reward": 0.7023141384124756, "reward_std": 0.5214084386825562, "rewards/rollout_reward_func/mean": 0.7023141384124756, "rewards/rollout_reward_func/std": 0.544910728931427, "sampling/importance_sampling_ratio/max": 1.0752495527267456, "sampling/importance_sampling_ratio/mean": 0.9812922477722168, "sampling/importance_sampling_ratio/min": 0.711997926235199, "sampling/sampling_logp_difference/max": 0.19846177101135254, "sampling/sampling_logp_difference/mean": 0.01130919810384512, "step": 151, "step_time": 7.944414528999914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 348.71875, "completions/mean_terminated_length": 348.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17527879308909178, "epoch": 0.00304, "frac_reward_zero_std": 0.0, "grad_norm": 0.1765938252210617, "kl": 2.01448455452919, "learning_rate": 6.869431894224114e-05, "loss": 0.0097, "num_tokens": 6587453.0, "reward": 0.6667919754981995, "reward_std": 0.5543959140777588, "rewards/rollout_reward_func/mean": 0.6667919754981995, "rewards/rollout_reward_func/std": 0.5601391792297363, "sampling/importance_sampling_ratio/max": 1.376830816268921, "sampling/importance_sampling_ratio/mean": 1.0110423564910889, "sampling/importance_sampling_ratio/min": 0.7547652125358582, "sampling/sampling_logp_difference/max": 0.2969411611557007, "sampling/sampling_logp_difference/mean": 0.01429503783583641, "step": 152, "step_time": 8.405341950000548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 409.28125, "completions/mean_terminated_length": 409.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22015654109418392, "epoch": 0.00306, "frac_reward_zero_std": 0.0, "grad_norm": 0.2541674077510834, "kl": 2.2020594775676727, "learning_rate": 6.866214168813121e-05, "loss": 0.01, "num_tokens": 6631057.0, "reward": 0.7622162103652954, "reward_std": 0.4731295108795166, "rewards/rollout_reward_func/mean": 0.7622162103652954, "rewards/rollout_reward_func/std": 0.45668280124664307, "sampling/importance_sampling_ratio/max": 1.1427841186523438, "sampling/importance_sampling_ratio/mean": 0.9945514798164368, "sampling/importance_sampling_ratio/min": 0.8663008213043213, "sampling/sampling_logp_difference/max": 0.27170607447624207, "sampling/sampling_logp_difference/mean": 0.01537714060395956, "step": 153, "step_time": 8.021342247999655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 491.90625, "completions/mean_terminated_length": 491.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2341323122382164, "epoch": 0.00308, "frac_reward_zero_std": 0.0, "grad_norm": 0.36237457394599915, "kl": 2.6906689554452896, "learning_rate": 6.86295824562643e-05, "loss": 0.0277, "num_tokens": 6677937.0, "reward": 0.6151905059814453, "reward_std": 0.6055624485015869, "rewards/rollout_reward_func/mean": 0.6151905059814453, "rewards/rollout_reward_func/std": 0.6653425097465515, "sampling/importance_sampling_ratio/max": 1.2518352270126343, "sampling/importance_sampling_ratio/mean": 0.986686110496521, "sampling/importance_sampling_ratio/min": 0.7055495977401733, "sampling/sampling_logp_difference/max": 0.23240160942077637, "sampling/sampling_logp_difference/mean": 0.013557899743318558, "step": 154, "step_time": 8.846915288998844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 326.125, "completions/mean_terminated_length": 326.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.16333370935171843, "epoch": 0.0031, "frac_reward_zero_std": 0.25, "grad_norm": 0.6206619143486023, "kl": 3.6357462257146835, "learning_rate": 6.859664171310341e-05, "loss": 0.0262, "num_tokens": 6716610.0, "reward": 0.5785986185073853, "reward_std": 0.5241948962211609, "rewards/rollout_reward_func/mean": 0.5785986185073853, "rewards/rollout_reward_func/std": 0.6780737042427063, "sampling/importance_sampling_ratio/max": 1.1293368339538574, "sampling/importance_sampling_ratio/mean": 0.9958474040031433, "sampling/importance_sampling_ratio/min": 0.8375458121299744, "sampling/sampling_logp_difference/max": 0.17715036869049072, "sampling/sampling_logp_difference/mean": 0.010078039020299911, "step": 155, "step_time": 7.990782318000129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 535.0, "completions/mean_terminated_length": 535.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2376995887607336, "epoch": 0.00312, "frac_reward_zero_std": 0.0, "grad_norm": 0.2993471622467041, "kl": 2.668097272515297, "learning_rate": 6.856331993057726e-05, "loss": 0.0127, "num_tokens": 6764926.0, "reward": 0.6682635545730591, "reward_std": 0.5521674156188965, "rewards/rollout_reward_func/mean": 0.6682635545730591, "rewards/rollout_reward_func/std": 0.5607436895370483, "sampling/importance_sampling_ratio/max": 1.4742839336395264, "sampling/importance_sampling_ratio/mean": 1.021243929862976, "sampling/importance_sampling_ratio/min": 0.49732884764671326, "sampling/sampling_logp_difference/max": 0.38766372203826904, "sampling/sampling_logp_difference/mean": 0.020889626815915108, "step": 156, "step_time": 8.369116508999468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 391.8125, "completions/mean_terminated_length": 391.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.23285317234694958, "epoch": 0.00314, "frac_reward_zero_std": 0.25, "grad_norm": 0.18278354406356812, "kl": 2.7516681402921677, "learning_rate": 6.852961758607362e-05, "loss": 0.0125, "num_tokens": 6807215.0, "reward": 0.6662008762359619, "reward_std": 0.47479134798049927, "rewards/rollout_reward_func/mean": 0.6662008762359619, "rewards/rollout_reward_func/std": 0.5640289187431335, "sampling/importance_sampling_ratio/max": 1.112220048904419, "sampling/importance_sampling_ratio/mean": 0.9654578566551208, "sampling/importance_sampling_ratio/min": 0.8039438724517822, "sampling/sampling_logp_difference/max": 0.19369831681251526, "sampling/sampling_logp_difference/mean": 0.016382649540901184, "step": 157, "step_time": 8.439679926999816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 190.84375, "completions/mean_terminated_length": 190.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.13776259496808052, "epoch": 0.00316, "frac_reward_zero_std": 0.25, "grad_norm": 0.17417091131210327, "kl": 2.1835216879844666, "learning_rate": 6.84955351624324e-05, "loss": 0.005, "num_tokens": 6840033.0, "reward": 0.7664922475814819, "reward_std": 0.4252150058746338, "rewards/rollout_reward_func/mean": 0.7664922475814819, "rewards/rollout_reward_func/std": 0.5141245722770691, "sampling/importance_sampling_ratio/max": 1.1223676204681396, "sampling/importance_sampling_ratio/mean": 1.0026180744171143, "sampling/importance_sampling_ratio/min": 0.8996183276176453, "sampling/sampling_logp_difference/max": 0.24655961990356445, "sampling/sampling_logp_difference/mean": 0.010646210052073002, "step": 158, "step_time": 7.851220569999896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 357.46875, "completions/mean_terminated_length": 357.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17811238660942763, "epoch": 0.00318, "frac_reward_zero_std": 0.0, "grad_norm": 0.21772460639476776, "kl": 2.536572389304638, "learning_rate": 6.846107314793875e-05, "loss": 0.0225, "num_tokens": 6879972.0, "reward": 0.7087434530258179, "reward_std": 0.5834853649139404, "rewards/rollout_reward_func/mean": 0.7087434530258179, "rewards/rollout_reward_func/std": 0.5909309983253479, "sampling/importance_sampling_ratio/max": 1.219282865524292, "sampling/importance_sampling_ratio/mean": 1.002859115600586, "sampling/importance_sampling_ratio/min": 0.9269883036613464, "sampling/sampling_logp_difference/max": 0.20642352104187012, "sampling/sampling_logp_difference/mean": 0.00855359435081482, "step": 159, "step_time": 7.961617491999732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 376.1875, "completions/mean_terminated_length": 376.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.16091987304389477, "epoch": 0.0032, "frac_reward_zero_std": 0.25, "grad_norm": 0.13054226338863373, "kl": 2.256314054131508, "learning_rate": 6.842623203631602e-05, "loss": 0.0156, "num_tokens": 6922512.0, "reward": 0.6309266686439514, "reward_std": 0.44880473613739014, "rewards/rollout_reward_func/mean": 0.6309266686439514, "rewards/rollout_reward_func/std": 0.5774557590484619, "sampling/importance_sampling_ratio/max": 1.237082600593567, "sampling/importance_sampling_ratio/mean": 0.9855228662490845, "sampling/importance_sampling_ratio/min": 0.696254551410675, "sampling/sampling_logp_difference/max": 0.23336458206176758, "sampling/sampling_logp_difference/mean": 0.008451390080153942, "step": 160, "step_time": 8.027507409000009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 414.125, "completions/mean_terminated_length": 414.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10047629912151024, "epoch": 0.00322, "frac_reward_zero_std": 0.0, "grad_norm": 0.06133757531642914, "kl": 2.040329024195671, "learning_rate": 6.839101232671883e-05, "loss": 0.0159, "num_tokens": 6965303.0, "reward": 0.8651517629623413, "reward_std": 0.38140833377838135, "rewards/rollout_reward_func/mean": 0.8651517629623413, "rewards/rollout_reward_func/std": 0.3624863624572754, "sampling/importance_sampling_ratio/max": 1.1353775262832642, "sampling/importance_sampling_ratio/mean": 1.0017962455749512, "sampling/importance_sampling_ratio/min": 0.8364040851593018, "sampling/sampling_logp_difference/max": 0.17000633478164673, "sampling/sampling_logp_difference/mean": 0.003716979408636689, "step": 161, "step_time": 7.767446645000291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 361.25, "completions/mean_terminated_length": 361.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08867920422926545, "epoch": 0.00324, "frac_reward_zero_std": 0.25, "grad_norm": 0.01726979948580265, "kl": 1.9614118188619614, "learning_rate": 6.835541452372573e-05, "loss": 0.0122, "num_tokens": 7006777.0, "reward": 0.6591328978538513, "reward_std": 0.36322152614593506, "rewards/rollout_reward_func/mean": 0.6591328978538513, "rewards/rollout_reward_func/std": 0.5138141512870789, "sampling/importance_sampling_ratio/max": 1.0143826007843018, "sampling/importance_sampling_ratio/mean": 0.9977243542671204, "sampling/importance_sampling_ratio/min": 0.8961117267608643, "sampling/sampling_logp_difference/max": 0.10951113700866699, "sampling/sampling_logp_difference/mean": 0.0017522887792438269, "step": 162, "step_time": 8.470875069000158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 329.96875, "completions/mean_terminated_length": 329.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11715568928048015, "epoch": 0.00326, "frac_reward_zero_std": 0.0, "grad_norm": 0.09424084424972534, "kl": 1.5954171046614647, "learning_rate": 6.831943913733208e-05, "loss": 0.003, "num_tokens": 7046740.0, "reward": 0.7267489433288574, "reward_std": 0.46610280871391296, "rewards/rollout_reward_func/mean": 0.7267489433288574, "rewards/rollout_reward_func/std": 0.4812053143978119, "sampling/importance_sampling_ratio/max": 1.1439964771270752, "sampling/importance_sampling_ratio/mean": 1.0060396194458008, "sampling/importance_sampling_ratio/min": 0.9480236172676086, "sampling/sampling_logp_difference/max": 0.13454818725585938, "sampling/sampling_logp_difference/mean": 0.0032766172662377357, "step": 163, "step_time": 8.290685211999971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 341.375, "completions/mean_terminated_length": 341.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14245759323239326, "epoch": 0.00328, "frac_reward_zero_std": 0.25, "grad_norm": 0.10498303174972534, "kl": 2.2534817159175873, "learning_rate": 6.828308668294278e-05, "loss": -0.0001, "num_tokens": 7087479.0, "reward": 0.6713604927062988, "reward_std": 0.5026332139968872, "rewards/rollout_reward_func/mean": 0.6713604927062988, "rewards/rollout_reward_func/std": 0.6081782579421997, "sampling/importance_sampling_ratio/max": 1.0306373834609985, "sampling/importance_sampling_ratio/mean": 0.987622857093811, "sampling/importance_sampling_ratio/min": 0.8876676559448242, "sampling/sampling_logp_difference/max": 0.11920475959777832, "sampling/sampling_logp_difference/mean": 0.004765721037983894, "step": 164, "step_time": 8.166015415000402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 319.5, "completions/mean_terminated_length": 319.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.12448371879872866, "epoch": 0.0033, "frac_reward_zero_std": 0.25, "grad_norm": 0.09654010087251663, "kl": 2.0997682213783264, "learning_rate": 6.824635768136478e-05, "loss": 0.0032, "num_tokens": 7127221.0, "reward": 0.8002113103866577, "reward_std": 0.3861098885536194, "rewards/rollout_reward_func/mean": 0.8002113103866577, "rewards/rollout_reward_func/std": 0.491781622171402, "sampling/importance_sampling_ratio/max": 1.0984333753585815, "sampling/importance_sampling_ratio/mean": 1.001004934310913, "sampling/importance_sampling_ratio/min": 0.8596299290657043, "sampling/sampling_logp_difference/max": 0.14517855644226074, "sampling/sampling_logp_difference/mean": 0.0030105463229119778, "step": 165, "step_time": 8.011805591999291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 296.28125, "completions/mean_terminated_length": 296.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11935456725768745, "epoch": 0.00332, "frac_reward_zero_std": 0.0, "grad_norm": 0.04219117388129234, "kl": 1.8202880918979645, "learning_rate": 6.820925265879964e-05, "loss": 0.0042, "num_tokens": 7166345.0, "reward": 0.5538318753242493, "reward_std": 0.5631110668182373, "rewards/rollout_reward_func/mean": 0.5538318753242493, "rewards/rollout_reward_func/std": 0.5481942892074585, "sampling/importance_sampling_ratio/max": 1.1451362371444702, "sampling/importance_sampling_ratio/mean": 1.004054069519043, "sampling/importance_sampling_ratio/min": 0.9749541282653809, "sampling/sampling_logp_difference/max": 0.13626623153686523, "sampling/sampling_logp_difference/mean": 0.0032532932236790657, "step": 166, "step_time": 7.692564919000233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 393.90625, "completions/mean_terminated_length": 393.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1372534521506168, "epoch": 0.00334, "frac_reward_zero_std": 0.25, "grad_norm": 0.09940759092569351, "kl": 2.1635479629039764, "learning_rate": 6.81717721468361e-05, "loss": 0.0144, "num_tokens": 7209163.0, "reward": 0.7293205261230469, "reward_std": 0.37990185618400574, "rewards/rollout_reward_func/mean": 0.7293205261230469, "rewards/rollout_reward_func/std": 0.4764424264431, "sampling/importance_sampling_ratio/max": 1.124687910079956, "sampling/importance_sampling_ratio/mean": 1.0043847560882568, "sampling/importance_sampling_ratio/min": 0.9703287482261658, "sampling/sampling_logp_difference/max": 0.11814403533935547, "sampling/sampling_logp_difference/mean": 0.002542454283684492, "step": 167, "step_time": 8.367033979999633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 439.5, "completions/mean_terminated_length": 439.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.21549299731850624, "epoch": 0.00336, "frac_reward_zero_std": 0.0, "grad_norm": 0.09969864785671234, "kl": 2.125189334154129, "learning_rate": 6.813391668244235e-05, "loss": -0.0009, "num_tokens": 7254658.0, "reward": 0.664747953414917, "reward_std": 0.581751823425293, "rewards/rollout_reward_func/mean": 0.664747953414917, "rewards/rollout_reward_func/std": 0.5644985437393188, "sampling/importance_sampling_ratio/max": 1.0445969104766846, "sampling/importance_sampling_ratio/mean": 0.9780154228210449, "sampling/importance_sampling_ratio/min": 0.4843900799751282, "sampling/sampling_logp_difference/max": 0.6237154006958008, "sampling/sampling_logp_difference/mean": 0.009985985234379768, "step": 168, "step_time": 8.620164513000873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 392.0625, "completions/mean_terminated_length": 392.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1930416664108634, "epoch": 0.00338, "frac_reward_zero_std": 0.0, "grad_norm": 0.26555782556533813, "kl": 2.180087149143219, "learning_rate": 6.809568680795836e-05, "loss": 0.0104, "num_tokens": 7298156.0, "reward": 0.7338677048683167, "reward_std": 0.512953519821167, "rewards/rollout_reward_func/mean": 0.7338677048683167, "rewards/rollout_reward_func/std": 0.5311267971992493, "sampling/importance_sampling_ratio/max": 1.1068590879440308, "sampling/importance_sampling_ratio/mean": 0.9961108565330505, "sampling/importance_sampling_ratio/min": 0.9520285725593567, "sampling/sampling_logp_difference/max": 0.12442398071289062, "sampling/sampling_logp_difference/mean": 0.004828417673707008, "step": 169, "step_time": 8.018567587001144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.15026373689761385, "epoch": 0.0034, "frac_reward_zero_std": 0.25, "grad_norm": 0.06421629339456558, "kl": 2.46119923889637, "learning_rate": 6.805708307108811e-05, "loss": 0.0083, "num_tokens": 7336622.0, "reward": 0.7937889099121094, "reward_std": 0.38190287351608276, "rewards/rollout_reward_func/mean": 0.7937889099121094, "rewards/rollout_reward_func/std": 0.43625569343566895, "sampling/importance_sampling_ratio/max": 1.0788273811340332, "sampling/importance_sampling_ratio/mean": 0.9989398717880249, "sampling/importance_sampling_ratio/min": 0.9036030769348145, "sampling/sampling_logp_difference/max": 0.12101507186889648, "sampling/sampling_logp_difference/mean": 0.004911279305815697, "step": 170, "step_time": 7.701874809999936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 400.4375, "completions/mean_terminated_length": 400.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17996254190802574, "epoch": 0.00342, "frac_reward_zero_std": 0.0, "grad_norm": 0.03964705020189285, "kl": 2.4370532482862473, "learning_rate": 6.801810602489183e-05, "loss": 0.0139, "num_tokens": 7379633.0, "reward": 0.7272516489028931, "reward_std": 0.48924848437309265, "rewards/rollout_reward_func/mean": 0.7272516489028931, "rewards/rollout_reward_func/std": 0.4800207316875458, "sampling/importance_sampling_ratio/max": 1.1198415756225586, "sampling/importance_sampling_ratio/mean": 1.0057014226913452, "sampling/importance_sampling_ratio/min": 0.893578827381134, "sampling/sampling_logp_difference/max": 0.11318683624267578, "sampling/sampling_logp_difference/mean": 0.005813958123326302, "step": 171, "step_time": 7.924614707999353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 438.0, "completions/mean_terminated_length": 438.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.22665715403854847, "epoch": 0.00344, "frac_reward_zero_std": 0.25, "grad_norm": 0.12204638123512268, "kl": 1.9749927520751953, "learning_rate": 6.79787562277779e-05, "loss": 0.0036, "num_tokens": 7424915.0, "reward": 0.7364818453788757, "reward_std": 0.4436033368110657, "rewards/rollout_reward_func/mean": 0.7364818453788757, "rewards/rollout_reward_func/std": 0.5274467468261719, "sampling/importance_sampling_ratio/max": 1.408168911933899, "sampling/importance_sampling_ratio/mean": 1.0058224201202393, "sampling/importance_sampling_ratio/min": 0.8867501020431519, "sampling/sampling_logp_difference/max": 0.34807002544403076, "sampling/sampling_logp_difference/mean": 0.007504682056605816, "step": 172, "step_time": 8.373021693000737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 350.28125, "completions/mean_terminated_length": 350.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1825819741934538, "epoch": 0.00346, "frac_reward_zero_std": 0.25, "grad_norm": 0.06415095180273056, "kl": 2.034898728132248, "learning_rate": 6.793903424349502e-05, "loss": 0.0051, "num_tokens": 7466185.0, "reward": 0.7249808311462402, "reward_std": 0.38520461320877075, "rewards/rollout_reward_func/mean": 0.7249808311462402, "rewards/rollout_reward_func/std": 0.4840371310710907, "sampling/importance_sampling_ratio/max": 1.0688751935958862, "sampling/importance_sampling_ratio/mean": 0.9946276545524597, "sampling/importance_sampling_ratio/min": 0.8896469473838806, "sampling/sampling_logp_difference/max": 0.11692094802856445, "sampling/sampling_logp_difference/mean": 0.0040700240060687065, "step": 173, "step_time": 8.155524639000305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 372.40625, "completions/mean_terminated_length": 372.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17617107182741165, "epoch": 0.00348, "frac_reward_zero_std": 0.25, "grad_norm": 0.03839481249451637, "kl": 1.9483859091997147, "learning_rate": 6.789894064112407e-05, "loss": 0.0113, "num_tokens": 7508123.0, "reward": 0.864082932472229, "reward_std": 0.3185461163520813, "rewards/rollout_reward_func/mean": 0.864082932472229, "rewards/rollout_reward_func/std": 0.3654150068759918, "sampling/importance_sampling_ratio/max": 1.0635660886764526, "sampling/importance_sampling_ratio/mean": 0.993345320224762, "sampling/importance_sampling_ratio/min": 0.8757802248001099, "sampling/sampling_logp_difference/max": 0.13266372680664062, "sampling/sampling_logp_difference/mean": 0.004590728785842657, "step": 174, "step_time": 7.90633680900055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 384.875, "completions/mean_terminated_length": 384.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1971382461488247, "epoch": 0.0035, "frac_reward_zero_std": 0.0, "grad_norm": 0.027194950729608536, "kl": 2.364773318171501, "learning_rate": 6.785847599506988e-05, "loss": 0.0067, "num_tokens": 7550242.0, "reward": 0.522782564163208, "reward_std": 0.4908466339111328, "rewards/rollout_reward_func/mean": 0.522782564163208, "rewards/rollout_reward_func/std": 0.5498824119567871, "sampling/importance_sampling_ratio/max": 1.1404954195022583, "sampling/importance_sampling_ratio/mean": 1.0017932653427124, "sampling/importance_sampling_ratio/min": 0.9245549440383911, "sampling/sampling_logp_difference/max": 0.12492203712463379, "sampling/sampling_logp_difference/mean": 0.004730165936052799, "step": 175, "step_time": 8.08983896999871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 368.8125, "completions/mean_terminated_length": 368.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.20483252312988043, "epoch": 0.00352, "frac_reward_zero_std": 0.25, "grad_norm": 0.06558651477098465, "kl": 1.806079477071762, "learning_rate": 6.781764088505318e-05, "loss": 0.0073, "num_tokens": 7591276.0, "reward": 0.7978453636169434, "reward_std": 0.3590865135192871, "rewards/rollout_reward_func/mean": 0.7978453636169434, "rewards/rollout_reward_func/std": 0.4276452660560608, "sampling/importance_sampling_ratio/max": 1.1260316371917725, "sampling/importance_sampling_ratio/mean": 0.9976564645767212, "sampling/importance_sampling_ratio/min": 0.8102641701698303, "sampling/sampling_logp_difference/max": 0.11887693405151367, "sampling/sampling_logp_difference/mean": 0.005529333837330341, "step": 176, "step_time": 7.685523156000272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 355.34375, "completions/mean_terminated_length": 355.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.24260246567428112, "epoch": 0.00354, "frac_reward_zero_std": 0.0, "grad_norm": 0.09904715418815613, "kl": 1.75944085419178, "learning_rate": 6.777643589610209e-05, "loss": -0.0019, "num_tokens": 7632959.0, "reward": 0.4928133487701416, "reward_std": 0.5877783298492432, "rewards/rollout_reward_func/mean": 0.4928133487701416, "rewards/rollout_reward_func/std": 0.6047735810279846, "sampling/importance_sampling_ratio/max": 1.0769001245498657, "sampling/importance_sampling_ratio/mean": 0.9824893474578857, "sampling/importance_sampling_ratio/min": 0.8179572820663452, "sampling/sampling_logp_difference/max": 0.1141819953918457, "sampling/sampling_logp_difference/mean": 0.0068785687908530235, "step": 177, "step_time": 8.340609593999488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 469.09375, "completions/mean_terminated_length": 469.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22250043600797653, "epoch": 0.00356, "frac_reward_zero_std": 0.0, "grad_norm": 0.07050890475511551, "kl": 2.656134247779846, "learning_rate": 6.773486161854389e-05, "loss": 0.0053, "num_tokens": 7678469.0, "reward": 0.6254128813743591, "reward_std": 0.538325309753418, "rewards/rollout_reward_func/mean": 0.6254128813743591, "rewards/rollout_reward_func/std": 0.5260134935379028, "sampling/importance_sampling_ratio/max": 1.323096513748169, "sampling/importance_sampling_ratio/mean": 0.9918699264526367, "sampling/importance_sampling_ratio/min": 0.8964437246322632, "sampling/sampling_logp_difference/max": 0.22683453559875488, "sampling/sampling_logp_difference/mean": 0.008141700178384781, "step": 178, "step_time": 8.549395176999951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 433.59375, "completions/mean_terminated_length": 433.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.23459923453629017, "epoch": 0.00358, "frac_reward_zero_std": 0.0, "grad_norm": 0.25083062052726746, "kl": 2.2459611147642136, "learning_rate": 6.769291864799651e-05, "loss": 0.0178, "num_tokens": 7722656.0, "reward": 0.6338169574737549, "reward_std": 0.565753161907196, "rewards/rollout_reward_func/mean": 0.6338169574737549, "rewards/rollout_reward_func/std": 0.5722853541374207, "sampling/importance_sampling_ratio/max": 1.2165311574935913, "sampling/importance_sampling_ratio/mean": 1.0006380081176758, "sampling/importance_sampling_ratio/min": 0.892166793346405, "sampling/sampling_logp_difference/max": 0.22674059867858887, "sampling/sampling_logp_difference/mean": 0.008931713178753853, "step": 179, "step_time": 7.970021409000765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 471.5, "completions/mean_terminated_length": 471.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2468790840357542, "epoch": 0.0036, "frac_reward_zero_std": 0.0, "grad_norm": 0.6004800796508789, "kl": 2.4116610884666443, "learning_rate": 6.765060758535996e-05, "loss": 0.0079, "num_tokens": 7768353.0, "reward": 0.6324113011360168, "reward_std": 0.5849053859710693, "rewards/rollout_reward_func/mean": 0.6324113011360168, "rewards/rollout_reward_func/std": 0.5751650929450989, "sampling/importance_sampling_ratio/max": 1.0864413976669312, "sampling/importance_sampling_ratio/mean": 0.9886354207992554, "sampling/importance_sampling_ratio/min": 0.8967463970184326, "sampling/sampling_logp_difference/max": 0.11317682266235352, "sampling/sampling_logp_difference/mean": 0.006645881570875645, "step": 180, "step_time": 8.121073177000198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 468.78125, "completions/mean_terminated_length": 468.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.21785606164485216, "epoch": 0.00362, "frac_reward_zero_std": 0.25, "grad_norm": 0.025508321821689606, "kl": 2.0714156478643417, "learning_rate": 6.76079290368078e-05, "loss": 0.0148, "num_tokens": 7814440.0, "reward": 0.6264437437057495, "reward_std": 0.3769237995147705, "rewards/rollout_reward_func/mean": 0.6264437437057495, "rewards/rollout_reward_func/std": 0.5244996547698975, "sampling/importance_sampling_ratio/max": 1.0744510889053345, "sampling/importance_sampling_ratio/mean": 0.9940012097358704, "sampling/importance_sampling_ratio/min": 0.8991582989692688, "sampling/sampling_logp_difference/max": 0.10627460479736328, "sampling/sampling_logp_difference/mean": 0.004842186812311411, "step": 181, "step_time": 8.27885940599981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 410.40625, "completions/mean_terminated_length": 410.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.27280512172728777, "epoch": 0.00364, "frac_reward_zero_std": 0.25, "grad_norm": 0.24693135917186737, "kl": 1.9363086521625519, "learning_rate": 6.756488361377838e-05, "loss": 0.0387, "num_tokens": 7857379.0, "reward": 0.768875241279602, "reward_std": 0.4121905565261841, "rewards/rollout_reward_func/mean": 0.768875241279602, "rewards/rollout_reward_func/std": 0.5097793340682983, "sampling/importance_sampling_ratio/max": 1.1622765064239502, "sampling/importance_sampling_ratio/mean": 0.9870535135269165, "sampling/importance_sampling_ratio/min": 0.691150963306427, "sampling/sampling_logp_difference/max": 0.3211463689804077, "sampling/sampling_logp_difference/mean": 0.009326688945293427, "step": 182, "step_time": 8.86808794199942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 435.78125, "completions/mean_terminated_length": 448.8064270019531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.26561162900179625, "epoch": 0.00366, "frac_reward_zero_std": 0.0, "grad_norm": 0.12201409786939621, "kl": 1.9621595740318298, "learning_rate": 6.752147193296611e-05, "loss": -0.021, "num_tokens": 7902291.0, "reward": 0.5801291465759277, "reward_std": 0.6570796966552734, "rewards/rollout_reward_func/mean": 0.5801291465759277, "rewards/rollout_reward_func/std": 0.673110842704773, "sampling/importance_sampling_ratio/max": 1.2577877044677734, "sampling/importance_sampling_ratio/mean": 1.0102720260620117, "sampling/importance_sampling_ratio/min": 0.9187896251678467, "sampling/sampling_logp_difference/max": 0.18996238708496094, "sampling/sampling_logp_difference/mean": 0.006219998933374882, "step": 183, "step_time": 9.244647356999849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 454.46875, "completions/mean_terminated_length": 454.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.24074017815291882, "epoch": 0.00368, "frac_reward_zero_std": 0.0, "grad_norm": 0.14085891842842102, "kl": 2.086748793721199, "learning_rate": 6.747769461631269e-05, "loss": 0.059, "num_tokens": 7947442.0, "reward": 0.7292914390563965, "reward_std": 0.4865405261516571, "rewards/rollout_reward_func/mean": 0.7292914390563965, "rewards/rollout_reward_func/std": 0.4764947295188904, "sampling/importance_sampling_ratio/max": 1.2009979486465454, "sampling/importance_sampling_ratio/mean": 1.0068989992141724, "sampling/importance_sampling_ratio/min": 0.918373167514801, "sampling/sampling_logp_difference/max": 0.2451333999633789, "sampling/sampling_logp_difference/mean": 0.008742563426494598, "step": 184, "step_time": 8.23115694399985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 421.875, "completions/mean_terminated_length": 434.45159912109375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.31865684129297733, "epoch": 0.0037, "frac_reward_zero_std": 0.0, "grad_norm": 0.347099244594574, "kl": 2.2802437990903854, "learning_rate": 6.743355229099807e-05, "loss": 0.0849, "num_tokens": 7991068.0, "reward": 0.6382155418395996, "reward_std": 0.6130216121673584, "rewards/rollout_reward_func/mean": 0.6382155418395996, "rewards/rollout_reward_func/std": 0.6210012435913086, "sampling/importance_sampling_ratio/max": 2.3582236766815186, "sampling/importance_sampling_ratio/mean": 1.03019380569458, "sampling/importance_sampling_ratio/min": 0.5412710309028625, "sampling/sampling_logp_difference/max": 0.8574271202087402, "sampling/sampling_logp_difference/mean": 0.02565600350499153, "step": 185, "step_time": 8.387365961999421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 421.28125, "completions/mean_terminated_length": 421.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.381010290235281, "epoch": 0.00372, "frac_reward_zero_std": 0.0, "grad_norm": 0.5700381398200989, "kl": 2.807343691587448, "learning_rate": 6.738904558943151e-05, "loss": -0.0005, "num_tokens": 8034479.0, "reward": 0.5980798602104187, "reward_std": 0.5996876358985901, "rewards/rollout_reward_func/mean": 0.5980798602104187, "rewards/rollout_reward_func/std": 0.5856677889823914, "sampling/importance_sampling_ratio/max": 2.1092312335968018, "sampling/importance_sampling_ratio/mean": 0.9871307611465454, "sampling/importance_sampling_ratio/min": 0.41361916065216064, "sampling/sampling_logp_difference/max": 0.7705702781677246, "sampling/sampling_logp_difference/mean": 0.027383536100387573, "step": 186, "step_time": 9.252318302000276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 472.40625, "completions/mean_terminated_length": 472.40625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.29325065948069096, "epoch": 0.00374, "frac_reward_zero_std": 0.0, "grad_norm": 0.39501211047172546, "kl": 2.7425270080566406, "learning_rate": 6.734417514924263e-05, "loss": 0.0384, "num_tokens": 8079092.0, "reward": 0.6620178818702698, "reward_std": 0.520715057849884, "rewards/rollout_reward_func/mean": 0.6620178818702698, "rewards/rollout_reward_func/std": 0.5094770789146423, "sampling/importance_sampling_ratio/max": 1.9417282342910767, "sampling/importance_sampling_ratio/mean": 1.0034770965576172, "sampling/importance_sampling_ratio/min": 0.6186121106147766, "sampling/sampling_logp_difference/max": 0.49492835998535156, "sampling/sampling_logp_difference/mean": 0.016787899658083916, "step": 187, "step_time": 11.277424065000105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 376.3125, "completions/mean_terminated_length": 387.4193420410156, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.23011791927274317, "epoch": 0.00376, "frac_reward_zero_std": 0.0, "grad_norm": 0.32031598687171936, "kl": 2.646861284971237, "learning_rate": 6.729894161327213e-05, "loss": 0.1448, "num_tokens": 8119678.0, "reward": 0.7034637928009033, "reward_std": 0.5442278385162354, "rewards/rollout_reward_func/mean": 0.7034637928009033, "rewards/rollout_reward_func/std": 0.5416025519371033, "sampling/importance_sampling_ratio/max": 1.7449699640274048, "sampling/importance_sampling_ratio/mean": 1.002528190612793, "sampling/importance_sampling_ratio/min": 0.19437186419963837, "sampling/sampling_logp_difference/max": 1.7532627582550049, "sampling/sampling_logp_difference/mean": 0.015016734600067139, "step": 188, "step_time": 9.403966018000574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 414.25, "completions/mean_terminated_length": 426.58062744140625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.28522290103137493, "epoch": 0.00378, "frac_reward_zero_std": 0.0, "grad_norm": 0.35443058609962463, "kl": 2.3377997875213623, "learning_rate": 6.725334562956265e-05, "loss": 0.1161, "num_tokens": 8161947.0, "reward": 0.7016137838363647, "reward_std": 0.5370473861694336, "rewards/rollout_reward_func/mean": 0.7016137838363647, "rewards/rollout_reward_func/std": 0.5457635521888733, "sampling/importance_sampling_ratio/max": 1.2677720785140991, "sampling/importance_sampling_ratio/mean": 0.9620894193649292, "sampling/importance_sampling_ratio/min": 0.5358086824417114, "sampling/sampling_logp_difference/max": 0.6241637468338013, "sampling/sampling_logp_difference/mean": 0.0121621023863554, "step": 189, "step_time": 9.794809867999447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 448.3125, "completions/mean_terminated_length": 461.7419128417969, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3095178920775652, "epoch": 0.0038, "frac_reward_zero_std": 0.0, "grad_norm": 1.0403172969818115, "kl": 3.981646493077278, "learning_rate": 6.720738785134943e-05, "loss": 0.1765, "num_tokens": 8206876.0, "reward": 0.7023882865905762, "reward_std": 0.5217618942260742, "rewards/rollout_reward_func/mean": 0.7023882865905762, "rewards/rollout_reward_func/std": 0.5447872877120972, "sampling/importance_sampling_ratio/max": 1.3234858512878418, "sampling/importance_sampling_ratio/mean": 0.9910523295402527, "sampling/importance_sampling_ratio/min": 0.5314875841140747, "sampling/sampling_logp_difference/max": 0.4972090721130371, "sampling/sampling_logp_difference/mean": 0.0128726065158844, "step": 190, "step_time": 10.68090830499932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 362.625, "completions/mean_terminated_length": 373.2903137207031, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2384635005146265, "epoch": 0.00382, "frac_reward_zero_std": 0.0, "grad_norm": 0.5082979798316956, "kl": 2.7974633127450943, "learning_rate": 6.716106893705111e-05, "loss": 0.2906, "num_tokens": 8246091.0, "reward": 0.7396824359893799, "reward_std": 0.5232328772544861, "rewards/rollout_reward_func/mean": 0.7396824359893799, "rewards/rollout_reward_func/std": 0.5206373929977417, "sampling/importance_sampling_ratio/max": 1.2811709642410278, "sampling/importance_sampling_ratio/mean": 1.0005989074707031, "sampling/importance_sampling_ratio/min": 0.536393404006958, "sampling/sampling_logp_difference/max": 0.4952974319458008, "sampling/sampling_logp_difference/mean": 0.011393672786653042, "step": 191, "step_time": 9.797926126998846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 544.875, "completions/mean_terminated_length": 544.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.36145881563425064, "epoch": 0.00384, "frac_reward_zero_std": 0.0, "grad_norm": 0.3071439266204834, "kl": 2.114464148879051, "learning_rate": 6.711438955026003e-05, "loss": 0.1798, "num_tokens": 8294524.0, "reward": 0.6968176364898682, "reward_std": 0.49926304817199707, "rewards/rollout_reward_func/mean": 0.6968176364898682, "rewards/rollout_reward_func/std": 0.4924389719963074, "sampling/importance_sampling_ratio/max": 1.512161135673523, "sampling/importance_sampling_ratio/mean": 1.028672456741333, "sampling/importance_sampling_ratio/min": 0.5225725173950195, "sampling/sampling_logp_difference/max": 0.2773146629333496, "sampling/sampling_logp_difference/mean": 0.013574086129665375, "step": 192, "step_time": 10.632555287999821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 478.25, "completions/mean_terminated_length": 478.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.33849677070975304, "epoch": 0.00386, "frac_reward_zero_std": 0.25, "grad_norm": 0.3701254725456238, "kl": 2.5113009363412857, "learning_rate": 6.706735035973298e-05, "loss": -0.0015, "num_tokens": 8340252.0, "reward": 0.7369871735572815, "reward_std": 0.43890249729156494, "rewards/rollout_reward_func/mean": 0.7369871735572815, "rewards/rollout_reward_func/std": 0.5260563492774963, "sampling/importance_sampling_ratio/max": 1.1817439794540405, "sampling/importance_sampling_ratio/mean": 0.9885131120681763, "sampling/importance_sampling_ratio/min": 0.779888391494751, "sampling/sampling_logp_difference/max": 0.24979305267333984, "sampling/sampling_logp_difference/mean": 0.012673701159656048, "step": 193, "step_time": 9.104537654000524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 476.09375, "completions/mean_terminated_length": 476.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2889076229184866, "epoch": 0.00388, "frac_reward_zero_std": 0.0, "grad_norm": 0.1778896003961563, "kl": 2.518808811903, "learning_rate": 6.701995203938151e-05, "loss": 0.0303, "num_tokens": 8385657.0, "reward": 0.663995623588562, "reward_std": 0.5012062191963196, "rewards/rollout_reward_func/mean": 0.663995623588562, "rewards/rollout_reward_func/std": 0.5064665675163269, "sampling/importance_sampling_ratio/max": 1.1267457008361816, "sampling/importance_sampling_ratio/mean": 0.9972169995307922, "sampling/importance_sampling_ratio/min": 0.7709367275238037, "sampling/sampling_logp_difference/max": 0.2413787841796875, "sampling/sampling_logp_difference/mean": 0.009440215304493904, "step": 194, "step_time": 8.584675436999532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 414.09375, "completions/mean_terminated_length": 414.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.30290352925658226, "epoch": 0.0039, "frac_reward_zero_std": 0.0, "grad_norm": 0.3764919638633728, "kl": 2.4471113830804825, "learning_rate": 6.697219526826221e-05, "loss": 0.0578, "num_tokens": 8429278.0, "reward": 0.6272629499435425, "reward_std": 0.5232462882995605, "rewards/rollout_reward_func/mean": 0.6272629499435425, "rewards/rollout_reward_func/std": 0.5234437584877014, "sampling/importance_sampling_ratio/max": 1.1254360675811768, "sampling/importance_sampling_ratio/mean": 0.9581906795501709, "sampling/importance_sampling_ratio/min": 2.556398612796329e-06, "sampling/sampling_logp_difference/max": 12.90183162689209, "sampling/sampling_logp_difference/mean": 0.040671322494745255, "step": 195, "step_time": 9.196942272999877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 409.625, "completions/mean_terminated_length": 409.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3115747980773449, "epoch": 0.00392, "frac_reward_zero_std": 0.0, "grad_norm": 0.338215172290802, "kl": 2.634522870182991, "learning_rate": 6.69240807305671e-05, "loss": 0.0141, "num_tokens": 8472413.0, "reward": 0.7277377247810364, "reward_std": 0.4887671172618866, "rewards/rollout_reward_func/mean": 0.7277377247810364, "rewards/rollout_reward_func/std": 0.4792439043521881, "sampling/importance_sampling_ratio/max": 1.302032709121704, "sampling/importance_sampling_ratio/mean": 0.9910115599632263, "sampling/importance_sampling_ratio/min": 0.6424360871315002, "sampling/sampling_logp_difference/max": 0.3409850597381592, "sampling/sampling_logp_difference/mean": 0.013934816233813763, "step": 196, "step_time": 8.960902385999816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 446.96875, "completions/mean_terminated_length": 446.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.37666598334908485, "epoch": 0.00394, "frac_reward_zero_std": 0.0, "grad_norm": 0.22425906360149384, "kl": 2.606361359357834, "learning_rate": 6.687560911561377e-05, "loss": 0.0208, "num_tokens": 8518008.0, "reward": 0.3461553156375885, "reward_std": 0.7335036993026733, "rewards/rollout_reward_func/mean": 0.3461553156375885, "rewards/rollout_reward_func/std": 0.7226306200027466, "sampling/importance_sampling_ratio/max": 1.1754411458969116, "sampling/importance_sampling_ratio/mean": 0.970253586769104, "sampling/importance_sampling_ratio/min": 0.7654891014099121, "sampling/sampling_logp_difference/max": 0.1483602523803711, "sampling/sampling_logp_difference/mean": 0.014466704800724983, "step": 197, "step_time": 8.80009192100033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 339.59375, "completions/mean_terminated_length": 339.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3192804902791977, "epoch": 0.00396, "frac_reward_zero_std": 0.0, "grad_norm": 0.2560868561267853, "kl": 2.0982080847024918, "learning_rate": 6.682678111783552e-05, "loss": -0.0021, "num_tokens": 8558675.0, "reward": 0.7310920357704163, "reward_std": 0.5271731019020081, "rewards/rollout_reward_func/mean": 0.7310920357704163, "rewards/rollout_reward_func/std": 0.5376037359237671, "sampling/importance_sampling_ratio/max": 1.1234917640686035, "sampling/importance_sampling_ratio/mean": 0.994827389717102, "sampling/importance_sampling_ratio/min": 0.7986771464347839, "sampling/sampling_logp_difference/max": 0.25894713401794434, "sampling/sampling_logp_difference/mean": 0.016717981547117233, "step": 198, "step_time": 8.074021672999606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 340.125, "completions/mean_terminated_length": 340.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.36887005157768726, "epoch": 0.00398, "frac_reward_zero_std": 0.0, "grad_norm": 0.4387360215187073, "kl": 1.8794338554143906, "learning_rate": 6.677759743677137e-05, "loss": -0.0086, "num_tokens": 8600512.0, "reward": 0.5445839166641235, "reward_std": 0.6923034191131592, "rewards/rollout_reward_func/mean": 0.5445839166641235, "rewards/rollout_reward_func/std": 0.7310091257095337, "sampling/importance_sampling_ratio/max": 1.2208096981048584, "sampling/importance_sampling_ratio/mean": 0.984351634979248, "sampling/importance_sampling_ratio/min": 0.7184396982192993, "sampling/sampling_logp_difference/max": 0.22394752502441406, "sampling/sampling_logp_difference/mean": 0.01513705588877201, "step": 199, "step_time": 8.0978277600002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 408.03125, "completions/mean_terminated_length": 408.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.37346187978982925, "epoch": 0.004, "frac_reward_zero_std": 0.0, "grad_norm": 0.27794671058654785, "kl": 1.7905083894729614, "learning_rate": 6.672805877705611e-05, "loss": 0.0001, "num_tokens": 8644754.0, "reward": 0.5772043466567993, "reward_std": 0.6832541227340698, "rewards/rollout_reward_func/mean": 0.5772043466567993, "rewards/rollout_reward_func/std": 0.6798107624053955, "sampling/importance_sampling_ratio/max": 1.1450250148773193, "sampling/importance_sampling_ratio/mean": 1.013873815536499, "sampling/importance_sampling_ratio/min": 0.846538245677948, "sampling/sampling_logp_difference/max": 0.25728726387023926, "sampling/sampling_logp_difference/mean": 0.014063991606235504, "step": 200, "step_time": 8.180994575000568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 417.0, "completions/mean_terminated_length": 417.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3129821112379432, "epoch": 0.00402, "frac_reward_zero_std": 0.0, "grad_norm": 0.15526515245437622, "kl": 1.6661237329244614, "learning_rate": 6.667816584841016e-05, "loss": 0.0147, "num_tokens": 8688338.0, "reward": 0.7676955461502075, "reward_std": 0.5240449905395508, "rewards/rollout_reward_func/mean": 0.7676955461502075, "rewards/rollout_reward_func/std": 0.5134907960891724, "sampling/importance_sampling_ratio/max": 1.1394858360290527, "sampling/importance_sampling_ratio/mean": 1.0038542747497559, "sampling/importance_sampling_ratio/min": 0.8216003179550171, "sampling/sampling_logp_difference/max": 0.2118973731994629, "sampling/sampling_logp_difference/mean": 0.014402098953723907, "step": 201, "step_time": 9.245062461000543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 449.9375, "completions/mean_terminated_length": 449.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.35800576582551, "epoch": 0.00404, "frac_reward_zero_std": 0.0, "grad_norm": 0.21723118424415588, "kl": 1.4874009266495705, "learning_rate": 6.662791936562939e-05, "loss": 0.0294, "num_tokens": 8732545.0, "reward": 0.6062465310096741, "reward_std": 0.5821770429611206, "rewards/rollout_reward_func/mean": 0.6062465310096741, "rewards/rollout_reward_func/std": 0.6295119524002075, "sampling/importance_sampling_ratio/max": 1.211263656616211, "sampling/importance_sampling_ratio/mean": 1.0074564218521118, "sampling/importance_sampling_ratio/min": 0.8640790581703186, "sampling/sampling_logp_difference/max": 0.23335886001586914, "sampling/sampling_logp_difference/mean": 0.01451953686773777, "step": 202, "step_time": 8.371230279000429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 420.84375, "completions/mean_terminated_length": 420.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2751971064135432, "epoch": 0.00406, "frac_reward_zero_std": 0.0, "grad_norm": 0.16519516706466675, "kl": 1.5812018513679504, "learning_rate": 6.657732004857488e-05, "loss": 0.0033, "num_tokens": 8776051.0, "reward": 0.6461575031280518, "reward_std": 0.6402170658111572, "rewards/rollout_reward_func/mean": 0.6461575031280518, "rewards/rollout_reward_func/std": 0.6623650789260864, "sampling/importance_sampling_ratio/max": 1.1068602800369263, "sampling/importance_sampling_ratio/mean": 0.9813722372055054, "sampling/importance_sampling_ratio/min": 0.7901610732078552, "sampling/sampling_logp_difference/max": 0.22950315475463867, "sampling/sampling_logp_difference/mean": 0.01619294285774231, "step": 203, "step_time": 8.326014798000415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 391.90625, "completions/mean_terminated_length": 391.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.26856876723468304, "epoch": 0.00408, "frac_reward_zero_std": 0.25, "grad_norm": 0.1882360726594925, "kl": 1.6530096530914307, "learning_rate": 6.652636862216267e-05, "loss": 0.0123, "num_tokens": 8818583.0, "reward": 0.7679810523986816, "reward_std": 0.4150759279727936, "rewards/rollout_reward_func/mean": 0.7679810523986816, "rewards/rollout_reward_func/std": 0.5131683945655823, "sampling/importance_sampling_ratio/max": 1.5073987245559692, "sampling/importance_sampling_ratio/mean": 0.9992578029632568, "sampling/importance_sampling_ratio/min": 0.8036995530128479, "sampling/sampling_logp_difference/max": 0.20841699838638306, "sampling/sampling_logp_difference/mean": 0.01146792247891426, "step": 204, "step_time": 8.366809391999595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 329.96875, "completions/mean_terminated_length": 329.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3050294630229473, "epoch": 0.0041, "frac_reward_zero_std": 0.0, "grad_norm": 0.32519257068634033, "kl": 1.9520803540945053, "learning_rate": 6.64750658163533e-05, "loss": 0.0018, "num_tokens": 8859152.0, "reward": 0.3545837700366974, "reward_std": 0.8099404573440552, "rewards/rollout_reward_func/mean": 0.3545837700366974, "rewards/rollout_reward_func/std": 0.8023747205734253, "sampling/importance_sampling_ratio/max": 1.1725339889526367, "sampling/importance_sampling_ratio/mean": 0.9907645583152771, "sampling/importance_sampling_ratio/min": 0.7974545955657959, "sampling/sampling_logp_difference/max": 0.30623334646224976, "sampling/sampling_logp_difference/mean": 0.02270321734249592, "step": 205, "step_time": 8.00480675100016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 467.28125, "completions/mean_terminated_length": 467.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.27975811064243317, "epoch": 0.00412, "frac_reward_zero_std": 0.0, "grad_norm": 0.246986523270607, "kl": 1.6313919126987457, "learning_rate": 6.642341236614142e-05, "loss": 0.0047, "num_tokens": 8904431.0, "reward": 0.6446257829666138, "reward_std": 0.6641853451728821, "rewards/rollout_reward_func/mean": 0.6446257829666138, "rewards/rollout_reward_func/std": 0.6650885343551636, "sampling/importance_sampling_ratio/max": 1.2459704875946045, "sampling/importance_sampling_ratio/mean": 0.9811293482780457, "sampling/importance_sampling_ratio/min": 0.6793065071105957, "sampling/sampling_logp_difference/max": 0.36866044998168945, "sampling/sampling_logp_difference/mean": 0.017048876732587814, "step": 206, "step_time": 9.069412863999787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 399.90625, "completions/mean_terminated_length": 399.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1980579444207251, "epoch": 0.00414, "frac_reward_zero_std": 0.0, "grad_norm": 0.11953364312648773, "kl": 1.7249758839607239, "learning_rate": 6.637140901154516e-05, "loss": 0.0215, "num_tokens": 8946974.0, "reward": 0.7967815399169922, "reward_std": 0.4432019293308258, "rewards/rollout_reward_func/mean": 0.7967815399169922, "rewards/rollout_reward_func/std": 0.42983901500701904, "sampling/importance_sampling_ratio/max": 1.0845228433609009, "sampling/importance_sampling_ratio/mean": 0.9883180856704712, "sampling/importance_sampling_ratio/min": 0.7774369120597839, "sampling/sampling_logp_difference/max": 0.249894917011261, "sampling/sampling_logp_difference/mean": 0.009124305099248886, "step": 207, "step_time": 8.123870977000934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 421.15625, "completions/mean_terminated_length": 421.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22997253201901913, "epoch": 0.00416, "frac_reward_zero_std": 0.0, "grad_norm": 0.14934802055358887, "kl": 1.662257544696331, "learning_rate": 6.631905649759564e-05, "loss": 0.0253, "num_tokens": 8989813.0, "reward": 0.7639309167861938, "reward_std": 0.453718364238739, "rewards/rollout_reward_func/mean": 0.7639309167861938, "rewards/rollout_reward_func/std": 0.45336294174194336, "sampling/importance_sampling_ratio/max": 1.1903895139694214, "sampling/importance_sampling_ratio/mean": 0.9963328838348389, "sampling/importance_sampling_ratio/min": 0.731601893901825, "sampling/sampling_logp_difference/max": 0.21688008308410645, "sampling/sampling_logp_difference/mean": 0.011678927578032017, "step": 208, "step_time": 8.087910742999611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 471.75, "completions/mean_terminated_length": 471.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22063090186566114, "epoch": 0.00418, "frac_reward_zero_std": 0.25, "grad_norm": 0.19611601531505585, "kl": 1.6378168910741806, "learning_rate": 6.626635557432619e-05, "loss": 0.016, "num_tokens": 9035944.0, "reward": 0.7598427534103394, "reward_std": 0.39722299575805664, "rewards/rollout_reward_func/mean": 0.7598427534103394, "rewards/rollout_reward_func/std": 0.4614258408546448, "sampling/importance_sampling_ratio/max": 1.2083497047424316, "sampling/importance_sampling_ratio/mean": 0.9988184571266174, "sampling/importance_sampling_ratio/min": 0.7522462010383606, "sampling/sampling_logp_difference/max": 0.23008131980895996, "sampling/sampling_logp_difference/mean": 0.013020459562540054, "step": 209, "step_time": 8.30271783400076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 465.34375, "completions/mean_terminated_length": 465.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.23169718496501446, "epoch": 0.0042, "frac_reward_zero_std": 0.0, "grad_norm": 0.16928784549236298, "kl": 1.4353340342640877, "learning_rate": 6.621330699676174e-05, "loss": 0.0135, "num_tokens": 9081524.0, "reward": 0.7408862113952637, "reward_std": 0.5458085536956787, "rewards/rollout_reward_func/mean": 0.7408862113952637, "rewards/rollout_reward_func/std": 0.5792800188064575, "sampling/importance_sampling_ratio/max": 1.2197306156158447, "sampling/importance_sampling_ratio/mean": 0.9990067481994629, "sampling/importance_sampling_ratio/min": 0.7842026948928833, "sampling/sampling_logp_difference/max": 0.23786401748657227, "sampling/sampling_logp_difference/mean": 0.008605476468801498, "step": 210, "step_time": 8.315209765000418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2134710382670164, "epoch": 0.00422, "frac_reward_zero_std": 0.0, "grad_norm": 0.21624822914600372, "kl": 1.6343066841363907, "learning_rate": 6.615991152490784e-05, "loss": 0.0291, "num_tokens": 9121008.0, "reward": 0.6426441669464111, "reward_std": 0.6426317691802979, "rewards/rollout_reward_func/mean": 0.6426441669464111, "rewards/rollout_reward_func/std": 0.6661960482597351, "sampling/importance_sampling_ratio/max": 1.2807894945144653, "sampling/importance_sampling_ratio/mean": 1.0030893087387085, "sampling/importance_sampling_ratio/min": 0.8235617280006409, "sampling/sampling_logp_difference/max": 0.2406165599822998, "sampling/sampling_logp_difference/mean": 0.01379447802901268, "step": 211, "step_time": 8.65923962999932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 328.59375, "completions/mean_terminated_length": 328.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2642470486462116, "epoch": 0.00424, "frac_reward_zero_std": 0.0, "grad_norm": 0.19245360791683197, "kl": 1.5241184458136559, "learning_rate": 6.610616992373991e-05, "loss": -0.0367, "num_tokens": 9162390.0, "reward": 0.5177299976348877, "reward_std": 0.7392485737800598, "rewards/rollout_reward_func/mean": 0.5177299976348877, "rewards/rollout_reward_func/std": 0.7691447138786316, "sampling/importance_sampling_ratio/max": 1.475467324256897, "sampling/importance_sampling_ratio/mean": 1.0539498329162598, "sampling/importance_sampling_ratio/min": 0.9373591542243958, "sampling/sampling_logp_difference/max": 0.19473910331726074, "sampling/sampling_logp_difference/mean": 0.013571429997682571, "step": 212, "step_time": 7.870138690999283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 413.6875, "completions/mean_terminated_length": 413.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.20648062648251653, "epoch": 0.00426, "frac_reward_zero_std": 0.25, "grad_norm": 0.22886118292808533, "kl": 1.477797731757164, "learning_rate": 6.605208296319221e-05, "loss": 0.001, "num_tokens": 9205180.0, "reward": 0.831994891166687, "reward_std": 0.3433237075805664, "rewards/rollout_reward_func/mean": 0.831994891166687, "rewards/rollout_reward_func/std": 0.3966811001300812, "sampling/importance_sampling_ratio/max": 1.2839349508285522, "sampling/importance_sampling_ratio/mean": 0.9961127042770386, "sampling/importance_sampling_ratio/min": 0.8056564331054688, "sampling/sampling_logp_difference/max": 0.26025718450546265, "sampling/sampling_logp_difference/mean": 0.014957357197999954, "step": 213, "step_time": 8.204849511000248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10607258723757695, "epoch": 0.00428, "frac_reward_zero_std": 0.25, "grad_norm": 0.11912494152784348, "kl": 1.644753873348236, "learning_rate": 6.599765141814683e-05, "loss": -0.0114, "num_tokens": 9241820.0, "reward": 0.8615936040878296, "reward_std": 0.32459884881973267, "rewards/rollout_reward_func/mean": 0.8615936040878296, "rewards/rollout_reward_func/std": 0.3721197843551636, "sampling/importance_sampling_ratio/max": 1.1619200706481934, "sampling/importance_sampling_ratio/mean": 0.966556966304779, "sampling/importance_sampling_ratio/min": 0.612433910369873, "sampling/sampling_logp_difference/max": 0.4937472343444824, "sampling/sampling_logp_difference/mean": 0.014159895479679108, "step": 214, "step_time": 7.803084663999016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 330.59375, "completions/mean_terminated_length": 330.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11627220083028078, "epoch": 0.0043, "frac_reward_zero_std": 0.0, "grad_norm": 0.3070707619190216, "kl": 1.6782458424568176, "learning_rate": 6.594287606842255e-05, "loss": 0.083, "num_tokens": 9280632.0, "reward": 0.7746362090110779, "reward_std": 0.5705786943435669, "rewards/rollout_reward_func/mean": 0.7746362090110779, "rewards/rollout_reward_func/std": 0.5595805048942566, "sampling/importance_sampling_ratio/max": 1.325667381286621, "sampling/importance_sampling_ratio/mean": 0.9911642074584961, "sampling/importance_sampling_ratio/min": 0.7031992077827454, "sampling/sampling_logp_difference/max": 0.42556536197662354, "sampling/sampling_logp_difference/mean": 0.00990360975265503, "step": 215, "step_time": 9.55579622400046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10872773046139628, "epoch": 0.00432, "frac_reward_zero_std": 0.25, "grad_norm": 0.21132104098796844, "kl": 1.8276768028736115, "learning_rate": 6.588775769876375e-05, "loss": -0.0255, "num_tokens": 9322222.0, "reward": 0.6332675814628601, "reward_std": 0.5233778953552246, "rewards/rollout_reward_func/mean": 0.6332675814628601, "rewards/rollout_reward_func/std": 0.6283921599388123, "sampling/importance_sampling_ratio/max": 1.0977802276611328, "sampling/importance_sampling_ratio/mean": 0.9869405031204224, "sampling/importance_sampling_ratio/min": 0.7643289566040039, "sampling/sampling_logp_difference/max": 0.24790287017822266, "sampling/sampling_logp_difference/mean": 0.007239294704049826, "step": 216, "step_time": 9.033014733000527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 340.5625, "completions/mean_terminated_length": 340.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10057177860289812, "epoch": 0.00434, "frac_reward_zero_std": 0.25, "grad_norm": 0.19732525944709778, "kl": 1.9953864961862564, "learning_rate": 6.583229709882908e-05, "loss": -0.0173, "num_tokens": 9364609.0, "reward": 0.5738862752914429, "reward_std": 0.5405762195587158, "rewards/rollout_reward_func/mean": 0.5738862752914429, "rewards/rollout_reward_func/std": 0.6829141974449158, "sampling/importance_sampling_ratio/max": 1.2425639629364014, "sampling/importance_sampling_ratio/mean": 0.9861011505126953, "sampling/importance_sampling_ratio/min": 0.6302760243415833, "sampling/sampling_logp_difference/max": 0.3427753448486328, "sampling/sampling_logp_difference/mean": 0.006589896976947784, "step": 217, "step_time": 8.38382427500028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 444.21875, "completions/mean_terminated_length": 444.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1292445883154869, "epoch": 0.00436, "frac_reward_zero_std": 0.5, "grad_norm": 0.16957780718803406, "kl": 2.24883796274662, "learning_rate": 6.57764950631802e-05, "loss": 0.0136, "num_tokens": 9409255.0, "reward": 0.8014612197875977, "reward_std": 0.3284684419631958, "rewards/rollout_reward_func/mean": 0.8014612197875977, "rewards/rollout_reward_func/std": 0.4894164502620697, "sampling/importance_sampling_ratio/max": 1.2099711894989014, "sampling/importance_sampling_ratio/mean": 0.9718891382217407, "sampling/importance_sampling_ratio/min": 0.8186468482017517, "sampling/sampling_logp_difference/max": 0.17636823654174805, "sampling/sampling_logp_difference/mean": 0.00794858206063509, "step": 218, "step_time": 8.629906650000521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 444.75, "completions/mean_terminated_length": 444.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.059542165603488684, "epoch": 0.00438, "frac_reward_zero_std": 0.0, "grad_norm": 0.11695202440023422, "kl": 1.2414145842194557, "learning_rate": 6.572035239127041e-05, "loss": -0.0773, "num_tokens": 9453833.0, "reward": 0.7340176701545715, "reward_std": 0.5142470598220825, "rewards/rollout_reward_func/mean": 0.7340176701545715, "rewards/rollout_reward_func/std": 0.5328094363212585, "sampling/importance_sampling_ratio/max": 1.2117832899093628, "sampling/importance_sampling_ratio/mean": 0.9818223118782043, "sampling/importance_sampling_ratio/min": 0.8542850613594055, "sampling/sampling_logp_difference/max": 0.19948673248291016, "sampling/sampling_logp_difference/mean": 0.004604026675224304, "step": 219, "step_time": 9.53932645499981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 413.875, "completions/mean_terminated_length": 426.19354248046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.038059107726439834, "epoch": 0.0044, "frac_reward_zero_std": 0.0, "grad_norm": 0.10634060949087143, "kl": 1.2563352435827255, "learning_rate": 6.566386988743311e-05, "loss": -0.0882, "num_tokens": 9497657.0, "reward": 0.5674043893814087, "reward_std": 0.6257844567298889, "rewards/rollout_reward_func/mean": 0.5674043893814087, "rewards/rollout_reward_func/std": 0.6406186819076538, "sampling/importance_sampling_ratio/max": 1.0887833833694458, "sampling/importance_sampling_ratio/mean": 0.9937493205070496, "sampling/importance_sampling_ratio/min": 0.5241602659225464, "sampling/sampling_logp_difference/max": 0.3241729736328125, "sampling/sampling_logp_difference/mean": 0.002492698607966304, "step": 220, "step_time": 10.786571442999048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 431.78125, "completions/mean_terminated_length": 444.6773986816406, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0328872362151742, "epoch": 0.00442, "frac_reward_zero_std": 0.25, "grad_norm": 0.06259305775165558, "kl": 1.4149154275655746, "learning_rate": 6.560704836087037e-05, "loss": -0.0346, "num_tokens": 9542513.0, "reward": 0.8004412651062012, "reward_std": 0.4161609411239624, "rewards/rollout_reward_func/mean": 0.8004412651062012, "rewards/rollout_reward_func/std": 0.49286219477653503, "sampling/importance_sampling_ratio/max": 1.0322428941726685, "sampling/importance_sampling_ratio/mean": 0.9738302230834961, "sampling/importance_sampling_ratio/min": 0.7522615790367126, "sampling/sampling_logp_difference/max": 0.2471332550048828, "sampling/sampling_logp_difference/mean": 0.0022347013000398874, "step": 221, "step_time": 9.742999955999949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 384.375, "completions/mean_terminated_length": 407.86669921875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.015270332689397037, "epoch": 0.00444, "frac_reward_zero_std": 0.25, "grad_norm": 0.15543992817401886, "kl": 1.2219791412353516, "learning_rate": 6.55498886256413e-05, "loss": -0.046, "num_tokens": 9585059.0, "reward": 0.6083602905273438, "reward_std": 0.5643774271011353, "rewards/rollout_reward_func/mean": 0.6083602905273438, "rewards/rollout_reward_func/std": 0.6754488945007324, "sampling/importance_sampling_ratio/max": 1.0159441232681274, "sampling/importance_sampling_ratio/mean": 0.9805773496627808, "sampling/importance_sampling_ratio/min": 0.5930522680282593, "sampling/sampling_logp_difference/max": 0.41130197048187256, "sampling/sampling_logp_difference/mean": 0.001562439720146358, "step": 222, "step_time": 10.275746951000201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 453.125, "completions/mean_terminated_length": 466.70965576171875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.015934305149130523, "epoch": 0.00446, "frac_reward_zero_std": 0.0, "grad_norm": 0.1340453326702118, "kl": 1.288367159664631, "learning_rate": 6.549239150065038e-05, "loss": 0.0802, "num_tokens": 9629221.0, "reward": 0.7686713933944702, "reward_std": 0.4805063307285309, "rewards/rollout_reward_func/mean": 0.7686713933944702, "rewards/rollout_reward_func/std": 0.5098506808280945, "sampling/importance_sampling_ratio/max": 1.1302770376205444, "sampling/importance_sampling_ratio/mean": 1.0013370513916016, "sampling/importance_sampling_ratio/min": 0.9265798926353455, "sampling/sampling_logp_difference/max": 0.12491464614868164, "sampling/sampling_logp_difference/mean": 0.0010387648362666368, "step": 223, "step_time": 10.194640707999952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 415.09375, "completions/mean_terminated_length": 427.45159912109375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.030947560211643577, "epoch": 0.00448, "frac_reward_zero_std": 0.25, "grad_norm": 0.17147816717624664, "kl": 1.3192967027425766, "learning_rate": 6.543455780963577e-05, "loss": 0.0943, "num_tokens": 9672651.0, "reward": 0.8018882274627686, "reward_std": 0.427806556224823, "rewards/rollout_reward_func/mean": 0.8018882274627686, "rewards/rollout_reward_func/std": 0.4901379942893982, "sampling/importance_sampling_ratio/max": 1.2057719230651855, "sampling/importance_sampling_ratio/mean": 1.002761721611023, "sampling/importance_sampling_ratio/min": 0.8570718169212341, "sampling/sampling_logp_difference/max": 0.15554046630859375, "sampling/sampling_logp_difference/mean": 0.0022777426056563854, "step": 224, "step_time": 10.020922199999859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 348.9032287597656, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05414318712428212, "epoch": 0.0045, "frac_reward_zero_std": 0.25, "grad_norm": 0.4855145215988159, "kl": 1.2459043711423874, "learning_rate": 6.537638838115743e-05, "loss": 0.1237, "num_tokens": 9711704.0, "reward": 0.7688395977020264, "reward_std": 0.422979474067688, "rewards/rollout_reward_func/mean": 0.7688395977020264, "rewards/rollout_reward_func/std": 0.5114967823028564, "sampling/importance_sampling_ratio/max": 1.4629011154174805, "sampling/importance_sampling_ratio/mean": 1.0197663307189941, "sampling/importance_sampling_ratio/min": 0.8414681553840637, "sampling/sampling_logp_difference/max": 0.2432692050933838, "sampling/sampling_logp_difference/mean": 0.0042849211022257805, "step": 225, "step_time": 10.401777036999647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 390.21875, "completions/mean_terminated_length": 412.5000305175781, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.020928325364366174, "epoch": 0.00452, "frac_reward_zero_std": 0.25, "grad_norm": 0.285619854927063, "kl": 1.3021946847438812, "learning_rate": 6.531788404858531e-05, "loss": 0.1588, "num_tokens": 9753703.0, "reward": 0.6870023012161255, "reward_std": 0.5494800209999084, "rewards/rollout_reward_func/mean": 0.6870023012161255, "rewards/rollout_reward_func/std": 0.6871160268783569, "sampling/importance_sampling_ratio/max": 1.1274466514587402, "sampling/importance_sampling_ratio/mean": 0.9910320043563843, "sampling/importance_sampling_ratio/min": 0.7720257043838501, "sampling/sampling_logp_difference/max": 0.25953805446624756, "sampling/sampling_logp_difference/mean": 0.001526001957245171, "step": 226, "step_time": 9.969494539999687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 439.625, "completions/mean_terminated_length": 466.8000183105469, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.01511647435836494, "epoch": 0.00454, "frac_reward_zero_std": 0.25, "grad_norm": 0.16454598307609558, "kl": 1.372649073600769, "learning_rate": 6.525904565008741e-05, "loss": 0.1382, "num_tokens": 9797301.0, "reward": 0.6399471759796143, "reward_std": 0.5228070020675659, "rewards/rollout_reward_func/mean": 0.6399471759796143, "rewards/rollout_reward_func/std": 0.6187947988510132, "sampling/importance_sampling_ratio/max": 1.1005831956863403, "sampling/importance_sampling_ratio/mean": 0.9975316524505615, "sampling/importance_sampling_ratio/min": 0.8069126605987549, "sampling/sampling_logp_difference/max": 0.23482537269592285, "sampling/sampling_logp_difference/mean": 0.0011870869202539325, "step": 227, "step_time": 9.883564939999815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 418.90625, "completions/mean_terminated_length": 444.70001220703125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.007411483762552962, "epoch": 0.00456, "frac_reward_zero_std": 0.0, "grad_norm": 0.07721781730651855, "kl": 1.4116452932357788, "learning_rate": 6.519987402861773e-05, "loss": 0.1773, "num_tokens": 9841195.0, "reward": 0.5416731238365173, "reward_std": 0.617161214351654, "rewards/rollout_reward_func/mean": 0.5416731238365173, "rewards/rollout_reward_func/std": 0.6357067823410034, "sampling/importance_sampling_ratio/max": 1.0106494426727295, "sampling/importance_sampling_ratio/mean": 0.9871832132339478, "sampling/importance_sampling_ratio/min": 0.6838159561157227, "sampling/sampling_logp_difference/max": 0.3798474073410034, "sampling/sampling_logp_difference/mean": 0.0007951428415253758, "step": 228, "step_time": 9.75437322200014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 457.96875, "completions/mean_terminated_length": 457.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0037640260125044733, "epoch": 0.00458, "frac_reward_zero_std": 0.0, "grad_norm": 0.015067197382450104, "kl": 1.4324913024902344, "learning_rate": 6.514037003190428e-05, "loss": 0.0853, "num_tokens": 9885669.0, "reward": 0.7625328898429871, "reward_std": 0.45770537853240967, "rewards/rollout_reward_func/mean": 0.7625328898429871, "rewards/rollout_reward_func/std": 0.4560849964618683, "sampling/importance_sampling_ratio/max": 1.0177735090255737, "sampling/importance_sampling_ratio/mean": 0.9996566772460938, "sampling/importance_sampling_ratio/min": 0.9814097881317139, "sampling/sampling_logp_difference/max": 0.018286503851413727, "sampling/sampling_logp_difference/mean": 0.00010163441766053438, "step": 229, "step_time": 10.456702469999072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 439.5625, "completions/mean_terminated_length": 439.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.021693755057640374, "epoch": 0.0046, "frac_reward_zero_std": 0.25, "grad_norm": 0.1580529510974884, "kl": 1.4338557124137878, "learning_rate": 6.508053451243681e-05, "loss": 0.1392, "num_tokens": 9929849.0, "reward": 0.764330267906189, "reward_std": 0.3637913465499878, "rewards/rollout_reward_func/mean": 0.764330267906189, "rewards/rollout_reward_func/std": 0.4525086283683777, "sampling/importance_sampling_ratio/max": 1.0050158500671387, "sampling/importance_sampling_ratio/mean": 0.9908139109611511, "sampling/importance_sampling_ratio/min": 0.8274100422859192, "sampling/sampling_logp_difference/max": 0.1891191303730011, "sampling/sampling_logp_difference/mean": 0.000550256110727787, "step": 230, "step_time": 10.664491079000527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 365.71875, "completions/mean_terminated_length": 387.9666748046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.021960796089842916, "epoch": 0.00462, "frac_reward_zero_std": 0.25, "grad_norm": 0.06384147703647614, "kl": 1.5252776145935059, "learning_rate": 6.502036832745469e-05, "loss": 0.0592, "num_tokens": 9971473.0, "reward": 0.6718032360076904, "reward_std": 0.5026345252990723, "rewards/rollout_reward_func/mean": 0.6718032360076904, "rewards/rollout_reward_func/std": 0.6084029078483582, "sampling/importance_sampling_ratio/max": 1.0191559791564941, "sampling/importance_sampling_ratio/mean": 0.9894522428512573, "sampling/importance_sampling_ratio/min": 0.8510310649871826, "sampling/sampling_logp_difference/max": 0.16368746757507324, "sampling/sampling_logp_difference/mean": 0.0007452635909430683, "step": 231, "step_time": 9.85717811099903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 455.5625, "completions/mean_terminated_length": 469.2257995605469, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.026700853370130062, "epoch": 0.00464, "frac_reward_zero_std": 0.0, "grad_norm": 0.23785929381847382, "kl": 1.5571517646312714, "learning_rate": 6.495987233893458e-05, "loss": 0.1292, "num_tokens": 10015964.0, "reward": 0.7003435492515564, "reward_std": 0.5268506407737732, "rewards/rollout_reward_func/mean": 0.7003435492515564, "rewards/rollout_reward_func/std": 0.5493570566177368, "sampling/importance_sampling_ratio/max": 1.1704710721969604, "sampling/importance_sampling_ratio/mean": 1.006547451019287, "sampling/importance_sampling_ratio/min": 0.8503082990646362, "sampling/sampling_logp_difference/max": 0.1641693115234375, "sampling/sampling_logp_difference/mean": 0.0016930445563048124, "step": 232, "step_time": 10.499969933999637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 486.84375, "completions/mean_terminated_length": 486.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.022194642340764403, "epoch": 0.00466, "frac_reward_zero_std": 0.0, "grad_norm": 0.22000561654567719, "kl": 1.546009287238121, "learning_rate": 6.489904741357817e-05, "loss": -0.0238, "num_tokens": 10062400.0, "reward": 0.7300143241882324, "reward_std": 0.5000483989715576, "rewards/rollout_reward_func/mean": 0.7300143241882324, "rewards/rollout_reward_func/std": 0.4752616584300995, "sampling/importance_sampling_ratio/max": 1.12747061252594, "sampling/importance_sampling_ratio/mean": 1.0090688467025757, "sampling/importance_sampling_ratio/min": 0.9103929996490479, "sampling/sampling_logp_difference/max": 0.10493659973144531, "sampling/sampling_logp_difference/mean": 0.0011490816250443459, "step": 233, "step_time": 10.753715330999512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 479.46875, "completions/mean_terminated_length": 493.2333679199219, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.024705523625016212, "epoch": 0.00468, "frac_reward_zero_std": 0.0, "grad_norm": 0.6147395968437195, "kl": 1.5489514544606209, "learning_rate": 6.483789442279961e-05, "loss": 0.0807, "num_tokens": 10108427.0, "reward": 0.5410307049751282, "reward_std": 0.6161845326423645, "rewards/rollout_reward_func/mean": 0.5410307049751282, "rewards/rollout_reward_func/std": 0.6369966268539429, "sampling/importance_sampling_ratio/max": 1.3398884534835815, "sampling/importance_sampling_ratio/mean": 1.00101900100708, "sampling/importance_sampling_ratio/min": 0.6520875692367554, "sampling/sampling_logp_difference/max": 0.2924211025238037, "sampling/sampling_logp_difference/mean": 0.0018652037251740694, "step": 234, "step_time": 11.504729888000384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 463.96875, "completions/mean_terminated_length": 463.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.024365806952118874, "epoch": 0.0047, "frac_reward_zero_std": 0.25, "grad_norm": 0.123259037733078, "kl": 1.6395191848278046, "learning_rate": 6.477641424271316e-05, "loss": 0.1124, "num_tokens": 10153450.0, "reward": 0.7703279256820679, "reward_std": 0.4095226228237152, "rewards/rollout_reward_func/mean": 0.7703279256820679, "rewards/rollout_reward_func/std": 0.5074303150177002, "sampling/importance_sampling_ratio/max": 1.553205966949463, "sampling/importance_sampling_ratio/mean": 1.0345032215118408, "sampling/importance_sampling_ratio/min": 0.8816506862640381, "sampling/sampling_logp_difference/max": 0.4402703046798706, "sampling/sampling_logp_difference/mean": 0.0031826975755393505, "step": 235, "step_time": 10.561447424999642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 436.375, "completions/mean_terminated_length": 436.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.009311198024079204, "epoch": 0.00472, "frac_reward_zero_std": 0.0, "grad_norm": 0.005134935490787029, "kl": 2.4463338255882263, "learning_rate": 6.47146077541206e-05, "loss": 0.0061, "num_tokens": 10197568.0, "reward": 0.7079228162765503, "reward_std": 0.5885865092277527, "rewards/rollout_reward_func/mean": 0.7079228162765503, "rewards/rollout_reward_func/std": 0.5936511754989624, "sampling/importance_sampling_ratio/max": 1.0054621696472168, "sampling/importance_sampling_ratio/mean": 0.9996936917304993, "sampling/importance_sampling_ratio/min": 0.9905997514724731, "sampling/sampling_logp_difference/max": 0.009525632485747337, "sampling/sampling_logp_difference/mean": 0.00015923452156130224, "step": 236, "step_time": 8.518732483999884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 435.125, "completions/mean_terminated_length": 435.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.004507428966462612, "epoch": 0.00474, "frac_reward_zero_std": 0.25, "grad_norm": 0.0015722453827038407, "kl": 2.5231307446956635, "learning_rate": 6.465247584249853e-05, "loss": 0.0134, "num_tokens": 10242289.0, "reward": 0.7091089487075806, "reward_std": 0.5003358125686646, "rewards/rollout_reward_func/mean": 0.7091089487075806, "rewards/rollout_reward_func/std": 0.5906311273574829, "sampling/importance_sampling_ratio/max": 1.0048024654388428, "sampling/importance_sampling_ratio/mean": 1.0001246929168701, "sampling/importance_sampling_ratio/min": 0.9977025389671326, "sampling/sampling_logp_difference/max": 0.004082078114151955, "sampling/sampling_logp_difference/mean": 7.437775639118627e-05, "step": 237, "step_time": 8.377104144999066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 407.3125, "completions/mean_terminated_length": 407.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.004164711193880066, "epoch": 0.00476, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024398942478001118, "kl": 2.3346392810344696, "learning_rate": 6.459001939798584e-05, "loss": 0.0271, "num_tokens": 10284913.0, "reward": 0.7024825811386108, "reward_std": 0.5023645162582397, "rewards/rollout_reward_func/mean": 0.7024825811386108, "rewards/rollout_reward_func/std": 0.5459490418434143, "sampling/importance_sampling_ratio/max": 1.0019805431365967, "sampling/importance_sampling_ratio/mean": 1.0002102851867676, "sampling/importance_sampling_ratio/min": 0.9989463090896606, "sampling/sampling_logp_difference/max": 0.0020096921361982822, "sampling/sampling_logp_difference/mean": 7.04285193933174e-05, "step": 238, "step_time": 8.051481960999354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 489.3125, "completions/mean_terminated_length": 489.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0041988667217083275, "epoch": 0.00478, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014634397812187672, "kl": 2.188842386007309, "learning_rate": 6.452723931537084e-05, "loss": 0.0247, "num_tokens": 10330611.0, "reward": 0.6363575458526611, "reward_std": 0.5442573428153992, "rewards/rollout_reward_func/mean": 0.6363575458526611, "rewards/rollout_reward_func/std": 0.5687863230705261, "sampling/importance_sampling_ratio/max": 1.0009722709655762, "sampling/importance_sampling_ratio/mean": 1.0000011920928955, "sampling/importance_sampling_ratio/min": 0.9971250891685486, "sampling/sampling_logp_difference/max": 0.0027593541890382767, "sampling/sampling_logp_difference/mean": 6.252797174965963e-05, "step": 239, "step_time": 9.07281144200033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 402.34375, "completions/mean_terminated_length": 402.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.003171816118992865, "epoch": 0.0048, "frac_reward_zero_std": 0.25, "grad_norm": 0.0006657041958533227, "kl": 2.2885120809078217, "learning_rate": 6.446413649407845e-05, "loss": 0.0257, "num_tokens": 10372257.0, "reward": 0.8324346542358398, "reward_std": 0.34325742721557617, "rewards/rollout_reward_func/mean": 0.8324346542358398, "rewards/rollout_reward_func/std": 0.3956550657749176, "sampling/importance_sampling_ratio/max": 1.0009502172470093, "sampling/importance_sampling_ratio/mean": 1.0000749826431274, "sampling/importance_sampling_ratio/min": 0.9986613392829895, "sampling/sampling_logp_difference/max": 0.0016375163104385138, "sampling/sampling_logp_difference/mean": 4.942509622196667e-05, "step": 240, "step_time": 8.406777945999238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 383.40625, "completions/mean_terminated_length": 383.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.003844678867608309, "epoch": 0.00482, "frac_reward_zero_std": 0.25, "grad_norm": 0.002024407032877207, "kl": 2.130301773548126, "learning_rate": 6.440071183815737e-05, "loss": 0.0257, "num_tokens": 10413541.0, "reward": 0.7667964696884155, "reward_std": 0.3695795238018036, "rewards/rollout_reward_func/mean": 0.7667964696884155, "rewards/rollout_reward_func/std": 0.44809257984161377, "sampling/importance_sampling_ratio/max": 1.001082181930542, "sampling/importance_sampling_ratio/mean": 1.0000512599945068, "sampling/importance_sampling_ratio/min": 0.9991231560707092, "sampling/sampling_logp_difference/max": 0.000864407978951931, "sampling/sampling_logp_difference/mean": 4.5884349674452096e-05, "step": 241, "step_time": 8.002086370001052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 479.75, "completions/mean_terminated_length": 479.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.003334205161081627, "epoch": 0.00484, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014854873297736049, "kl": 2.552787348628044, "learning_rate": 6.433696625626706e-05, "loss": 0.0371, "num_tokens": 10459353.0, "reward": 0.6974210143089294, "reward_std": 0.4973956048488617, "rewards/rollout_reward_func/mean": 0.6974210143089294, "rewards/rollout_reward_func/std": 0.49145758152008057, "sampling/importance_sampling_ratio/max": 1.00095534324646, "sampling/importance_sampling_ratio/mean": 1.000216007232666, "sampling/importance_sampling_ratio/min": 0.99976646900177, "sampling/sampling_logp_difference/max": 0.0008187387138605118, "sampling/sampling_logp_difference/mean": 4.765057019540109e-05, "step": 242, "step_time": 8.430309567999302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 533.03125, "completions/mean_terminated_length": 533.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.003812253737123683, "epoch": 0.00486, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009188986150547862, "kl": 2.498543679714203, "learning_rate": 6.42729006616648e-05, "loss": 0.0285, "num_tokens": 10507664.0, "reward": 0.6634383201599121, "reward_std": 0.5285376310348511, "rewards/rollout_reward_func/mean": 0.6634383201599121, "rewards/rollout_reward_func/std": 0.5073186159133911, "sampling/importance_sampling_ratio/max": 1.0010195970535278, "sampling/importance_sampling_ratio/mean": 1.0001513957977295, "sampling/importance_sampling_ratio/min": 0.9990923404693604, "sampling/sampling_logp_difference/max": 0.0008008028380572796, "sampling/sampling_logp_difference/mean": 5.300189513945952e-05, "step": 243, "step_time": 8.457962305999445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 438.28125, "completions/mean_terminated_length": 438.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.006378606252837926, "epoch": 0.00488, "frac_reward_zero_std": 0.0, "grad_norm": 0.010866153053939342, "kl": 2.4677091240882874, "learning_rate": 6.420851597219251e-05, "loss": 0.0193, "num_tokens": 10551862.0, "reward": 0.7028543949127197, "reward_std": 0.5344079732894897, "rewards/rollout_reward_func/mean": 0.7028543949127197, "rewards/rollout_reward_func/std": 0.5439242720603943, "sampling/importance_sampling_ratio/max": 1.0302919149398804, "sampling/importance_sampling_ratio/mean": 1.001118540763855, "sampling/importance_sampling_ratio/min": 0.9988480806350708, "sampling/sampling_logp_difference/max": 0.030303478240966797, "sampling/sampling_logp_difference/mean": 0.0002510591584723443, "step": 244, "step_time": 8.779784452000058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 392.28125, "completions/mean_terminated_length": 392.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.005062321724835783, "epoch": 0.0049, "frac_reward_zero_std": 0.0, "grad_norm": 0.001358396839350462, "kl": 2.4237941205501556, "learning_rate": 6.414381311026373e-05, "loss": 0.0222, "num_tokens": 10594246.0, "reward": 0.6011388301849365, "reward_std": 0.5701137781143188, "rewards/rollout_reward_func/mean": 0.6011388301849365, "rewards/rollout_reward_func/std": 0.5804308652877808, "sampling/importance_sampling_ratio/max": 1.0009706020355225, "sampling/importance_sampling_ratio/mean": 0.9999701976776123, "sampling/importance_sampling_ratio/min": 0.9978493452072144, "sampling/sampling_logp_difference/max": 0.00260159932076931, "sampling/sampling_logp_difference/mean": 6.5814791014418e-05, "step": 245, "step_time": 8.686353241000234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 345.46875, "completions/mean_terminated_length": 345.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.006576828105608001, "epoch": 0.00492, "frac_reward_zero_std": 0.0, "grad_norm": 0.003218095749616623, "kl": 2.2055741399526596, "learning_rate": 6.407879300285024e-05, "loss": 0.0064, "num_tokens": 10633933.0, "reward": 0.6118491888046265, "reward_std": 0.6905747652053833, "rewards/rollout_reward_func/mean": 0.6118491888046265, "rewards/rollout_reward_func/std": 0.6713321208953857, "sampling/importance_sampling_ratio/max": 1.0009870529174805, "sampling/importance_sampling_ratio/mean": 1.0000510215759277, "sampling/importance_sampling_ratio/min": 0.9985733032226562, "sampling/sampling_logp_difference/max": 0.0015099290758371353, "sampling/sampling_logp_difference/mean": 9.101521573029459e-05, "step": 246, "step_time": 8.038291103000574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 352.84375, "completions/mean_terminated_length": 352.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.009017245436552912, "epoch": 0.00494, "frac_reward_zero_std": 0.0, "grad_norm": 0.2796958386898041, "kl": 2.5517167150974274, "learning_rate": 6.401345658146892e-05, "loss": 0.008, "num_tokens": 10674875.0, "reward": 0.5724894404411316, "reward_std": 0.6401242017745972, "rewards/rollout_reward_func/mean": 0.5724894404411316, "rewards/rollout_reward_func/std": 0.6327422857284546, "sampling/importance_sampling_ratio/max": 1.2607884407043457, "sampling/importance_sampling_ratio/mean": 1.008306860923767, "sampling/importance_sampling_ratio/min": 0.9992477297782898, "sampling/sampling_logp_difference/max": 0.24675512313842773, "sampling/sampling_logp_difference/mean": 0.0015485573094338179, "step": 247, "step_time": 8.027744080999128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 522.625, "completions/mean_terminated_length": 522.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.011792385601438582, "epoch": 0.00496, "frac_reward_zero_std": 0.0, "grad_norm": 0.006360654719173908, "kl": 2.609351262450218, "learning_rate": 6.394780478216836e-05, "loss": 0.0351, "num_tokens": 10722688.0, "reward": 0.6631276607513428, "reward_std": 0.5137397646903992, "rewards/rollout_reward_func/mean": 0.6631276607513428, "rewards/rollout_reward_func/std": 0.507660448551178, "sampling/importance_sampling_ratio/max": 1.002481460571289, "sampling/importance_sampling_ratio/mean": 1.000439167022705, "sampling/importance_sampling_ratio/min": 0.9949095845222473, "sampling/sampling_logp_difference/max": 0.004916932433843613, "sampling/sampling_logp_difference/mean": 0.00021457942784763873, "step": 248, "step_time": 8.448373940999772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 448.09375, "completions/mean_terminated_length": 448.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.027103576867375523, "epoch": 0.00498, "frac_reward_zero_std": 0.25, "grad_norm": 0.03137468546628952, "kl": 2.365100249648094, "learning_rate": 6.388183854551542e-05, "loss": 0.0195, "num_tokens": 10766627.0, "reward": 0.6672890186309814, "reward_std": 0.45543742179870605, "rewards/rollout_reward_func/mean": 0.6672890186309814, "rewards/rollout_reward_func/std": 0.5624145269393921, "sampling/importance_sampling_ratio/max": 1.0227241516113281, "sampling/importance_sampling_ratio/mean": 0.999199390411377, "sampling/importance_sampling_ratio/min": 0.9412746429443359, "sampling/sampling_logp_difference/max": 0.05899608135223389, "sampling/sampling_logp_difference/mean": 0.000636872835457325, "step": 249, "step_time": 8.822670563999964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 507.75, "completions/mean_terminated_length": 507.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03771817509550601, "epoch": 0.005, "frac_reward_zero_std": 0.0, "grad_norm": 0.07511385530233383, "kl": 2.580221176147461, "learning_rate": 6.381555881658178e-05, "loss": 0.032, "num_tokens": 10813811.0, "reward": 0.6369813680648804, "reward_std": 0.5616729259490967, "rewards/rollout_reward_func/mean": 0.6369813680648804, "rewards/rollout_reward_func/std": 0.5679377913475037, "sampling/importance_sampling_ratio/max": 1.0288165807724, "sampling/importance_sampling_ratio/mean": 0.9974526166915894, "sampling/importance_sampling_ratio/min": 0.9610143303871155, "sampling/sampling_logp_difference/max": 0.03643842041492462, "sampling/sampling_logp_difference/mean": 0.000941781559959054, "step": 250, "step_time": 9.015214557000945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 519.5, "completions/mean_terminated_length": 519.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08551338268443942, "epoch": 0.00502, "frac_reward_zero_std": 0.0, "grad_norm": 0.2990966737270355, "kl": 2.629890114068985, "learning_rate": 6.37489665449304e-05, "loss": 0.037, "num_tokens": 10861235.0, "reward": 0.7642848491668701, "reward_std": 0.4549909234046936, "rewards/rollout_reward_func/mean": 0.7642848491668701, "rewards/rollout_reward_func/std": 0.4526221454143524, "sampling/importance_sampling_ratio/max": 1.2804147005081177, "sampling/importance_sampling_ratio/mean": 1.017478346824646, "sampling/importance_sampling_ratio/min": 0.9282390475273132, "sampling/sampling_logp_difference/max": 0.240570068359375, "sampling/sampling_logp_difference/mean": 0.005271648522466421, "step": 251, "step_time": 8.963683867999862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 426.0625, "completions/mean_terminated_length": 426.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08127801516093314, "epoch": 0.00504, "frac_reward_zero_std": 0.0, "grad_norm": 0.10498397052288055, "kl": 2.610559716820717, "learning_rate": 6.36820626846019e-05, "loss": 0.0181, "num_tokens": 10905625.0, "reward": 0.4774540364742279, "reward_std": 0.6595019102096558, "rewards/rollout_reward_func/mean": 0.4774540364742279, "rewards/rollout_reward_func/std": 0.6870877146720886, "sampling/importance_sampling_ratio/max": 1.0907483100891113, "sampling/importance_sampling_ratio/mean": 1.0015499591827393, "sampling/importance_sampling_ratio/min": 0.9328101277351379, "sampling/sampling_logp_difference/max": 0.08904814720153809, "sampling/sampling_logp_difference/mean": 0.0022453509736806154, "step": 252, "step_time": 8.646530514999995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 357.40625, "completions/mean_terminated_length": 357.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09417299751657993, "epoch": 0.00506, "frac_reward_zero_std": 0.25, "grad_norm": 0.1555328369140625, "kl": 2.163857474923134, "learning_rate": 6.361484819410094e-05, "loss": 0.0081, "num_tokens": 10946742.0, "reward": 0.8447847962379456, "reward_std": 0.43901485204696655, "rewards/rollout_reward_func/mean": 0.8447847962379456, "rewards/rollout_reward_func/std": 0.5091371536254883, "sampling/importance_sampling_ratio/max": 1.0059832334518433, "sampling/importance_sampling_ratio/mean": 0.9964969158172607, "sampling/importance_sampling_ratio/min": 0.9154714345932007, "sampling/sampling_logp_difference/max": 0.09332478046417236, "sampling/sampling_logp_difference/mean": 0.002493849955499172, "step": 253, "step_time": 8.06065582699921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 318.5625, "completions/mean_terminated_length": 318.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07950882520526648, "epoch": 0.00508, "frac_reward_zero_std": 0.0, "grad_norm": 0.18658527731895447, "kl": 2.203503891825676, "learning_rate": 6.354732403638242e-05, "loss": 0.0197, "num_tokens": 10985698.0, "reward": 0.7061766386032104, "reward_std": 0.5973896384239197, "rewards/rollout_reward_func/mean": 0.7061766386032104, "rewards/rollout_reward_func/std": 0.5971222519874573, "sampling/importance_sampling_ratio/max": 1.1240532398223877, "sampling/importance_sampling_ratio/mean": 1.0026638507843018, "sampling/importance_sampling_ratio/min": 0.8900558352470398, "sampling/sampling_logp_difference/max": 0.12197208404541016, "sampling/sampling_logp_difference/mean": 0.0041364324279129505, "step": 254, "step_time": 8.715509789000862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 365.125, "completions/mean_terminated_length": 365.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04349161102436483, "epoch": 0.0051, "frac_reward_zero_std": 0.0, "grad_norm": 0.19359737634658813, "kl": 2.147986263036728, "learning_rate": 6.347949117883772e-05, "loss": 0.0134, "num_tokens": 11026728.0, "reward": 0.7340857982635498, "reward_std": 0.510056734085083, "rewards/rollout_reward_func/mean": 0.7340857982635498, "rewards/rollout_reward_func/std": 0.5311858654022217, "sampling/importance_sampling_ratio/max": 1.1334878206253052, "sampling/importance_sampling_ratio/mean": 0.9958428144454956, "sampling/importance_sampling_ratio/min": 0.7480095028877258, "sampling/sampling_logp_difference/max": 0.3315223455429077, "sampling/sampling_logp_difference/mean": 0.0036376367788761854, "step": 255, "step_time": 8.962371794000774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 495.5, "completions/mean_terminated_length": 495.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08672897517681122, "epoch": 0.00512, "frac_reward_zero_std": 0.0, "grad_norm": 0.15367817878723145, "kl": 2.461927816271782, "learning_rate": 6.34113505932809e-05, "loss": 0.0116, "num_tokens": 11073802.0, "reward": 0.5399074554443359, "reward_std": 0.642728328704834, "rewards/rollout_reward_func/mean": 0.5399074554443359, "rewards/rollout_reward_func/std": 0.6378235816955566, "sampling/importance_sampling_ratio/max": 1.105070948600769, "sampling/importance_sampling_ratio/mean": 0.9966817498207092, "sampling/importance_sampling_ratio/min": 0.8176730275154114, "sampling/sampling_logp_difference/max": 0.23705053329467773, "sampling/sampling_logp_difference/mean": 0.002929958514869213, "step": 256, "step_time": 9.01389638899991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 449.8125, "completions/mean_terminated_length": 449.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08130422490648925, "epoch": 0.00514, "frac_reward_zero_std": 0.0, "grad_norm": 0.12242479622364044, "kl": 2.165251702070236, "learning_rate": 6.334290325593458e-05, "loss": 0.0179, "num_tokens": 11118260.0, "reward": 0.8377646207809448, "reward_std": 0.458871066570282, "rewards/rollout_reward_func/mean": 0.8377646207809448, "rewards/rollout_reward_func/std": 0.45766669511795044, "sampling/importance_sampling_ratio/max": 1.0386099815368652, "sampling/importance_sampling_ratio/mean": 0.9935059547424316, "sampling/importance_sampling_ratio/min": 0.9203097820281982, "sampling/sampling_logp_difference/max": 0.0923079252243042, "sampling/sampling_logp_difference/mean": 0.0024042311124503613, "step": 257, "step_time": 8.62596835399927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 489.15625, "completions/mean_terminated_length": 489.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07127884565852582, "epoch": 0.00516, "frac_reward_zero_std": 0.0, "grad_norm": 0.11366286128759384, "kl": 2.725199058651924, "learning_rate": 6.327415014741625e-05, "loss": 0.0154, "num_tokens": 11165713.0, "reward": 0.49701908230781555, "reward_std": 0.6157691478729248, "rewards/rollout_reward_func/mean": 0.49701908230781555, "rewards/rollout_reward_func/std": 0.5994330048561096, "sampling/importance_sampling_ratio/max": 1.035167932510376, "sampling/importance_sampling_ratio/mean": 0.9995943307876587, "sampling/importance_sampling_ratio/min": 0.9140203595161438, "sampling/sampling_logp_difference/max": 0.08991479873657227, "sampling/sampling_logp_difference/mean": 0.0016419283347204328, "step": 258, "step_time": 9.016928541999732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 393.4375, "completions/mean_terminated_length": 393.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.083243687171489, "epoch": 0.00518, "frac_reward_zero_std": 0.0, "grad_norm": 0.06341417878866196, "kl": 2.4599571377038956, "learning_rate": 6.320509225272394e-05, "loss": 0.011, "num_tokens": 11208524.0, "reward": 0.6727246046066284, "reward_std": 0.5735231637954712, "rewards/rollout_reward_func/mean": 0.6727246046066284, "rewards/rollout_reward_func/std": 0.608762264251709, "sampling/importance_sampling_ratio/max": 1.0326348543167114, "sampling/importance_sampling_ratio/mean": 1.0011831521987915, "sampling/importance_sampling_ratio/min": 0.9268929362297058, "sampling/sampling_logp_difference/max": 0.07591855525970459, "sampling/sampling_logp_difference/mean": 0.0017163517186418176, "step": 259, "step_time": 8.134221903999787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 445.875, "completions/mean_terminated_length": 445.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.02960486034862697, "epoch": 0.0052, "frac_reward_zero_std": 0.0, "grad_norm": 0.006647562142461538, "kl": 2.6571558713912964, "learning_rate": 6.313573056122227e-05, "loss": 0.0193, "num_tokens": 11253564.0, "reward": 0.7014866471290588, "reward_std": 0.5378434062004089, "rewards/rollout_reward_func/mean": 0.7014866471290588, "rewards/rollout_reward_func/std": 0.5460693836212158, "sampling/importance_sampling_ratio/max": 1.007468581199646, "sampling/importance_sampling_ratio/mean": 1.0000649690628052, "sampling/importance_sampling_ratio/min": 0.9947108626365662, "sampling/sampling_logp_difference/max": 0.006557689979672432, "sampling/sampling_logp_difference/mean": 0.00048389993025921285, "step": 260, "step_time": 8.868549499999972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 527.375, "completions/mean_terminated_length": 527.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06631899275816977, "epoch": 0.00522, "frac_reward_zero_std": 0.0, "grad_norm": 0.07997402548789978, "kl": 2.559489354491234, "learning_rate": 6.306606606662827e-05, "loss": 0.0183, "num_tokens": 11301515.0, "reward": 0.5587782859802246, "reward_std": 0.5373678207397461, "rewards/rollout_reward_func/mean": 0.5587782859802246, "rewards/rollout_reward_func/std": 0.5420159697532654, "sampling/importance_sampling_ratio/max": 1.0105793476104736, "sampling/importance_sampling_ratio/mean": 0.9896367192268372, "sampling/importance_sampling_ratio/min": 0.6963644027709961, "sampling/sampling_logp_difference/max": 0.3555893898010254, "sampling/sampling_logp_difference/mean": 0.005080785136669874, "step": 261, "step_time": 8.726615998000398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 389.875, "completions/mean_terminated_length": 389.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.12808919791132212, "epoch": 0.00524, "frac_reward_zero_std": 0.0, "grad_norm": 0.27760031819343567, "kl": 2.5073808431625366, "learning_rate": 6.299609976699705e-05, "loss": -0.0077, "num_tokens": 11344853.0, "reward": 0.5761151313781738, "reward_std": 0.6842832565307617, "rewards/rollout_reward_func/mean": 0.5761151313781738, "rewards/rollout_reward_func/std": 0.6808129549026489, "sampling/importance_sampling_ratio/max": 1.665401577949524, "sampling/importance_sampling_ratio/mean": 1.016595482826233, "sampling/importance_sampling_ratio/min": 0.7873161435127258, "sampling/sampling_logp_difference/max": 0.25197267532348633, "sampling/sampling_logp_difference/mean": 0.00808113906532526, "step": 262, "step_time": 8.619374897999478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 325.8125, "completions/mean_terminated_length": 325.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.12187617272138596, "epoch": 0.00526, "frac_reward_zero_std": 0.0, "grad_norm": 0.16854622960090637, "kl": 2.1459820568561554, "learning_rate": 6.292583266470758e-05, "loss": -0.0174, "num_tokens": 11384067.0, "reward": 0.6699318885803223, "reward_std": 0.5426480770111084, "rewards/rollout_reward_func/mean": 0.6699318885803223, "rewards/rollout_reward_func/std": 0.6126158833503723, "sampling/importance_sampling_ratio/max": 1.0891118049621582, "sampling/importance_sampling_ratio/mean": 0.990691065788269, "sampling/importance_sampling_ratio/min": 0.7542006373405457, "sampling/sampling_logp_difference/max": 0.2821618914604187, "sampling/sampling_logp_difference/mean": 0.005715424660593271, "step": 263, "step_time": 9.196162427000218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 354.03125, "completions/mean_terminated_length": 354.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.14860437694005668, "epoch": 0.00528, "frac_reward_zero_std": 0.0, "grad_norm": 0.2296353131532669, "kl": 2.424390956759453, "learning_rate": 6.285526576644831e-05, "loss": 0.0174, "num_tokens": 11424607.0, "reward": 0.6729596257209778, "reward_std": 0.626765251159668, "rewards/rollout_reward_func/mean": 0.6729596257209778, "rewards/rollout_reward_func/std": 0.6096981763839722, "sampling/importance_sampling_ratio/max": 1.0197169780731201, "sampling/importance_sampling_ratio/mean": 0.9937441349029541, "sampling/importance_sampling_ratio/min": 0.8337488174438477, "sampling/sampling_logp_difference/max": 0.10154008865356445, "sampling/sampling_logp_difference/mean": 0.0025921124033629894, "step": 264, "step_time": 9.05147826800021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 471.1875, "completions/mean_terminated_length": 471.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1506953351199627, "epoch": 0.0053, "frac_reward_zero_std": 0.0, "grad_norm": 0.19616885483264923, "kl": 2.1333375424146652, "learning_rate": 6.278440008320275e-05, "loss": -0.0074, "num_tokens": 11469071.0, "reward": 0.7288044095039368, "reward_std": 0.48746949434280396, "rewards/rollout_reward_func/mean": 0.7288044095039368, "rewards/rollout_reward_func/std": 0.4772927761077881, "sampling/importance_sampling_ratio/max": 1.2056735754013062, "sampling/importance_sampling_ratio/mean": 0.9954376220703125, "sampling/importance_sampling_ratio/min": 0.9441303014755249, "sampling/sampling_logp_difference/max": 0.22346091270446777, "sampling/sampling_logp_difference/mean": 0.0047704922035336494, "step": 265, "step_time": 8.858129580999503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 235.65625, "completions/mean_terminated_length": 249.2333526611328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3863384108990431, "epoch": 0.00532, "frac_reward_zero_std": 0.0, "grad_norm": 1317.490234375, "kl": 704.968816563487, "learning_rate": 6.271323663023497e-05, "loss": 5.3659, "num_tokens": 11505188.0, "reward": 0.731412410736084, "reward_std": 0.5413020849227905, "rewards/rollout_reward_func/mean": 0.731412410736084, "rewards/rollout_reward_func/std": 0.5355080366134644, "sampling/importance_sampling_ratio/max": 1.1163909435272217, "sampling/importance_sampling_ratio/mean": 0.9165816903114319, "sampling/importance_sampling_ratio/min": 5.612208520801496e-24, "sampling/sampling_logp_difference/max": 38.09370803833008, "sampling/sampling_logp_difference/mean": 0.11862760037183762, "step": 266, "step_time": 9.268076851999922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 358.96875, "completions/mean_terminated_length": 421.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6393178142607212, "epoch": 0.00534, "frac_reward_zero_std": 0.0, "grad_norm": 2198.415283203125, "kl": 509.7628004923463, "learning_rate": 6.264177642707511e-05, "loss": 13.2029, "num_tokens": 11546771.0, "reward": 0.5818237066268921, "reward_std": 0.7111034393310547, "rewards/rollout_reward_func/mean": 0.5818237066268921, "rewards/rollout_reward_func/std": 0.7216598987579346, "sampling/importance_sampling_ratio/max": 1.6000864505767822, "sampling/importance_sampling_ratio/mean": 0.8269320726394653, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.62480163574219, "sampling/sampling_logp_difference/mean": 0.18712274730205536, "step": 267, "step_time": 11.289040325999395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1512.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 467.75, "completions/mean_terminated_length": 633.6875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7376525104045868, "epoch": 0.00536, "frac_reward_zero_std": 0.0, "grad_norm": 0.8371621370315552, "kl": 1.1368668004870415, "learning_rate": 6.257002049750467e-05, "loss": 0.0505, "num_tokens": 11591976.0, "reward": 0.35675281286239624, "reward_std": 0.7766343355178833, "rewards/rollout_reward_func/mean": 0.35675281286239624, "rewards/rollout_reward_func/std": 0.8013144731521606, "sampling/importance_sampling_ratio/max": 1.781150221824646, "sampling/importance_sampling_ratio/mean": 1.065082311630249, "sampling/importance_sampling_ratio/min": 0.7035311460494995, "sampling/sampling_logp_difference/max": 0.47119951248168945, "sampling/sampling_logp_difference/mean": 0.016271252185106277, "step": 268, "step_time": 13.580166332999852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1467.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 413.46875, "completions/mean_terminated_length": 336.95654296875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6913115046918392, "epoch": 0.00538, "frac_reward_zero_std": 0.0, "grad_norm": 0.6185923218727112, "kl": 2.3529683351516724, "learning_rate": 6.249796986954193e-05, "loss": -0.0764, "num_tokens": 11634990.0, "reward": 0.8016738891601562, "reward_std": 0.49405503273010254, "rewards/rollout_reward_func/mean": 0.8016738891601562, "rewards/rollout_reward_func/std": 0.49037879705429077, "sampling/importance_sampling_ratio/max": 2.2475059032440186, "sampling/importance_sampling_ratio/mean": 0.9425992965698242, "sampling/importance_sampling_ratio/min": 3.634556648314991e-19, "sampling/sampling_logp_difference/max": 38.15586471557617, "sampling/sampling_logp_difference/mean": 0.07170183956623077, "step": 269, "step_time": 13.457229844999347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 970.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 425.875, "completions/mean_terminated_length": 67.8888931274414, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9795157872140408, "epoch": 0.0054, "frac_reward_zero_std": 0.0, "grad_norm": 0.6997625827789307, "kl": 1.2407596856355667, "learning_rate": 6.242562557542721e-05, "loss": -0.2584, "num_tokens": 11680204.0, "reward": 0.6251269578933716, "reward_std": 0.5162147283554077, "rewards/rollout_reward_func/mean": 0.6251269578933716, "rewards/rollout_reward_func/std": 0.5264024138450623, "sampling/importance_sampling_ratio/max": 1.8723818063735962, "sampling/importance_sampling_ratio/mean": 0.7574995160102844, "sampling/importance_sampling_ratio/min": 7.107428244326848e-21, "sampling/sampling_logp_difference/max": 36.46747970581055, "sampling/sampling_logp_difference/mean": 0.17094120383262634, "step": 270, "step_time": 13.070817968000483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 240.0, "completions/mean_terminated_length": 90.58333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.933756910264492, "epoch": 0.00542, "frac_reward_zero_std": 0.0, "grad_norm": 1.0864405632019043, "kl": 2.271683469414711, "learning_rate": 6.235298865160802e-05, "loss": -0.1477, "num_tokens": 11717820.0, "reward": 0.6204566955566406, "reward_std": 0.5454114675521851, "rewards/rollout_reward_func/mean": 0.6204566955566406, "rewards/rollout_reward_func/std": 0.5328903198242188, "sampling/importance_sampling_ratio/max": 2.4538261890411377, "sampling/importance_sampling_ratio/mean": 0.9123595952987671, "sampling/importance_sampling_ratio/min": 2.653163630455654e-32, "sampling/sampling_logp_difference/max": 35.82770919799805, "sampling/sampling_logp_difference/mean": 0.17664334177970886, "step": 271, "step_time": 12.191392419000294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 400.34375, "completions/mean_terminated_length": 468.6000061035156, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.405089184641838, "epoch": 0.00544, "frac_reward_zero_std": 0.0, "grad_norm": 3.315850257873535, "kl": 1.811746746301651, "learning_rate": 6.228006013872432e-05, "loss": 0.1124, "num_tokens": 11761980.0, "reward": 0.6361057162284851, "reward_std": 0.6073228120803833, "rewards/rollout_reward_func/mean": 0.6361057162284851, "rewards/rollout_reward_func/std": 0.6237601041793823, "sampling/importance_sampling_ratio/max": 1.450617790222168, "sampling/importance_sampling_ratio/mean": 0.7723623514175415, "sampling/importance_sampling_ratio/min": 6.144576184474197e-17, "sampling/sampling_logp_difference/max": 17.61748695373535, "sampling/sampling_logp_difference/mean": 0.1221485286951065, "step": 272, "step_time": 13.684030693000295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 1015.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 392.28125, "completions/mean_terminated_length": 350.3333435058594, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 1.2637573182582855, "epoch": 0.00546, "frac_reward_zero_std": 0.25, "grad_norm": 1.0591007471084595, "kl": 1.2848418205976486, "learning_rate": 6.22068410815935e-05, "loss": -0.082, "num_tokens": 11804554.0, "reward": 0.4795195758342743, "reward_std": 0.5800077319145203, "rewards/rollout_reward_func/mean": 0.4795195758342743, "rewards/rollout_reward_func/std": 0.7278445959091187, "sampling/importance_sampling_ratio/max": 2.146909713745117, "sampling/importance_sampling_ratio/mean": 1.1118907928466797, "sampling/importance_sampling_ratio/min": 0.41008421778678894, "sampling/sampling_logp_difference/max": 0.567047119140625, "sampling/sampling_logp_difference/mean": 0.03169000893831253, "step": 273, "step_time": 12.611932855999385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 1629.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 450.5625, "completions/mean_terminated_length": 142.8000030517578, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.2026740312576294, "epoch": 0.00548, "frac_reward_zero_std": 0.0, "grad_norm": 1.6827574968338013, "kl": 1.319353125989437, "learning_rate": 6.21333325291955e-05, "loss": 0.3318, "num_tokens": 11849077.0, "reward": 0.05941234901547432, "reward_std": 0.9205846786499023, "rewards/rollout_reward_func/mean": 0.05941234901547432, "rewards/rollout_reward_func/std": 0.897829532623291, "sampling/importance_sampling_ratio/max": 2.514439821243286, "sampling/importance_sampling_ratio/mean": 1.042556881904602, "sampling/importance_sampling_ratio/min": 2.536263421504259e-09, "sampling/sampling_logp_difference/max": 9.659708023071289, "sampling/sampling_logp_difference/mean": 0.04706016927957535, "step": 274, "step_time": 15.88925072100119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 457.75, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.555177167057991, "epoch": 0.0055, "frac_reward_zero_std": 0.0, "grad_norm": 2.591740608215332, "kl": 1.932930089533329, "learning_rate": 6.20595355346577e-05, "loss": 0.3632, "num_tokens": 11893781.0, "reward": 0.03698612004518509, "reward_std": 0.967666506767273, "rewards/rollout_reward_func/mean": 0.03698612004518509, "rewards/rollout_reward_func/std": 0.950773298740387, "sampling/importance_sampling_ratio/max": 1.8177111148834229, "sampling/importance_sampling_ratio/mean": 0.9663249254226685, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.898632049560547, "sampling/sampling_logp_difference/mean": 0.04233323037624359, "step": 275, "step_time": 15.36014968200061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 1756.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 449.6875, "completions/mean_terminated_length": 30.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 1.4719903469085693, "epoch": 0.00552, "frac_reward_zero_std": 0.0, "grad_norm": 1.156265139579773, "kl": 1.2203484028577805, "learning_rate": 6.198545115523989e-05, "loss": -0.066, "num_tokens": 11938322.0, "reward": 0.22006705403327942, "reward_std": 0.9005897045135498, "rewards/rollout_reward_func/mean": 0.22006705403327942, "rewards/rollout_reward_func/std": 0.8880067467689514, "sampling/importance_sampling_ratio/max": 1.5426743030548096, "sampling/importance_sampling_ratio/mean": 0.9003181457519531, "sampling/importance_sampling_ratio/min": 0.00030529513605870306, "sampling/sampling_logp_difference/max": 5.06326961517334, "sampling/sampling_logp_difference/mean": 0.05061705410480499, "step": 276, "step_time": 16.85454968300064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 295.4375, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.6155288070440292, "epoch": 0.00554, "frac_reward_zero_std": 0.0, "grad_norm": 1.1301952600479126, "kl": 1.3682776391506195, "learning_rate": 6.191108045231912e-05, "loss": 0.1668, "num_tokens": 11977502.0, "reward": 0.4872196912765503, "reward_std": 0.7315820455551147, "rewards/rollout_reward_func/mean": 0.4872196912765503, "rewards/rollout_reward_func/std": 0.7631668448448181, "sampling/importance_sampling_ratio/max": 2.1793949604034424, "sampling/importance_sampling_ratio/mean": 0.9408875703811646, "sampling/importance_sampling_ratio/min": 1.3828326261844381e-09, "sampling/sampling_logp_difference/max": 11.191086769104004, "sampling/sampling_logp_difference/mean": 0.05766785517334938, "step": 277, "step_time": 13.89749083200104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 997.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 340.1875, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.7408141791820526, "epoch": 0.00556, "frac_reward_zero_std": 0.0, "grad_norm": 1.1473487615585327, "kl": 1.2665244936943054, "learning_rate": 6.183642449137449e-05, "loss": 0.0736, "num_tokens": 12018731.0, "reward": 0.356174111366272, "reward_std": 0.7368665933609009, "rewards/rollout_reward_func/mean": 0.356174111366272, "rewards/rollout_reward_func/std": 0.7954673767089844, "sampling/importance_sampling_ratio/max": 2.3125994205474854, "sampling/importance_sampling_ratio/mean": 0.9075043201446533, "sampling/importance_sampling_ratio/min": 1.3289812190336102e-22, "sampling/sampling_logp_difference/max": 23.370494842529297, "sampling/sampling_logp_difference/mean": 0.1203530877828598, "step": 278, "step_time": 13.447768376999647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 325.84375, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.6115550994873047, "epoch": 0.00558, "frac_reward_zero_std": 0.0, "grad_norm": 1.625993013381958, "kl": 1.4613315016031265, "learning_rate": 6.176148434197182e-05, "loss": -0.0232, "num_tokens": 12057481.0, "reward": 0.6645359992980957, "reward_std": 0.5565258264541626, "rewards/rollout_reward_func/mean": 0.6645359992980957, "rewards/rollout_reward_func/std": 0.5647537708282471, "sampling/importance_sampling_ratio/max": 2.6410555839538574, "sampling/importance_sampling_ratio/mean": 0.9931078553199768, "sampling/importance_sampling_ratio/min": 3.3173124956612043e-28, "sampling/sampling_logp_difference/max": 11.770353317260742, "sampling/sampling_logp_difference/mean": 0.07821263372898102, "step": 279, "step_time": 15.057088260999535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 941.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 297.5625, "completions/mean_terminated_length": 23.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 1.6022724658250809, "epoch": 0.0056, "frac_reward_zero_std": 0.0, "grad_norm": 1.1089247465133667, "kl": 1.3154785335063934, "learning_rate": 6.16862610777485e-05, "loss": -0.0858, "num_tokens": 12098155.0, "reward": 0.49577024579048157, "reward_std": 0.6782914400100708, "rewards/rollout_reward_func/mean": 0.49577024579048157, "rewards/rollout_reward_func/std": 0.6540427803993225, "sampling/importance_sampling_ratio/max": 2.0701098442077637, "sampling/importance_sampling_ratio/mean": 0.9826447367668152, "sampling/importance_sampling_ratio/min": 1.617099405848421e-05, "sampling/sampling_logp_difference/max": 4.9370832443237305, "sampling/sampling_logp_difference/mean": 0.04248109087347984, "step": 280, "step_time": 12.83039122199989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 1015.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 468.84375, "completions/mean_terminated_length": 609.0, "completions/min_length": 32.0, "completions/min_terminated_length": 609.0, "entropy": 1.7577238231897354, "epoch": 0.00562, "frac_reward_zero_std": 0.0, "grad_norm": 1.0655779838562012, "kl": 1.3566550314426422, "learning_rate": 6.161075577639782e-05, "loss": -0.0757, "num_tokens": 12144410.0, "reward": 0.5740200877189636, "reward_std": 0.7103265523910522, "rewards/rollout_reward_func/mean": 0.5740200877189636, "rewards/rollout_reward_func/std": 0.6818423867225647, "sampling/importance_sampling_ratio/max": 1.9912487268447876, "sampling/importance_sampling_ratio/mean": 0.9269586801528931, "sampling/importance_sampling_ratio/min": 0.36611220240592957, "sampling/sampling_logp_difference/max": 0.5099055767059326, "sampling/sampling_logp_difference/mean": 0.03832422196865082, "step": 281, "step_time": 14.740809193999667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 961.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 377.875, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.6036306470632553, "epoch": 0.00564, "frac_reward_zero_std": 0.0, "grad_norm": 1.4242982864379883, "kl": 1.2665131390094757, "learning_rate": 6.153496951965386e-05, "loss": -0.0188, "num_tokens": 12187304.0, "reward": 0.4991670846939087, "reward_std": 0.6721706390380859, "rewards/rollout_reward_func/mean": 0.4991670846939087, "rewards/rollout_reward_func/std": 0.6479058861732483, "sampling/importance_sampling_ratio/max": 1.9100453853607178, "sampling/importance_sampling_ratio/mean": 0.9740859270095825, "sampling/importance_sampling_ratio/min": 4.675242940184454e-13, "sampling/sampling_logp_difference/max": 7.345028877258301, "sampling/sampling_logp_difference/mean": 0.06884044408798218, "step": 282, "step_time": 13.4671803250003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 351.5625, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.74751615524292, "epoch": 0.00566, "frac_reward_zero_std": 0.0, "grad_norm": 1.0576722621917725, "kl": 1.3585702031850815, "learning_rate": 6.145890339327575e-05, "loss": 0.0422, "num_tokens": 12229152.0, "reward": 0.5601360201835632, "reward_std": 0.5831190347671509, "rewards/rollout_reward_func/mean": 0.5601360201835632, "rewards/rollout_reward_func/std": 0.5975658893585205, "sampling/importance_sampling_ratio/max": 1.652535080909729, "sampling/importance_sampling_ratio/mean": 0.8865768909454346, "sampling/importance_sampling_ratio/min": 0.47476717829704285, "sampling/sampling_logp_difference/max": 0.3411126136779785, "sampling/sampling_logp_difference/mean": 0.03651587292551994, "step": 283, "step_time": 13.222593508000045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 476.03125, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.648721307516098, "epoch": 0.00568, "frac_reward_zero_std": 0.0, "grad_norm": 1.4424277544021606, "kl": 1.3750703483819962, "learning_rate": 6.138255848703222e-05, "loss": -0.0492, "num_tokens": 12275048.0, "reward": 0.5890101194381714, "reward_std": 0.5385011434555054, "rewards/rollout_reward_func/mean": 0.5890101194381714, "rewards/rollout_reward_func/std": 0.539211630821228, "sampling/importance_sampling_ratio/max": 2.249671697616577, "sampling/importance_sampling_ratio/mean": 1.1014320850372314, "sampling/importance_sampling_ratio/min": 0.5493256449699402, "sampling/sampling_logp_difference/max": 0.5027457475662231, "sampling/sampling_logp_difference/mean": 0.036985620856285095, "step": 284, "step_time": 14.63291839600015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 1033.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 416.0625, "completions/mean_terminated_length": 479.5, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 1.604083389043808, "epoch": 0.0057, "frac_reward_zero_std": 0.0, "grad_norm": 1.3159347772598267, "kl": 1.4029508829116821, "learning_rate": 6.130593589468598e-05, "loss": 0.1057, "num_tokens": 12317243.0, "reward": 0.692679762840271, "reward_std": 0.4951537549495697, "rewards/rollout_reward_func/mean": 0.692679762840271, "rewards/rollout_reward_func/std": 0.4992617964744568, "sampling/importance_sampling_ratio/max": 2.004826545715332, "sampling/importance_sampling_ratio/mean": 1.0153238773345947, "sampling/importance_sampling_ratio/min": 0.4229728877544403, "sampling/sampling_logp_difference/max": 0.38543128967285156, "sampling/sampling_logp_difference/mean": 0.036722391843795776, "step": 285, "step_time": 13.539222654999321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 1031.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 455.6875, "completions/mean_terminated_length": 753.0, "completions/min_length": 32.0, "completions/min_terminated_length": 753.0, "entropy": 1.6817754656076431, "epoch": 0.00572, "frac_reward_zero_std": 0.0, "grad_norm": 1.2740130424499512, "kl": 1.35941182076931, "learning_rate": 6.122903671397803e-05, "loss": -0.1214, "num_tokens": 12363108.0, "reward": 0.5256404876708984, "reward_std": 0.6070274114608765, "rewards/rollout_reward_func/mean": 0.5256404876708984, "rewards/rollout_reward_func/std": 0.6017917990684509, "sampling/importance_sampling_ratio/max": 2.9226672649383545, "sampling/importance_sampling_ratio/mean": 1.0453054904937744, "sampling/importance_sampling_ratio/min": 0.2806028127670288, "sampling/sampling_logp_difference/max": 0.40816378593444824, "sampling/sampling_logp_difference/mean": 0.038322776556015015, "step": 286, "step_time": 14.419579556999906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 901.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 332.46875, "completions/mean_terminated_length": 839.0, "completions/min_length": 32.0, "completions/min_terminated_length": 839.0, "entropy": 1.6685753166675568, "epoch": 0.00574, "frac_reward_zero_std": 0.0, "grad_norm": 0.9662615060806274, "kl": 1.2091451436281204, "learning_rate": 6.115186204661191e-05, "loss": 0.0736, "num_tokens": 12402155.0, "reward": 0.7634872198104858, "reward_std": 0.4701966345310211, "rewards/rollout_reward_func/mean": 0.7634872198104858, "rewards/rollout_reward_func/std": 0.45423388481140137, "sampling/importance_sampling_ratio/max": 2.3031647205352783, "sampling/importance_sampling_ratio/mean": 0.9314023852348328, "sampling/importance_sampling_ratio/min": 0.3897619843482971, "sampling/sampling_logp_difference/max": 0.33718061447143555, "sampling/sampling_logp_difference/mean": 0.03690643608570099, "step": 287, "step_time": 13.032526026999676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 979.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 385.21875, "completions/mean_terminated_length": 29.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 1.4786080569028854, "epoch": 0.00576, "frac_reward_zero_std": 0.0, "grad_norm": 0.8810586333274841, "kl": 1.3366494476795197, "learning_rate": 6.1074412998238e-05, "loss": -0.0003, "num_tokens": 12443588.0, "reward": 0.6311408281326294, "reward_std": 0.5865784883499146, "rewards/rollout_reward_func/mean": 0.6311408281326294, "rewards/rollout_reward_func/std": 0.5772413611412048, "sampling/importance_sampling_ratio/max": 1.6377943754196167, "sampling/importance_sampling_ratio/mean": 0.905623197555542, "sampling/importance_sampling_ratio/min": 0.4572822153568268, "sampling/sampling_logp_difference/max": 0.7164381742477417, "sampling/sampling_logp_difference/mean": 0.035175248980522156, "step": 288, "step_time": 13.926471045999278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 1478.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 496.53125, "completions/mean_terminated_length": 1001.0, "completions/min_length": 32.0, "completions/min_terminated_length": 1001.0, "entropy": 1.6840482503175735, "epoch": 0.00578, "frac_reward_zero_std": 0.0, "grad_norm": 0.9257209300994873, "kl": 1.251721978187561, "learning_rate": 6.0996690678437596e-05, "loss": -0.0383, "num_tokens": 12490612.0, "reward": 0.5182172060012817, "reward_std": 0.5701149702072144, "rewards/rollout_reward_func/mean": 0.5182172060012817, "rewards/rollout_reward_func/std": 0.5551774501800537, "sampling/importance_sampling_ratio/max": 1.9926742315292358, "sampling/importance_sampling_ratio/mean": 0.8441840410232544, "sampling/importance_sampling_ratio/min": 2.1039876547684418e-17, "sampling/sampling_logp_difference/max": 13.610227584838867, "sampling/sampling_logp_difference/mean": 0.06268449127674103, "step": 289, "step_time": 15.352369717999409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 409.28125, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.7304475158452988, "epoch": 0.0058, "frac_reward_zero_std": 0.0, "grad_norm": 1.0736196041107178, "kl": 1.258081778883934, "learning_rate": 6.091869620070702e-05, "loss": 0.0988, "num_tokens": 12533779.0, "reward": 0.5688494443893433, "reward_std": 0.6204570531845093, "rewards/rollout_reward_func/mean": 0.5688494443893433, "rewards/rollout_reward_func/std": 0.6396588683128357, "sampling/importance_sampling_ratio/max": 2.8853752613067627, "sampling/importance_sampling_ratio/mean": 1.0570144653320312, "sampling/importance_sampling_ratio/min": 0.407880961894989, "sampling/sampling_logp_difference/max": 0.3973665237426758, "sampling/sampling_logp_difference/mean": 0.03513234108686447, "step": 290, "step_time": 13.672343009999622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 402.8125, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.556164637207985, "epoch": 0.00582, "frac_reward_zero_std": 0.25, "grad_norm": 0.8541924953460693, "kl": 1.37609501183033, "learning_rate": 6.084043068244173e-05, "loss": 0.2005, "num_tokens": 12575215.0, "reward": 0.7952468991279602, "reward_std": 0.36397647857666016, "rewards/rollout_reward_func/mean": 0.7952468991279602, "rewards/rollout_reward_func/std": 0.4331655204296112, "sampling/importance_sampling_ratio/max": 2.2204596996307373, "sampling/importance_sampling_ratio/mean": 1.0878386497497559, "sampling/importance_sampling_ratio/min": 0.4822189211845398, "sampling/sampling_logp_difference/max": 0.2646932601928711, "sampling/sampling_logp_difference/mean": 0.03318876400589943, "step": 291, "step_time": 12.407582866000212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 997.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 384.375, "completions/mean_terminated_length": 21.0, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 1.5174337178468704, "epoch": 0.00584, "frac_reward_zero_std": 0.0, "grad_norm": 1.1320741176605225, "kl": 1.2430267035961151, "learning_rate": 6.076189524492026e-05, "loss": -0.0555, "num_tokens": 12617880.0, "reward": 0.5619276165962219, "reward_std": 0.5554611086845398, "rewards/rollout_reward_func/mean": 0.5619276165962219, "rewards/rollout_reward_func/std": 0.5955392122268677, "sampling/importance_sampling_ratio/max": 2.186042547225952, "sampling/importance_sampling_ratio/mean": 0.9694903492927551, "sampling/importance_sampling_ratio/min": 4.443242536347099e-19, "sampling/sampling_logp_difference/max": 9.904940605163574, "sampling/sampling_logp_difference/mean": 0.061056092381477356, "step": 292, "step_time": 13.416809759000898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 469.65625, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.5607326328754425, "epoch": 0.00586, "frac_reward_zero_std": 0.25, "grad_norm": 1.1039814949035645, "kl": 1.2713143080472946, "learning_rate": 6.0683091013288175e-05, "loss": 0.0177, "num_tokens": 12663425.0, "reward": 0.6996991038322449, "reward_std": 0.43048518896102905, "rewards/rollout_reward_func/mean": 0.6996991038322449, "rewards/rollout_reward_func/std": 0.55036461353302, "sampling/importance_sampling_ratio/max": 2.5476691722869873, "sampling/importance_sampling_ratio/mean": 1.0416839122772217, "sampling/importance_sampling_ratio/min": 0.41075384616851807, "sampling/sampling_logp_difference/max": 0.40274715423583984, "sampling/sampling_logp_difference/mean": 0.03340768814086914, "step": 293, "step_time": 15.849380879000364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 425.46875, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.5173302441835403, "epoch": 0.00588, "frac_reward_zero_std": 0.0, "grad_norm": 0.8863282799720764, "kl": 1.2644618898630142, "learning_rate": 6.0604019116541925e-05, "loss": -0.0567, "num_tokens": 12707768.0, "reward": 0.7351365089416504, "reward_std": 0.506820559501648, "rewards/rollout_reward_func/mean": 0.7351365089416504, "rewards/rollout_reward_func/std": 0.5296523571014404, "sampling/importance_sampling_ratio/max": 1.7010269165039062, "sampling/importance_sampling_ratio/mean": 0.9410216808319092, "sampling/importance_sampling_ratio/min": 7.727933863034702e-23, "sampling/sampling_logp_difference/max": 13.173898696899414, "sampling/sampling_logp_difference/mean": 0.06413191556930542, "step": 294, "step_time": 14.916648064000583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 1952.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 491.03125, "completions/mean_terminated_length": 716.5, "completions/min_length": 32.0, "completions/min_terminated_length": 626.0, "entropy": 1.5511125475168228, "epoch": 0.0059, "frac_reward_zero_std": 0.0, "grad_norm": 1.1362706422805786, "kl": 1.3738711029291153, "learning_rate": 6.0524680687512715e-05, "loss": 0.3864, "num_tokens": 12753533.0, "reward": 0.5699449777603149, "reward_std": 0.5925971269607544, "rewards/rollout_reward_func/mean": 0.5699449777603149, "rewards/rollout_reward_func/std": 0.5817768573760986, "sampling/importance_sampling_ratio/max": 1.7104294300079346, "sampling/importance_sampling_ratio/mean": 0.9789931178092957, "sampling/importance_sampling_ratio/min": 0.46051037311553955, "sampling/sampling_logp_difference/max": 0.49524354934692383, "sampling/sampling_logp_difference/mean": 0.03280593827366829, "step": 295, "step_time": 16.327676473999873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 988.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 296.46875, "completions/mean_terminated_length": 880.0, "completions/min_length": 32.0, "completions/min_terminated_length": 880.0, "entropy": 1.6449557095766068, "epoch": 0.00592, "frac_reward_zero_std": 0.0, "grad_norm": 0.9376301169395447, "kl": 1.3798616975545883, "learning_rate": 6.0445076862850226e-05, "loss": 0.0058, "num_tokens": 12790547.0, "reward": 0.7329756021499634, "reward_std": 0.5218021273612976, "rewards/rollout_reward_func/mean": 0.7329756021499634, "rewards/rollout_reward_func/std": 0.5346468687057495, "sampling/importance_sampling_ratio/max": 1.7864199876785278, "sampling/importance_sampling_ratio/mean": 0.9625564217567444, "sampling/importance_sampling_ratio/min": 2.083764577716623e-16, "sampling/sampling_logp_difference/max": 10.337621688842773, "sampling/sampling_logp_difference/mean": 0.06091579049825668, "step": 296, "step_time": 12.234243394999794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 979.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 467.34375, "completions/mean_terminated_length": 618.0, "completions/min_length": 32.0, "completions/min_terminated_length": 580.0, "entropy": 1.575576052069664, "epoch": 0.00594, "frac_reward_zero_std": 0.25, "grad_norm": 0.868667483329773, "kl": 1.2338551133871078, "learning_rate": 6.036520878300637e-05, "loss": 0.1165, "num_tokens": 12836263.0, "reward": 0.639566957950592, "reward_std": 0.4129408001899719, "rewards/rollout_reward_func/mean": 0.639566957950592, "rewards/rollout_reward_func/std": 0.6185584664344788, "sampling/importance_sampling_ratio/max": 1.5771390199661255, "sampling/importance_sampling_ratio/mean": 0.937774121761322, "sampling/importance_sampling_ratio/min": 0.45420682430267334, "sampling/sampling_logp_difference/max": 0.767693042755127, "sampling/sampling_logp_difference/mean": 0.033158302307128906, "step": 297, "step_time": 13.472735053000179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 1015.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 475.0625, "completions/mean_terminated_length": 9.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 1.8637316226959229, "epoch": 0.00596, "frac_reward_zero_std": 0.0, "grad_norm": 1.1221648454666138, "kl": 1.4032704830169678, "learning_rate": 6.028507759221894e-05, "loss": -0.0264, "num_tokens": 12882515.0, "reward": 0.5061816573143005, "reward_std": 0.6195111274719238, "rewards/rollout_reward_func/mean": 0.5061816573143005, "rewards/rollout_reward_func/std": 0.6889325976371765, "sampling/importance_sampling_ratio/max": 2.849435806274414, "sampling/importance_sampling_ratio/mean": 0.9369213581085205, "sampling/importance_sampling_ratio/min": 0.45871809124946594, "sampling/sampling_logp_difference/max": 0.6026906967163086, "sampling/sampling_logp_difference/mean": 0.03729845583438873, "step": 298, "step_time": 14.327750060999279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 1033.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 348.8125, "completions/mean_terminated_length": 888.0, "completions/min_length": 32.0, "completions/min_terminated_length": 888.0, "entropy": 1.7520618289709091, "epoch": 0.00598, "frac_reward_zero_std": 0.25, "grad_norm": 1.0526667833328247, "kl": 1.1753466725349426, "learning_rate": 6.0204684438495194e-05, "loss": -0.055, "num_tokens": 12922702.0, "reward": 0.4194687604904175, "reward_std": 0.5888150930404663, "rewards/rollout_reward_func/mean": 0.4194687604904175, "rewards/rollout_reward_func/std": 0.7677082419395447, "sampling/importance_sampling_ratio/max": 2.7621238231658936, "sampling/importance_sampling_ratio/mean": 1.0072945356369019, "sampling/importance_sampling_ratio/min": 4.6322558504074155e-28, "sampling/sampling_logp_difference/max": 13.810802459716797, "sampling/sampling_logp_difference/mean": 0.08528067916631699, "step": 299, "step_time": 12.772198099999969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 206.78125, "completions/mean_terminated_length": 0.0, "completions/min_length": 32.0, "completions/min_terminated_length": 0.0, "entropy": 1.8439958095550537, "epoch": 0.006, "frac_reward_zero_std": 0.0, "grad_norm": 1.1286392211914062, "kl": 1.3561551719903946, "learning_rate": 6.012403047359545e-05, "loss": -0.0446, "num_tokens": 12957069.0, "reward": 0.7022931575775146, "reward_std": 0.5971763730049133, "rewards/rollout_reward_func/mean": 0.7022931575775146, "rewards/rollout_reward_func/std": 0.6030493974685669, "sampling/importance_sampling_ratio/max": 2.706031322479248, "sampling/importance_sampling_ratio/mean": 1.124272108078003, "sampling/importance_sampling_ratio/min": 0.40633055567741394, "sampling/sampling_logp_difference/max": 0.465440034866333, "sampling/sampling_logp_difference/mean": 0.04173918813467026, "step": 300, "step_time": 12.526091776000158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 1006.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 582.15625, "completions/mean_terminated_length": 669.75, "completions/min_length": 32.0, "completions/min_terminated_length": 479.0, "entropy": 1.9109542071819305, "epoch": 0.00602, "frac_reward_zero_std": 0.25, "grad_norm": 1.0298231840133667, "kl": 1.2945332452654839, "learning_rate": 6.004311685301655e-05, "loss": 0.0342, "num_tokens": 13007770.0, "reward": 0.6029155850410461, "reward_std": 0.4511142671108246, "rewards/rollout_reward_func/mean": 0.6029155850410461, "rewards/rollout_reward_func/std": 0.578252375125885, "sampling/importance_sampling_ratio/max": 1.7694451808929443, "sampling/importance_sampling_ratio/mean": 0.8985217809677124, "sampling/importance_sampling_ratio/min": 5.5123115273725265e-14, "sampling/sampling_logp_difference/max": 6.568675518035889, "sampling/sampling_logp_difference/mean": 0.057860761880874634, "step": 301, "step_time": 13.774676660000296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 988.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 536.03125, "completions/mean_terminated_length": 679.5, "completions/min_length": 32.0, "completions/min_terminated_length": 598.0, "entropy": 2.14649461209774, "epoch": 0.00604, "frac_reward_zero_std": 0.0, "grad_norm": 1.2746901512145996, "kl": 1.4187272638082504, "learning_rate": 5.9961944735975327e-05, "loss": 0.1392, "num_tokens": 13056578.0, "reward": 0.696951687335968, "reward_std": 0.5142950415611267, "rewards/rollout_reward_func/mean": 0.696951687335968, "rewards/rollout_reward_func/std": 0.4923655390739441, "sampling/importance_sampling_ratio/max": 2.545111656188965, "sampling/importance_sampling_ratio/mean": 0.9123826026916504, "sampling/importance_sampling_ratio/min": 2.7804312671686482e-11, "sampling/sampling_logp_difference/max": 11.366047859191895, "sampling/sampling_logp_difference/mean": 0.060925938189029694, "step": 302, "step_time": 14.233163184000205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 1006.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 427.5625, "completions/mean_terminated_length": 411.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 2.1621054857969284, "epoch": 0.00606, "frac_reward_zero_std": 0.0, "grad_norm": 0.959158718585968, "kl": 1.4765708148479462, "learning_rate": 5.988051528539199e-05, "loss": -0.0947, "num_tokens": 13101062.0, "reward": 0.6654813289642334, "reward_std": 0.5374094843864441, "rewards/rollout_reward_func/mean": 0.6654813289642334, "rewards/rollout_reward_func/std": 0.5633363723754883, "sampling/importance_sampling_ratio/max": 1.9851253032684326, "sampling/importance_sampling_ratio/mean": 0.9064241647720337, "sampling/importance_sampling_ratio/min": 0.3265509605407715, "sampling/sampling_logp_difference/max": 0.48175716400146484, "sampling/sampling_logp_difference/mean": 0.04202505201101303, "step": 303, "step_time": 13.689876930000537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 1006.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 438.25, "completions/mean_terminated_length": 463.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 2.2263659834861755, "epoch": 0.00608, "frac_reward_zero_std": 0.0, "grad_norm": 1.176809549331665, "kl": 1.5984282493591309, "learning_rate": 5.979882966787343e-05, "loss": 0.0761, "num_tokens": 13146221.0, "reward": 0.7058923840522766, "reward_std": 0.5788325071334839, "rewards/rollout_reward_func/mean": 0.7058923840522766, "rewards/rollout_reward_func/std": 0.5953075885772705, "sampling/importance_sampling_ratio/max": 1.6555259227752686, "sampling/importance_sampling_ratio/mean": 0.9600812792778015, "sampling/importance_sampling_ratio/min": 0.29721322655677795, "sampling/sampling_logp_difference/max": 0.5891098976135254, "sampling/sampling_logp_difference/mean": 0.04064274579286575, "step": 304, "step_time": 12.968028083999798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 1883.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 324.28125, "completions/mean_terminated_length": 701.5, "completions/min_length": 32.0, "completions/min_terminated_length": 580.0, "entropy": 2.1908774375915527, "epoch": 0.0061, "frac_reward_zero_std": 0.0, "grad_norm": 1.1114805936813354, "kl": 1.562452718615532, "learning_rate": 5.971688905369659e-05, "loss": 0.0149, "num_tokens": 13186005.0, "reward": 0.6469302177429199, "reward_std": 0.6624607443809509, "rewards/rollout_reward_func/mean": 0.6469302177429199, "rewards/rollout_reward_func/std": 0.6555702686309814, "sampling/importance_sampling_ratio/max": 2.4671170711517334, "sampling/importance_sampling_ratio/mean": 0.9721499681472778, "sampling/importance_sampling_ratio/min": 1.355046499480171e-17, "sampling/sampling_logp_difference/max": 19.160154342651367, "sampling/sampling_logp_difference/mean": 0.11085817217826843, "step": 305, "step_time": 16.222092960000282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 979.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 418.71875, "completions/mean_terminated_length": 650.7142944335938, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 2.175156131386757, "epoch": 0.00612, "frac_reward_zero_std": 0.25, "grad_norm": 1.3451061248779297, "kl": 1.4554658979177475, "learning_rate": 5.9634694616791595e-05, "loss": -0.2293, "num_tokens": 13230166.0, "reward": 0.6040306091308594, "reward_std": 0.5063743591308594, "rewards/rollout_reward_func/mean": 0.6040306091308594, "rewards/rollout_reward_func/std": 0.627680242061615, "sampling/importance_sampling_ratio/max": 2.9138834476470947, "sampling/importance_sampling_ratio/mean": 1.1110565662384033, "sampling/importance_sampling_ratio/min": 0.3532570004463196, "sampling/sampling_logp_difference/max": 0.4842567443847656, "sampling/sampling_logp_difference/mean": 0.043274201452732086, "step": 306, "step_time": 12.660607355000138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 961.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 347.46875, "completions/mean_terminated_length": 699.0, "completions/min_length": 32.0, "completions/min_terminated_length": 493.0, "entropy": 2.201367035508156, "epoch": 0.00614, "frac_reward_zero_std": 0.25, "grad_norm": 0.8696091771125793, "kl": 1.5525203794240952, "learning_rate": 5.955224753472502e-05, "loss": 0.0307, "num_tokens": 13271823.0, "reward": 0.46905237436294556, "reward_std": 0.49651390314102173, "rewards/rollout_reward_func/mean": 0.46905237436294556, "rewards/rollout_reward_func/std": 0.6455743312835693, "sampling/importance_sampling_ratio/max": 1.6154719591140747, "sampling/importance_sampling_ratio/mean": 0.8586403131484985, "sampling/importance_sampling_ratio/min": 4.1075417259594546e-26, "sampling/sampling_logp_difference/max": 22.316316604614258, "sampling/sampling_logp_difference/mean": 0.08881887048482895, "step": 307, "step_time": 12.978106795000258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 1033.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 372.25, "completions/mean_terminated_length": 452.5555725097656, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 2.0461243242025375, "epoch": 0.00616, "frac_reward_zero_std": 0.25, "grad_norm": 1.175984263420105, "kl": 1.7427055835723877, "learning_rate": 5.9469548988683e-05, "loss": 0.2454, "num_tokens": 13313157.0, "reward": 0.8288477659225464, "reward_std": 0.3498895764350891, "rewards/rollout_reward_func/mean": 0.8288477659225464, "rewards/rollout_reward_func/std": 0.404166579246521, "sampling/importance_sampling_ratio/max": 1.7835373878479004, "sampling/importance_sampling_ratio/mean": 0.8986623883247375, "sampling/importance_sampling_ratio/min": 2.4318873737688707e-12, "sampling/sampling_logp_difference/max": 10.346508979797363, "sampling/sampling_logp_difference/mean": 0.06272260844707489, "step": 308, "step_time": 12.176779116000034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 553.0, "completions/mean_terminated_length": 619.3846435546875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 2.213667869567871, "epoch": 0.00618, "frac_reward_zero_std": 0.0, "grad_norm": 1.1547057628631592, "kl": 1.409785121679306, "learning_rate": 5.938660016345423e-05, "loss": 0.1555, "num_tokens": 13361966.0, "reward": 0.3531684875488281, "reward_std": 0.7409747838973999, "rewards/rollout_reward_func/mean": 0.3531684875488281, "rewards/rollout_reward_func/std": 0.7137225270271301, "sampling/importance_sampling_ratio/max": 1.7193982601165771, "sampling/importance_sampling_ratio/mean": 1.0088361501693726, "sampling/importance_sampling_ratio/min": 4.714482927312136e-13, "sampling/sampling_logp_difference/max": 8.110475540161133, "sampling/sampling_logp_difference/mean": 0.0610441118478775, "step": 309, "step_time": 15.343923702999746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 1015.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 436.46875, "completions/mean_terminated_length": 366.0000305175781, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 2.090760737657547, "epoch": 0.0062, "frac_reward_zero_std": 0.0, "grad_norm": 1.0569647550582886, "kl": 1.6135514676570892, "learning_rate": 5.930340224741313e-05, "loss": 0.2032, "num_tokens": 13406173.0, "reward": 0.6266924142837524, "reward_std": 0.5103944540023804, "rewards/rollout_reward_func/mean": 0.6266924142837524, "rewards/rollout_reward_func/std": 0.5242377519607544, "sampling/importance_sampling_ratio/max": 1.5259442329406738, "sampling/importance_sampling_ratio/mean": 0.7855511903762817, "sampling/importance_sampling_ratio/min": 4.5165660220083254e-14, "sampling/sampling_logp_difference/max": 8.713134765625, "sampling/sampling_logp_difference/mean": 0.08065584301948547, "step": 310, "step_time": 12.869390595998539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 1688.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 508.59375, "completions/mean_terminated_length": 655.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.0209807604551315, "epoch": 0.00622, "frac_reward_zero_std": 0.0, "grad_norm": 1.0715346336364746, "kl": 1.344336986541748, "learning_rate": 5.92199564325027e-05, "loss": 0.1541, "num_tokens": 13452802.0, "reward": 0.6392390727996826, "reward_std": 0.6024003624916077, "rewards/rollout_reward_func/mean": 0.6392390727996826, "rewards/rollout_reward_func/std": 0.6178409457206726, "sampling/importance_sampling_ratio/max": 2.4571316242218018, "sampling/importance_sampling_ratio/mean": 0.9618554711341858, "sampling/importance_sampling_ratio/min": 0.3573444187641144, "sampling/sampling_logp_difference/max": 0.40203285217285156, "sampling/sampling_logp_difference/mean": 0.0398002490401268, "step": 311, "step_time": 14.81664649000004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1828.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 403.78125, "completions/mean_terminated_length": 404.8333435058594, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.0663836896419525, "epoch": 0.00624, "frac_reward_zero_std": 0.0, "grad_norm": 1.047993779182434, "kl": 1.3759985566139221, "learning_rate": 5.913626391421747e-05, "loss": 0.1686, "num_tokens": 13494809.0, "reward": 0.4545784592628479, "reward_std": 0.7400583028793335, "rewards/rollout_reward_func/mean": 0.4545784592628479, "rewards/rollout_reward_func/std": 0.721449077129364, "sampling/importance_sampling_ratio/max": 1.884018063545227, "sampling/importance_sampling_ratio/mean": 0.9143320918083191, "sampling/importance_sampling_ratio/min": 8.761638926180334e-16, "sampling/sampling_logp_difference/max": 14.427348136901855, "sampling/sampling_logp_difference/mean": 0.09081155061721802, "step": 312, "step_time": 15.455757763001202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 556.59375, "completions/mean_terminated_length": 675.0625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.8435325771570206, "epoch": 0.00626, "frac_reward_zero_std": 0.25, "grad_norm": 0.7694547772407532, "kl": 1.399752140045166, "learning_rate": 5.9052325891586445e-05, "loss": -0.1426, "num_tokens": 13544327.0, "reward": 0.7297097444534302, "reward_std": 0.4513795077800751, "rewards/rollout_reward_func/mean": 0.7297097444534302, "rewards/rollout_reward_func/std": 0.5397657155990601, "sampling/importance_sampling_ratio/max": 1.6712501049041748, "sampling/importance_sampling_ratio/mean": 0.8220624923706055, "sampling/importance_sampling_ratio/min": 2.7606159003733255e-14, "sampling/sampling_logp_difference/max": 8.875541687011719, "sampling/sampling_logp_difference/mean": 0.058046624064445496, "step": 313, "step_time": 18.002680855001017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 408.375, "completions/mean_terminated_length": 468.5000305175781, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 1.7957092076539993, "epoch": 0.00628, "frac_reward_zero_std": 0.75, "grad_norm": 0.4280748665332794, "kl": 1.1955655068159103, "learning_rate": 5.8968143567155826e-05, "loss": 0.1797, "num_tokens": 13586573.0, "reward": 0.8449472784996033, "reward_std": 0.22370442748069763, "rewards/rollout_reward_func/mean": 0.8449472784996033, "rewards/rollout_reward_func/std": 0.505226194858551, "sampling/importance_sampling_ratio/max": 1.5105870962142944, "sampling/importance_sampling_ratio/mean": 1.0147426128387451, "sampling/importance_sampling_ratio/min": 0.41671618819236755, "sampling/sampling_logp_difference/max": 0.2964317798614502, "sampling/sampling_logp_difference/mean": 0.036489006131887436, "step": 314, "step_time": 14.342760653000369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 3246.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 458.34375, "completions/mean_terminated_length": 432.0714416503906, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 1.8831585794687271, "epoch": 0.0063, "frac_reward_zero_std": 0.5, "grad_norm": 1.0025805234909058, "kl": 1.1863482594490051, "learning_rate": 5.888371814697185e-05, "loss": 0.5149, "num_tokens": 13629998.0, "reward": 0.7728590369224548, "reward_std": 0.32343482971191406, "rewards/rollout_reward_func/mean": 0.7728590369224548, "rewards/rollout_reward_func/std": 0.49647581577301025, "sampling/importance_sampling_ratio/max": 1.8183441162109375, "sampling/importance_sampling_ratio/mean": 0.8610658645629883, "sampling/importance_sampling_ratio/min": 6.355002244926022e-13, "sampling/sampling_logp_difference/max": 10.727499961853027, "sampling/sampling_logp_difference/mean": 0.05741002410650253, "step": 315, "step_time": 19.33523995500036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 575.8125, "completions/mean_terminated_length": 567.26318359375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 1.9129594266414642, "epoch": 0.00632, "frac_reward_zero_std": 0.0, "grad_norm": 0.8137322664260864, "kl": 1.2814254313707352, "learning_rate": 5.8799050840563466e-05, "loss": 0.2635, "num_tokens": 13680239.0, "reward": 0.5655438899993896, "reward_std": 0.7607361078262329, "rewards/rollout_reward_func/mean": 0.5655438899993896, "rewards/rollout_reward_func/std": 0.7889004349708557, "sampling/importance_sampling_ratio/max": 1.6158668994903564, "sampling/importance_sampling_ratio/mean": 0.9425360560417175, "sampling/importance_sampling_ratio/min": 0.008557315915822983, "sampling/sampling_logp_difference/max": 5.890020847320557, "sampling/sampling_logp_difference/mean": 0.0476326122879982, "step": 316, "step_time": 14.391424087000814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 380.75, "completions/mean_terminated_length": 490.0909118652344, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 1.9316744804382324, "epoch": 0.00634, "frac_reward_zero_std": 0.25, "grad_norm": 0.8405368328094482, "kl": 1.4428828805685043, "learning_rate": 5.871414286092507e-05, "loss": 0.1305, "num_tokens": 13721345.0, "reward": 0.8654934167861938, "reward_std": 0.31479746103286743, "rewards/rollout_reward_func/mean": 0.8654934167861938, "rewards/rollout_reward_func/std": 0.36156710982322693, "sampling/importance_sampling_ratio/max": 1.9625356197357178, "sampling/importance_sampling_ratio/mean": 0.9455552697181702, "sampling/importance_sampling_ratio/min": 2.0487216335486304e-16, "sampling/sampling_logp_difference/max": 11.719476699829102, "sampling/sampling_logp_difference/mean": 0.06652772426605225, "step": 317, "step_time": 13.954184503999386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 481.71875, "completions/mean_terminated_length": 478.4761962890625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 1.8650391697883606, "epoch": 0.00636, "frac_reward_zero_std": 0.25, "grad_norm": 0.5286858677864075, "kl": 1.2741876691579819, "learning_rate": 5.862899542449904e-05, "loss": 0.0551, "num_tokens": 13767430.0, "reward": 0.7429414987564087, "reward_std": 0.45704787969589233, "rewards/rollout_reward_func/mean": 0.7429414987564087, "rewards/rollout_reward_func/std": 0.5715904831886292, "sampling/importance_sampling_ratio/max": 1.9098185300827026, "sampling/importance_sampling_ratio/mean": 0.9631974697113037, "sampling/importance_sampling_ratio/min": 2.6884480579925487e-15, "sampling/sampling_logp_difference/max": 12.997298240661621, "sampling/sampling_logp_difference/mean": 0.06052371859550476, "step": 318, "step_time": 13.862813988000198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 576.875, "completions/mean_terminated_length": 593.1666870117188, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.0289696753025055, "epoch": 0.00638, "frac_reward_zero_std": 0.0, "grad_norm": 1.2149608135223389, "kl": 1.3769783228635788, "learning_rate": 5.85436097511584e-05, "loss": 0.1766, "num_tokens": 13815918.0, "reward": 0.6568731069564819, "reward_std": 0.7245488166809082, "rewards/rollout_reward_func/mean": 0.6568731069564819, "rewards/rollout_reward_func/std": 0.6916787624359131, "sampling/importance_sampling_ratio/max": 2.1425058841705322, "sampling/importance_sampling_ratio/mean": 0.9090965986251831, "sampling/importance_sampling_ratio/min": 1.2975231999945436e-09, "sampling/sampling_logp_difference/max": 5.508400917053223, "sampling/sampling_logp_difference/mean": 0.054766349494457245, "step": 319, "step_time": 16.026848456000153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 533.03125, "completions/mean_terminated_length": 611.9199829101562, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 2.0916898250579834, "epoch": 0.0064, "frac_reward_zero_std": 0.0, "grad_norm": 0.9412707686424255, "kl": 1.3568750023841858, "learning_rate": 5.845798706418925e-05, "loss": 0.2268, "num_tokens": 13863834.0, "reward": 0.5841221809387207, "reward_std": 0.6364234685897827, "rewards/rollout_reward_func/mean": 0.5841221809387207, "rewards/rollout_reward_func/std": 0.6685865521430969, "sampling/importance_sampling_ratio/max": 1.7635217905044556, "sampling/importance_sampling_ratio/mean": 0.9275067448616028, "sampling/importance_sampling_ratio/min": 1.0298627755314872e-17, "sampling/sampling_logp_difference/max": 9.309558868408203, "sampling/sampling_logp_difference/mean": 0.0974733978509903, "step": 320, "step_time": 14.812958155998786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 359.6875, "completions/mean_terminated_length": 413.727294921875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.7971670180559158, "epoch": 0.00642, "frac_reward_zero_std": 0.0, "grad_norm": 1.068855881690979, "kl": 1.3647612631320953, "learning_rate": 5.837212859027332e-05, "loss": 0.0924, "num_tokens": 13903067.0, "reward": 0.5303407907485962, "reward_std": 0.8057674169540405, "rewards/rollout_reward_func/mean": 0.5303407907485962, "rewards/rollout_reward_func/std": 0.7956884503364563, "sampling/importance_sampling_ratio/max": 1.7413064241409302, "sampling/importance_sampling_ratio/mean": 0.9779105186462402, "sampling/importance_sampling_ratio/min": 5.18334576649071e-11, "sampling/sampling_logp_difference/max": 12.970199584960938, "sampling/sampling_logp_difference/mean": 0.0653417557477951, "step": 321, "step_time": 13.172426044998247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 979.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 381.09375, "completions/mean_terminated_length": 356.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.6524961590766907, "epoch": 0.00644, "frac_reward_zero_std": 0.0, "grad_norm": 1.015548825263977, "kl": 1.440118283033371, "learning_rate": 5.8286035559470364e-05, "loss": 0.0466, "num_tokens": 13945067.0, "reward": 0.6828256845474243, "reward_std": 0.6917945146560669, "rewards/rollout_reward_func/mean": 0.6828256845474243, "rewards/rollout_reward_func/std": 0.6975859999656677, "sampling/importance_sampling_ratio/max": 1.8103212118148804, "sampling/importance_sampling_ratio/mean": 0.942000687122345, "sampling/importance_sampling_ratio/min": 0.450409859418869, "sampling/sampling_logp_difference/max": 0.7783463001251221, "sampling/sampling_logp_difference/mean": 0.03753424063324928, "step": 322, "step_time": 12.30265591600164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 443.5, "completions/mean_terminated_length": 480.66668701171875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.870685800909996, "epoch": 0.00646, "frac_reward_zero_std": 0.0, "grad_norm": 1.277451753616333, "kl": 1.482536718249321, "learning_rate": 5.819970920520053e-05, "loss": 0.0696, "num_tokens": 13988428.0, "reward": 0.6857200860977173, "reward_std": 0.6880486011505127, "rewards/rollout_reward_func/mean": 0.6857200860977173, "rewards/rollout_reward_func/std": 0.6916353702545166, "sampling/importance_sampling_ratio/max": 1.98198664188385, "sampling/importance_sampling_ratio/mean": 1.060497522354126, "sampling/importance_sampling_ratio/min": 0.5108248591423035, "sampling/sampling_logp_difference/max": 0.3667895793914795, "sampling/sampling_logp_difference/mean": 0.04019013047218323, "step": 323, "step_time": 14.721898110001348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 416.0, "completions/mean_terminated_length": 454.125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.6221411526203156, "epoch": 0.00648, "frac_reward_zero_std": 0.25, "grad_norm": 0.987755537033081, "kl": 1.3540777266025543, "learning_rate": 5.811315076422669e-05, "loss": 0.0513, "num_tokens": 14031940.0, "reward": 0.8095857501029968, "reward_std": 0.4735613763332367, "rewards/rollout_reward_func/mean": 0.8095857501029968, "rewards/rollout_reward_func/std": 0.537667989730835, "sampling/importance_sampling_ratio/max": 2.3065409660339355, "sampling/importance_sampling_ratio/mean": 1.0315005779266357, "sampling/importance_sampling_ratio/min": 2.309412697968499e-13, "sampling/sampling_logp_difference/max": 11.065608024597168, "sampling/sampling_logp_difference/mean": 0.06375093758106232, "step": 324, "step_time": 13.122290125999825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 522.21875, "completions/mean_terminated_length": 613.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.7475366294384003, "epoch": 0.0065, "frac_reward_zero_std": 0.0, "grad_norm": 0.8789039850234985, "kl": 1.3749408572912216, "learning_rate": 5.802636147663675e-05, "loss": 0.0133, "num_tokens": 14078270.0, "reward": 0.8002352118492126, "reward_std": 0.4968862235546112, "rewards/rollout_reward_func/mean": 0.8002352118492126, "rewards/rollout_reward_func/std": 0.49343377351760864, "sampling/importance_sampling_ratio/max": 1.9066412448883057, "sampling/importance_sampling_ratio/mean": 0.9309936761856079, "sampling/importance_sampling_ratio/min": 9.68465099782101e-11, "sampling/sampling_logp_difference/max": 20.859172821044922, "sampling/sampling_logp_difference/mean": 0.057179760187864304, "step": 325, "step_time": 14.316074967999157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1857.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 540.03125, "completions/mean_terminated_length": 581.2142944335938, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 1.7036107927560806, "epoch": 0.00652, "frac_reward_zero_std": 0.0, "grad_norm": 0.9445482492446899, "kl": 1.4076658189296722, "learning_rate": 5.793934258582585e-05, "loss": 0.3191, "num_tokens": 14125826.0, "reward": 0.48526278138160706, "reward_std": 0.6844392418861389, "rewards/rollout_reward_func/mean": 0.48526278138160706, "rewards/rollout_reward_func/std": 0.6783384680747986, "sampling/importance_sampling_ratio/max": 1.7018033266067505, "sampling/importance_sampling_ratio/mean": 1.0237257480621338, "sampling/importance_sampling_ratio/min": 0.4592413902282715, "sampling/sampling_logp_difference/max": 0.3801875114440918, "sampling/sampling_logp_difference/mean": 0.03654684126377106, "step": 326, "step_time": 14.69510712400188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 366.0625, "completions/mean_terminated_length": 374.23077392578125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.6721143871545792, "epoch": 0.00654, "frac_reward_zero_std": 0.25, "grad_norm": 0.8105725049972534, "kl": 1.4074698984622955, "learning_rate": 5.785209533847856e-05, "loss": 0.084, "num_tokens": 14168007.0, "reward": 0.7354402542114258, "reward_std": 0.44130727648735046, "rewards/rollout_reward_func/mean": 0.7354402542114258, "rewards/rollout_reward_func/std": 0.5291643142700195, "sampling/importance_sampling_ratio/max": 1.7749887704849243, "sampling/importance_sampling_ratio/mean": 1.0455050468444824, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.504716873168945, "sampling/sampling_logp_difference/mean": 0.08531798422336578, "step": 327, "step_time": 12.016022490999603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 625.34375, "completions/mean_terminated_length": 735.2222290039062, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 1.7709321230649948, "epoch": 0.00656, "frac_reward_zero_std": 0.0, "grad_norm": 1.4988988637924194, "kl": 1.504413202404976, "learning_rate": 5.776462098455102e-05, "loss": 0.1201, "num_tokens": 14219135.0, "reward": 0.7354245781898499, "reward_std": 0.510002851486206, "rewards/rollout_reward_func/mean": 0.7354245781898499, "rewards/rollout_reward_func/std": 0.5289419889450073, "sampling/importance_sampling_ratio/max": 2.368122100830078, "sampling/importance_sampling_ratio/mean": 1.0833089351654053, "sampling/importance_sampling_ratio/min": 3.313498066220899e-25, "sampling/sampling_logp_difference/max": 29.867210388183594, "sampling/sampling_logp_difference/mean": 0.08251447975635529, "step": 328, "step_time": 15.542663511000683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1871.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 636.1875, "completions/mean_terminated_length": 835.6190795898438, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 1.7749462872743607, "epoch": 0.00658, "frac_reward_zero_std": 0.0, "grad_norm": 0.705107569694519, "kl": 1.16939178109169, "learning_rate": 5.7676920777253036e-05, "loss": -0.0362, "num_tokens": 14269674.0, "reward": 0.6230120658874512, "reward_std": 0.7078288793563843, "rewards/rollout_reward_func/mean": 0.6230120658874512, "rewards/rollout_reward_func/std": 0.7040373086929321, "sampling/importance_sampling_ratio/max": 1.7733280658721924, "sampling/importance_sampling_ratio/mean": 0.9775888323783875, "sampling/importance_sampling_ratio/min": 0.296292245388031, "sampling/sampling_logp_difference/max": 0.5219392776489258, "sampling/sampling_logp_difference/mean": 0.03924380987882614, "step": 329, "step_time": 15.96270235999782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 317.4375, "completions/mean_terminated_length": 394.9090881347656, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.8771961331367493, "epoch": 0.0066, "frac_reward_zero_std": 0.0, "grad_norm": 0.947878360748291, "kl": 1.5240257382392883, "learning_rate": 5.758899597303015e-05, "loss": -0.0047, "num_tokens": 14308398.0, "reward": 0.5859133005142212, "reward_std": 0.6832065582275391, "rewards/rollout_reward_func/mean": 0.5859133005142212, "rewards/rollout_reward_func/std": 0.7161737680435181, "sampling/importance_sampling_ratio/max": 1.7864612340927124, "sampling/importance_sampling_ratio/mean": 0.9574754238128662, "sampling/importance_sampling_ratio/min": 0.4319297671318054, "sampling/sampling_logp_difference/max": 0.5423250198364258, "sampling/sampling_logp_difference/mean": 0.03944649174809456, "step": 330, "step_time": 12.219806164000147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 911.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 427.09375, "completions/mean_terminated_length": 460.54547119140625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 1.7511957734823227, "epoch": 0.00662, "frac_reward_zero_std": 0.0, "grad_norm": 0.7596840262413025, "kl": 1.3348423466086388, "learning_rate": 5.7500847831545565e-05, "loss": 0.0454, "num_tokens": 14352301.0, "reward": 0.6511640548706055, "reward_std": 0.7161141037940979, "rewards/rollout_reward_func/mean": 0.6511640548706055, "rewards/rollout_reward_func/std": 0.701266884803772, "sampling/importance_sampling_ratio/max": 1.8782095909118652, "sampling/importance_sampling_ratio/mean": 0.9354586005210876, "sampling/importance_sampling_ratio/min": 0.3601326644420624, "sampling/sampling_logp_difference/max": 0.42397308349609375, "sampling/sampling_logp_difference/mean": 0.0395401269197464, "step": 331, "step_time": 11.972187067999585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 398.96875, "completions/mean_terminated_length": 483.3199768066406, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.7964060604572296, "epoch": 0.00664, "frac_reward_zero_std": 0.0, "grad_norm": 0.9940391778945923, "kl": 1.4934210777282715, "learning_rate": 5.741247761566219e-05, "loss": 0.1625, "num_tokens": 14393864.0, "reward": 0.5078614354133606, "reward_std": 0.6361294984817505, "rewards/rollout_reward_func/mean": 0.5078614354133606, "rewards/rollout_reward_func/std": 0.6415656208992004, "sampling/importance_sampling_ratio/max": 2.337026596069336, "sampling/importance_sampling_ratio/mean": 1.0020885467529297, "sampling/importance_sampling_ratio/min": 2.7452451447243027e-34, "sampling/sampling_logp_difference/max": 15.03607177734375, "sampling/sampling_logp_difference/mean": 0.14753322303295135, "step": 332, "step_time": 14.733003164001275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1871.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 480.90625, "completions/mean_terminated_length": 534.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.8106079995632172, "epoch": 0.00666, "frac_reward_zero_std": 0.0, "grad_norm": 0.9075634479522705, "kl": 1.3264703005552292, "learning_rate": 5.732388659142449e-05, "loss": 0.1139, "num_tokens": 14440181.0, "reward": 0.6172409057617188, "reward_std": 0.59410560131073, "rewards/rollout_reward_func/mean": 0.6172409057617188, "rewards/rollout_reward_func/std": 0.6634979248046875, "sampling/importance_sampling_ratio/max": 1.9714504480361938, "sampling/importance_sampling_ratio/mean": 0.8980029225349426, "sampling/importance_sampling_ratio/min": 0.3687222898006439, "sampling/sampling_logp_difference/max": 0.47480344772338867, "sampling/sampling_logp_difference/mean": 0.03970661386847496, "step": 333, "step_time": 14.951573126998483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 452.65625, "completions/mean_terminated_length": 428.68182373046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.9442303776741028, "epoch": 0.00668, "frac_reward_zero_std": 0.25, "grad_norm": 0.8010836839675903, "kl": 1.3189978748559952, "learning_rate": 5.723507602804034e-05, "loss": 0.0535, "num_tokens": 14483952.0, "reward": 0.7751100063323975, "reward_std": 0.48571693897247314, "rewards/rollout_reward_func/mean": 0.7751100063323975, "rewards/rollout_reward_func/std": 0.557670533657074, "sampling/importance_sampling_ratio/max": 2.4884440898895264, "sampling/importance_sampling_ratio/mean": 0.9962681531906128, "sampling/importance_sampling_ratio/min": 6.747588004798011e-27, "sampling/sampling_logp_difference/max": 30.020427703857422, "sampling/sampling_logp_difference/mean": 0.12846718728542328, "step": 334, "step_time": 13.95383020600184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 330.09375, "completions/mean_terminated_length": 280.0833435058594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8403267115354538, "epoch": 0.0067, "frac_reward_zero_std": 0.0, "grad_norm": 0.836379885673523, "kl": 1.5451650246977806, "learning_rate": 5.714604719786291e-05, "loss": 0.3393, "num_tokens": 14523195.0, "reward": 0.4195025861263275, "reward_std": 0.7674053907394409, "rewards/rollout_reward_func/mean": 0.4195025861263275, "rewards/rollout_reward_func/std": 0.7639087438583374, "sampling/importance_sampling_ratio/max": 2.0040528774261475, "sampling/importance_sampling_ratio/mean": 1.0116745233535767, "sampling/importance_sampling_ratio/min": 0.5902377963066101, "sampling/sampling_logp_difference/max": 0.3364601135253906, "sampling/sampling_logp_difference/mean": 0.040053896605968475, "step": 335, "step_time": 12.88753123999959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 652.6875, "completions/mean_terminated_length": 611.3599853515625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9175704270601273, "epoch": 0.00672, "frac_reward_zero_std": 0.25, "grad_norm": 0.6775393486022949, "kl": 1.312347412109375, "learning_rate": 5.7056801376372336e-05, "loss": -0.0573, "num_tokens": 14575024.0, "reward": 0.5681095719337463, "reward_std": 0.6378161311149597, "rewards/rollout_reward_func/mean": 0.5681095719337463, "rewards/rollout_reward_func/std": 0.7843438386917114, "sampling/importance_sampling_ratio/max": 2.444888114929199, "sampling/importance_sampling_ratio/mean": 1.0297870635986328, "sampling/importance_sampling_ratio/min": 0.2999047636985779, "sampling/sampling_logp_difference/max": 0.44620227813720703, "sampling/sampling_logp_difference/mean": 0.03945869207382202, "step": 336, "step_time": 15.153042409999216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 562.8125, "completions/mean_terminated_length": 517.2692260742188, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6205780804157257, "epoch": 0.00674, "frac_reward_zero_std": 0.0, "grad_norm": 0.5834155082702637, "kl": 1.443606436252594, "learning_rate": 5.6967339842157554e-05, "loss": 0.1848, "num_tokens": 14623981.0, "reward": 0.3851495385169983, "reward_std": 0.8788095712661743, "rewards/rollout_reward_func/mean": 0.3851495385169983, "rewards/rollout_reward_func/std": 0.8879690766334534, "sampling/importance_sampling_ratio/max": 2.4352755546569824, "sampling/importance_sampling_ratio/mean": 1.021919846534729, "sampling/importance_sampling_ratio/min": 0.45131590962409973, "sampling/sampling_logp_difference/max": 0.30412769317626953, "sampling/sampling_logp_difference/mean": 0.03588489443063736, "step": 337, "step_time": 13.215937141001632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1786.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 361.46875, "completions/mean_terminated_length": 368.3793029785156, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.595387078821659, "epoch": 0.00676, "frac_reward_zero_std": 0.0, "grad_norm": 0.7206841111183167, "kl": 1.626193344593048, "learning_rate": 5.687766387689789e-05, "loss": 0.0789, "num_tokens": 14664881.0, "reward": 0.4694904386997223, "reward_std": 0.845159649848938, "rewards/rollout_reward_func/mean": 0.4694904386997223, "rewards/rollout_reward_func/std": 0.834299623966217, "sampling/importance_sampling_ratio/max": 2.641738176345825, "sampling/importance_sampling_ratio/mean": 1.1516799926757812, "sampling/importance_sampling_ratio/min": 0.8350897431373596, "sampling/sampling_logp_difference/max": 0.3199906349182129, "sampling/sampling_logp_difference/mean": 0.033841103315353394, "step": 338, "step_time": 12.364065835000474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 466.71875, "completions/mean_terminated_length": 466.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.127852726727724, "epoch": 0.00678, "frac_reward_zero_std": 0.0, "grad_norm": 0.5049527287483215, "kl": 2.238286644220352, "learning_rate": 5.6787774765344765e-05, "loss": 0.0821, "num_tokens": 14709671.0, "reward": 0.5924090147018433, "reward_std": 0.7776800990104675, "rewards/rollout_reward_func/mean": 0.5924090147018433, "rewards/rollout_reward_func/std": 0.7513605952262878, "sampling/importance_sampling_ratio/max": 1.2131597995758057, "sampling/importance_sampling_ratio/mean": 0.9618440866470337, "sampling/importance_sampling_ratio/min": 0.6197219491004944, "sampling/sampling_logp_difference/max": 0.20769214630126953, "sampling/sampling_logp_difference/mean": 0.028911111876368523, "step": 339, "step_time": 10.558482098999775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 375.40625, "completions/mean_terminated_length": 375.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6573960520327091, "epoch": 0.0068, "frac_reward_zero_std": 0.5, "grad_norm": 0.26827532052993774, "kl": 1.7425964325666428, "learning_rate": 5.669767379530321e-05, "loss": 0.0109, "num_tokens": 14750737.0, "reward": 0.8414770364761353, "reward_std": 0.36255210638046265, "rewards/rollout_reward_func/mean": 0.8414770364761353, "rewards/rollout_reward_func/std": 0.5163783431053162, "sampling/importance_sampling_ratio/max": 1.4381166696548462, "sampling/importance_sampling_ratio/mean": 0.9988709092140198, "sampling/importance_sampling_ratio/min": 0.7888900637626648, "sampling/sampling_logp_difference/max": 0.274397611618042, "sampling/sampling_logp_difference/mean": 0.027827434241771698, "step": 340, "step_time": 9.472722025000621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1662.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 545.875, "completions/mean_terminated_length": 545.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5146646233042702, "epoch": 0.00682, "frac_reward_zero_std": 0.5, "grad_norm": 0.19750210642814636, "kl": 1.2779754847288132, "learning_rate": 5.660736225761356e-05, "loss": 0.0037, "num_tokens": 14797697.0, "reward": 0.9070193767547607, "reward_std": 0.2629888355731964, "rewards/rollout_reward_func/mean": 0.9070193767547607, "rewards/rollout_reward_func/std": 0.38215431571006775, "sampling/importance_sampling_ratio/max": 1.3834794759750366, "sampling/importance_sampling_ratio/mean": 1.001630425453186, "sampling/importance_sampling_ratio/min": 0.6713005304336548, "sampling/sampling_logp_difference/max": 0.26518332958221436, "sampling/sampling_logp_difference/mean": 0.03095483034849167, "step": 341, "step_time": 11.431528274999437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 373.71875, "completions/mean_terminated_length": 373.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3572801072150469, "epoch": 0.00684, "frac_reward_zero_std": 0.0, "grad_norm": 0.3401959240436554, "kl": 1.0564737543463707, "learning_rate": 5.651684144613278e-05, "loss": 0.0138, "num_tokens": 14838929.0, "reward": 0.7075363397598267, "reward_std": 0.6120187044143677, "rewards/rollout_reward_func/mean": 0.7075363397598267, "rewards/rollout_reward_func/std": 0.5939435362815857, "sampling/importance_sampling_ratio/max": 1.269450068473816, "sampling/importance_sampling_ratio/mean": 0.9870098829269409, "sampling/importance_sampling_ratio/min": 0.6510058045387268, "sampling/sampling_logp_difference/max": 0.4152100086212158, "sampling/sampling_logp_difference/mean": 0.024194221943616867, "step": 342, "step_time": 10.213652495998758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1756.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 497.0625, "completions/mean_terminated_length": 497.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3099426068365574, "epoch": 0.00686, "frac_reward_zero_std": 0.25, "grad_norm": 0.30056580901145935, "kl": 1.0136556699872017, "learning_rate": 5.642611265771605e-05, "loss": 0.0276, "num_tokens": 14884544.0, "reward": 0.8111120462417603, "reward_std": 0.4122530221939087, "rewards/rollout_reward_func/mean": 0.8111120462417603, "rewards/rollout_reward_func/std": 0.537300705909729, "sampling/importance_sampling_ratio/max": 1.6869990825653076, "sampling/importance_sampling_ratio/mean": 0.9913614988327026, "sampling/importance_sampling_ratio/min": 0.7716643214225769, "sampling/sampling_logp_difference/max": 0.3110926151275635, "sampling/sampling_logp_difference/mean": 0.02017364278435707, "step": 343, "step_time": 10.903116887998294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 552.09375, "completions/mean_terminated_length": 552.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.25482019782066345, "epoch": 0.00688, "frac_reward_zero_std": 0.0, "grad_norm": 0.362052321434021, "kl": 2.3351273760199547, "learning_rate": 5.6335177192198165e-05, "loss": -0.0093, "num_tokens": 14932224.0, "reward": 0.6516520977020264, "reward_std": 0.6835556030273438, "rewards/rollout_reward_func/mean": 0.6516520977020264, "rewards/rollout_reward_func/std": 0.7004400491714478, "sampling/importance_sampling_ratio/max": 1.4077190160751343, "sampling/importance_sampling_ratio/mean": 1.0053741931915283, "sampling/importance_sampling_ratio/min": 0.7238722443580627, "sampling/sampling_logp_difference/max": 0.33116888999938965, "sampling/sampling_logp_difference/mean": 0.021225009113550186, "step": 344, "step_time": 10.279450086999532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 448.78125, "completions/mean_terminated_length": 448.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.26859883684664965, "epoch": 0.0069, "frac_reward_zero_std": 0.0, "grad_norm": 0.30896106362342834, "kl": 1.3344142585992813, "learning_rate": 5.624403635237489e-05, "loss": 0.0071, "num_tokens": 14975800.0, "reward": 0.630664050579071, "reward_std": 0.7646483182907104, "rewards/rollout_reward_func/mean": 0.630664050579071, "rewards/rollout_reward_func/std": 0.7367360591888428, "sampling/importance_sampling_ratio/max": 1.6156985759735107, "sampling/importance_sampling_ratio/mean": 1.015537977218628, "sampling/importance_sampling_ratio/min": 0.8715435266494751, "sampling/sampling_logp_difference/max": 0.38169097900390625, "sampling/sampling_logp_difference/mean": 0.011256679892539978, "step": 345, "step_time": 10.451012101000742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 436.46875, "completions/mean_terminated_length": 436.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.45205962285399437, "epoch": 0.00692, "frac_reward_zero_std": 0.0, "grad_norm": 0.5553202629089355, "kl": 1.0258909948170185, "learning_rate": 5.615269144398426e-05, "loss": -0.0085, "num_tokens": 15019798.0, "reward": 0.4336846172809601, "reward_std": 0.8426246643066406, "rewards/rollout_reward_func/mean": 0.4336846172809601, "rewards/rollout_reward_func/std": 0.8338596820831299, "sampling/importance_sampling_ratio/max": 2.418038845062256, "sampling/importance_sampling_ratio/mean": 1.0186384916305542, "sampling/importance_sampling_ratio/min": 0.26317551732063293, "sampling/sampling_logp_difference/max": 1.3448988199234009, "sampling/sampling_logp_difference/mean": 0.036054618656635284, "step": 346, "step_time": 11.246449744000529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 291.6875, "completions/mean_terminated_length": 328.7857360839844, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6980332434177399, "epoch": 0.00694, "frac_reward_zero_std": 0.0, "grad_norm": 1.4485279321670532, "kl": 1.6890353560447693, "learning_rate": 5.606114377568799e-05, "loss": 0.2036, "num_tokens": 15058187.0, "reward": 0.49942439794540405, "reward_std": 0.7699728012084961, "rewards/rollout_reward_func/mean": 0.49942439794540405, "rewards/rollout_reward_func/std": 0.8406020998954773, "sampling/importance_sampling_ratio/max": 2.869196653366089, "sampling/importance_sampling_ratio/mean": 0.8579082489013672, "sampling/importance_sampling_ratio/min": 2.6125670398923295e-18, "sampling/sampling_logp_difference/max": 21.39548110961914, "sampling/sampling_logp_difference/mean": 0.3355177044868469, "step": 347, "step_time": 12.023896689998764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 260.75, "completions/mean_terminated_length": 303.1111145019531, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6054564900696278, "epoch": 0.00696, "frac_reward_zero_std": 0.0, "grad_norm": 0.7562420964241028, "kl": 1.357473723590374, "learning_rate": 5.596939465905262e-05, "loss": 0.0341, "num_tokens": 15096891.0, "reward": 0.31488293409347534, "reward_std": 0.8090741038322449, "rewards/rollout_reward_func/mean": 0.31488293409347534, "rewards/rollout_reward_func/std": 0.9228051900863647, "sampling/importance_sampling_ratio/max": 1.6291882991790771, "sampling/importance_sampling_ratio/mean": 0.6794177293777466, "sampling/importance_sampling_ratio/min": 7.213281293619643e-18, "sampling/sampling_logp_difference/max": 27.935880661010742, "sampling/sampling_logp_difference/mean": 0.4537343978881836, "step": 348, "step_time": 10.317890964999606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 533.96875, "completions/mean_terminated_length": 585.8965454101562, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6235922202467918, "epoch": 0.00698, "frac_reward_zero_std": 0.0, "grad_norm": 0.6668912172317505, "kl": 2.3682532608509064, "learning_rate": 5.587744540853076e-05, "loss": 0.0834, "num_tokens": 15144251.0, "reward": 0.4648650288581848, "reward_std": 0.8716228604316711, "rewards/rollout_reward_func/mean": 0.4648650288581848, "rewards/rollout_reward_func/std": 0.842811107635498, "sampling/importance_sampling_ratio/max": 1.565379023551941, "sampling/importance_sampling_ratio/mean": 0.8309555053710938, "sampling/importance_sampling_ratio/min": 5.172586552492827e-20, "sampling/sampling_logp_difference/max": 24.460145950317383, "sampling/sampling_logp_difference/mean": 0.23115403950214386, "step": 349, "step_time": 12.723509564001688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 575.9375, "completions/mean_terminated_length": 564.9677124023438, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5153351649641991, "epoch": 0.007, "frac_reward_zero_std": 0.25, "grad_norm": 0.6508499383926392, "kl": 3.0029600262641907, "learning_rate": 5.5785297341442255e-05, "loss": 0.0601, "num_tokens": 15193068.0, "reward": 0.5977025628089905, "reward_std": 0.6221460103988647, "rewards/rollout_reward_func/mean": 0.5977025628089905, "rewards/rollout_reward_func/std": 0.7899593114852905, "sampling/importance_sampling_ratio/max": 1.3995667695999146, "sampling/importance_sampling_ratio/mean": 0.9406589269638062, "sampling/importance_sampling_ratio/min": 6.266917785257566e-16, "sampling/sampling_logp_difference/max": 21.255714416503906, "sampling/sampling_logp_difference/mean": 0.09757798910140991, "step": 350, "step_time": 11.928615687000274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 363.375, "completions/mean_terminated_length": 363.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6317253150045872, "epoch": 0.00702, "frac_reward_zero_std": 0.5, "grad_norm": 0.33659160137176514, "kl": 1.4857578054070473, "learning_rate": 5.569295177795533e-05, "loss": -0.0198, "num_tokens": 15234154.0, "reward": 0.8694150447845459, "reward_std": 0.2851160764694214, "rewards/rollout_reward_func/mean": 0.8694150447845459, "rewards/rollout_reward_func/std": 0.43163222074508667, "sampling/importance_sampling_ratio/max": 1.6811342239379883, "sampling/importance_sampling_ratio/mean": 0.9363423585891724, "sampling/importance_sampling_ratio/min": 6.276675604809936e-17, "sampling/sampling_logp_difference/max": 33.01884460449219, "sampling/sampling_logp_difference/mean": 0.18572190403938293, "step": 351, "step_time": 8.863539136003055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 551.03125, "completions/mean_terminated_length": 551.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.4232826456427574, "epoch": 0.00704, "frac_reward_zero_std": 0.25, "grad_norm": 0.4128619134426117, "kl": 1.3111083209514618, "learning_rate": 5.5600410041067645e-05, "loss": -0.0029, "num_tokens": 15283104.0, "reward": 0.8392052054405212, "reward_std": 0.37545520067214966, "rewards/rollout_reward_func/mean": 0.8392052054405212, "rewards/rollout_reward_func/std": 0.4532161056995392, "sampling/importance_sampling_ratio/max": 1.3634523153305054, "sampling/importance_sampling_ratio/mean": 1.004796028137207, "sampling/importance_sampling_ratio/min": 0.3870285749435425, "sampling/sampling_logp_difference/max": 0.8312829732894897, "sampling/sampling_logp_difference/mean": 0.030815765261650085, "step": 352, "step_time": 12.508409778998612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 483.625, "completions/mean_terminated_length": 483.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2588706910610199, "epoch": 0.00706, "frac_reward_zero_std": 0.25, "grad_norm": 0.20580995082855225, "kl": 1.3041187450289726, "learning_rate": 5.550767345658734e-05, "loss": -0.0039, "num_tokens": 15327833.0, "reward": 0.8063299655914307, "reward_std": 0.4623047709465027, "rewards/rollout_reward_func/mean": 0.8063299655914307, "rewards/rollout_reward_func/std": 0.54346764087677, "sampling/importance_sampling_ratio/max": 1.115119457244873, "sampling/importance_sampling_ratio/mean": 0.9769562482833862, "sampling/importance_sampling_ratio/min": 0.7310574650764465, "sampling/sampling_logp_difference/max": 0.23006248474121094, "sampling/sampling_logp_difference/mean": 0.01754320226609707, "step": 353, "step_time": 10.169423343000744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1723.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 407.15625, "completions/mean_terminated_length": 407.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.23138930462300777, "epoch": 0.00708, "frac_reward_zero_std": 0.0, "grad_norm": 0.32352226972579956, "kl": 1.3931075558066368, "learning_rate": 5.541474335311412e-05, "loss": 0.0137, "num_tokens": 15368659.0, "reward": 0.6462339162826538, "reward_std": 0.6853234767913818, "rewards/rollout_reward_func/mean": 0.6462339162826538, "rewards/rollout_reward_func/std": 0.660455048084259, "sampling/importance_sampling_ratio/max": 1.2884831428527832, "sampling/importance_sampling_ratio/mean": 1.01698637008667, "sampling/importance_sampling_ratio/min": 0.938805103302002, "sampling/sampling_logp_difference/max": 0.33804845809936523, "sampling/sampling_logp_difference/mean": 0.011274769902229309, "step": 354, "step_time": 10.286917626000104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1745.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 451.84375, "completions/mean_terminated_length": 451.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2763133477419615, "epoch": 0.0071, "frac_reward_zero_std": 0.25, "grad_norm": 0.14559493958950043, "kl": 1.5571837350726128, "learning_rate": 5.532162106202009e-05, "loss": -0.0062, "num_tokens": 15412544.0, "reward": 0.8086154460906982, "reward_std": 0.47547101974487305, "rewards/rollout_reward_func/mean": 0.8086154460906982, "rewards/rollout_reward_func/std": 0.5394358038902283, "sampling/importance_sampling_ratio/max": 1.118674635887146, "sampling/importance_sampling_ratio/mean": 0.9699651598930359, "sampling/importance_sampling_ratio/min": 0.4991060197353363, "sampling/sampling_logp_difference/max": 0.26390504837036133, "sampling/sampling_logp_difference/mean": 0.016544293612241745, "step": 355, "step_time": 10.815177925000171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1668.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 429.03125, "completions/mean_terminated_length": 429.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.25587788596749306, "epoch": 0.00712, "frac_reward_zero_std": 0.0, "grad_norm": 0.4682311415672302, "kl": 1.230936884880066, "learning_rate": 5.522830791743078e-05, "loss": 0.0291, "num_tokens": 15454113.0, "reward": 0.642177164554596, "reward_std": 0.5590000152587891, "rewards/rollout_reward_func/mean": 0.642177164554596, "rewards/rollout_reward_func/std": 0.6131576299667358, "sampling/importance_sampling_ratio/max": 1.3378260135650635, "sampling/importance_sampling_ratio/mean": 0.990329384803772, "sampling/importance_sampling_ratio/min": 0.7274982333183289, "sampling/sampling_logp_difference/max": 0.3336830139160156, "sampling/sampling_logp_difference/mean": 0.013576183468103409, "step": 356, "step_time": 10.861892472001273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 479.8125, "completions/mean_terminated_length": 479.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.18072244245558977, "epoch": 0.00714, "frac_reward_zero_std": 0.25, "grad_norm": 0.41938164830207825, "kl": 1.9870564341545105, "learning_rate": 5.5134805256206026e-05, "loss": 0.0203, "num_tokens": 15500160.0, "reward": 0.7990494966506958, "reward_std": 0.35742515325546265, "rewards/rollout_reward_func/mean": 0.7990494966506958, "rewards/rollout_reward_func/std": 0.4250444173812866, "sampling/importance_sampling_ratio/max": 1.401049256324768, "sampling/importance_sampling_ratio/mean": 1.0110900402069092, "sampling/importance_sampling_ratio/min": 0.9047029614448547, "sampling/sampling_logp_difference/max": 0.33108723163604736, "sampling/sampling_logp_difference/mean": 0.007975272834300995, "step": 357, "step_time": 8.22315237100156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1669.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 481.125, "completions/mean_terminated_length": 481.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.24806832615286112, "epoch": 0.00716, "frac_reward_zero_std": 0.0, "grad_norm": 0.7342888116836548, "kl": 1.7505519837141037, "learning_rate": 5.504111441792073e-05, "loss": 0.0296, "num_tokens": 15545312.0, "reward": 0.6955374479293823, "reward_std": 0.5162494778633118, "rewards/rollout_reward_func/mean": 0.6955374479293823, "rewards/rollout_reward_func/std": 0.4946204721927643, "sampling/importance_sampling_ratio/max": 1.537907600402832, "sampling/importance_sampling_ratio/mean": 0.9686342477798462, "sampling/importance_sampling_ratio/min": 0.6676719784736633, "sampling/sampling_logp_difference/max": 0.6809341907501221, "sampling/sampling_logp_difference/mean": 0.027112144976854324, "step": 358, "step_time": 10.384489704999396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 458.1875, "completions/mean_terminated_length": 458.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3422762956470251, "epoch": 0.00718, "frac_reward_zero_std": 0.25, "grad_norm": 0.48569920659065247, "kl": 1.7360420525074005, "learning_rate": 5.49472367448458e-05, "loss": 0.0066, "num_tokens": 15590203.0, "reward": 0.797340989112854, "reward_std": 0.35992830991744995, "rewards/rollout_reward_func/mean": 0.797340989112854, "rewards/rollout_reward_func/std": 0.4286954402923584, "sampling/importance_sampling_ratio/max": 2.01469087600708, "sampling/importance_sampling_ratio/mean": 1.0527303218841553, "sampling/importance_sampling_ratio/min": 0.7728344798088074, "sampling/sampling_logp_difference/max": 0.2750670909881592, "sampling/sampling_logp_difference/mean": 0.024681344628334045, "step": 359, "step_time": 8.4021529620004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 436.0, "completions/mean_terminated_length": 449.0322570800781, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5354658402502537, "epoch": 0.0072, "frac_reward_zero_std": 0.0, "grad_norm": 0.7487923502922058, "kl": 2.2236952036619186, "learning_rate": 5.485317358192881e-05, "loss": 0.1102, "num_tokens": 15634592.0, "reward": 0.5908601880073547, "reward_std": 0.6981725096702576, "rewards/rollout_reward_func/mean": 0.5908601880073547, "rewards/rollout_reward_func/std": 0.7542515397071838, "sampling/importance_sampling_ratio/max": 1.630153775215149, "sampling/importance_sampling_ratio/mean": 0.9872914552688599, "sampling/importance_sampling_ratio/min": 7.355750580986592e-17, "sampling/sampling_logp_difference/max": 33.35157775878906, "sampling/sampling_logp_difference/mean": 0.16453231871128082, "step": 360, "step_time": 10.637750355001117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 491.09375, "completions/mean_terminated_length": 491.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3495447617024183, "epoch": 0.00722, "frac_reward_zero_std": 0.25, "grad_norm": 0.2831980884075165, "kl": 1.665136605501175, "learning_rate": 5.4758926276774764e-05, "loss": 0.0126, "num_tokens": 15682028.0, "reward": 0.681244432926178, "reward_std": 0.4636746942996979, "rewards/rollout_reward_func/mean": 0.681244432926178, "rewards/rollout_reward_func/std": 0.6466308236122131, "sampling/importance_sampling_ratio/max": 1.4961107969284058, "sampling/importance_sampling_ratio/mean": 0.9890972375869751, "sampling/importance_sampling_ratio/min": 0.7461020350456238, "sampling/sampling_logp_difference/max": 0.49579763412475586, "sampling/sampling_logp_difference/mean": 0.02395690605044365, "step": 361, "step_time": 8.72220128400113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 431.46875, "completions/mean_terminated_length": 431.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.16433336725458503, "epoch": 0.00724, "frac_reward_zero_std": 0.0, "grad_norm": 0.3079923987388611, "kl": 1.5277294963598251, "learning_rate": 5.466449617962682e-05, "loss": 0.0226, "num_tokens": 15725270.0, "reward": 0.8002012968063354, "reward_std": 0.4358254075050354, "rewards/rollout_reward_func/mean": 0.8002012968063354, "rewards/rollout_reward_func/std": 0.42262834310531616, "sampling/importance_sampling_ratio/max": 1.1932311058044434, "sampling/importance_sampling_ratio/mean": 0.9934345483779907, "sampling/importance_sampling_ratio/min": 0.5525137782096863, "sampling/sampling_logp_difference/max": 0.5941166877746582, "sampling/sampling_logp_difference/mean": 0.015791255980730057, "step": 362, "step_time": 10.074868324000818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 562.0, "completions/mean_terminated_length": 562.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1589171104133129, "epoch": 0.00726, "frac_reward_zero_std": 0.25, "grad_norm": 0.3548026978969574, "kl": 1.8210093677043915, "learning_rate": 5.4569884643346925e-05, "loss": 0.0132, "num_tokens": 15773284.0, "reward": 0.7716923952102661, "reward_std": 0.43122154474258423, "rewards/rollout_reward_func/mean": 0.7716923952102661, "rewards/rollout_reward_func/std": 0.5010194778442383, "sampling/importance_sampling_ratio/max": 1.1611007452011108, "sampling/importance_sampling_ratio/mean": 0.9865754842758179, "sampling/importance_sampling_ratio/min": 0.7729814648628235, "sampling/sampling_logp_difference/max": 0.35625243186950684, "sampling/sampling_logp_difference/mean": 0.013179942965507507, "step": 363, "step_time": 10.255349094998564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 405.8125, "completions/mean_terminated_length": 405.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.13207192672416568, "epoch": 0.00728, "frac_reward_zero_std": 0.0, "grad_norm": 0.20439188182353973, "kl": 1.484326884150505, "learning_rate": 5.447509302339641e-05, "loss": 0.0171, "num_tokens": 15814654.0, "reward": 0.7719881534576416, "reward_std": 0.5167226195335388, "rewards/rollout_reward_func/mean": 0.7719881534576416, "rewards/rollout_reward_func/std": 0.506413459777832, "sampling/importance_sampling_ratio/max": 1.0974934101104736, "sampling/importance_sampling_ratio/mean": 0.995803713798523, "sampling/importance_sampling_ratio/min": 0.8259373903274536, "sampling/sampling_logp_difference/max": 0.19388645887374878, "sampling/sampling_logp_difference/mean": 0.004858669824898243, "step": 364, "step_time": 10.745503641000141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 613.375, "completions/mean_terminated_length": 613.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1614094553515315, "epoch": 0.0073, "frac_reward_zero_std": 0.0, "grad_norm": 0.2651883065700531, "kl": 1.8524522930383682, "learning_rate": 5.4380122677816633e-05, "loss": 0.0257, "num_tokens": 15864838.0, "reward": 0.8337999582290649, "reward_std": 0.4047524333000183, "rewards/rollout_reward_func/mean": 0.8337999582290649, "rewards/rollout_reward_func/std": 0.3926210403442383, "sampling/importance_sampling_ratio/max": 1.713628888130188, "sampling/importance_sampling_ratio/mean": 1.0309630632400513, "sampling/importance_sampling_ratio/min": 0.8465203642845154, "sampling/sampling_logp_difference/max": 0.6784772872924805, "sampling/sampling_logp_difference/mean": 0.010714750736951828, "step": 365, "step_time": 12.144228509999266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1598.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 468.59375, "completions/mean_terminated_length": 468.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.193952988833189, "epoch": 0.00732, "frac_reward_zero_std": 0.0, "grad_norm": 0.251219242811203, "kl": 1.6826879009604454, "learning_rate": 5.428497496720943e-05, "loss": 0.0044, "num_tokens": 15910343.0, "reward": 0.6842281818389893, "reward_std": 0.6360292434692383, "rewards/rollout_reward_func/mean": 0.6842281818389893, "rewards/rollout_reward_func/std": 0.6402506828308105, "sampling/importance_sampling_ratio/max": 1.1160138845443726, "sampling/importance_sampling_ratio/mean": 0.9843735694885254, "sampling/importance_sampling_ratio/min": 0.7861959934234619, "sampling/sampling_logp_difference/max": 0.23944759368896484, "sampling/sampling_logp_difference/mean": 0.008780546486377716, "step": 366, "step_time": 10.932753956997658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 405.34375, "completions/mean_terminated_length": 405.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.15564095973968506, "epoch": 0.00734, "frac_reward_zero_std": 0.0, "grad_norm": 0.20755866169929504, "kl": 1.6868136525154114, "learning_rate": 5.4189651254717695e-05, "loss": 0.0183, "num_tokens": 15952521.0, "reward": 0.6796813011169434, "reward_std": 0.652112603187561, "rewards/rollout_reward_func/mean": 0.6796813011169434, "rewards/rollout_reward_func/std": 0.6510862112045288, "sampling/importance_sampling_ratio/max": 1.1314738988876343, "sampling/importance_sampling_ratio/mean": 1.0064617395401, "sampling/importance_sampling_ratio/min": 0.9293063879013062, "sampling/sampling_logp_difference/max": 0.11925601959228516, "sampling/sampling_logp_difference/mean": 0.006513582542538643, "step": 367, "step_time": 10.71238228799939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 457.40625, "completions/mean_terminated_length": 457.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2467304216697812, "epoch": 0.00736, "frac_reward_zero_std": 0.0, "grad_norm": 0.234165221452713, "kl": 1.9651928469538689, "learning_rate": 5.409415290600585e-05, "loss": 0.005, "num_tokens": 15996484.0, "reward": 0.6706942319869995, "reward_std": 0.613129734992981, "rewards/rollout_reward_func/mean": 0.6706942319869995, "rewards/rollout_reward_func/std": 0.6088826060295105, "sampling/importance_sampling_ratio/max": 1.1367413997650146, "sampling/importance_sampling_ratio/mean": 0.9913839101791382, "sampling/importance_sampling_ratio/min": 0.6948832869529724, "sampling/sampling_logp_difference/max": 0.23266315460205078, "sampling/sampling_logp_difference/mean": 0.013194211758673191, "step": 368, "step_time": 10.570538693000344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 477.5, "completions/mean_terminated_length": 477.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.18217508401721716, "epoch": 0.00738, "frac_reward_zero_std": 0.5, "grad_norm": 0.3067307472229004, "kl": 2.0075483173131943, "learning_rate": 5.3998481289240226e-05, "loss": 0.0029, "num_tokens": 16041336.0, "reward": 0.8349900245666504, "reward_std": 0.31506890058517456, "rewards/rollout_reward_func/mean": 0.8349900245666504, "rewards/rollout_reward_func/std": 0.46546313166618347, "sampling/importance_sampling_ratio/max": 1.1386356353759766, "sampling/importance_sampling_ratio/mean": 1.0004535913467407, "sampling/importance_sampling_ratio/min": 0.7953882217407227, "sampling/sampling_logp_difference/max": 0.3594703674316406, "sampling/sampling_logp_difference/mean": 0.010846041142940521, "step": 369, "step_time": 13.006640360997153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 502.625, "completions/mean_terminated_length": 502.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22418491169810295, "epoch": 0.0074, "frac_reward_zero_std": 0.0, "grad_norm": 0.3023553192615509, "kl": 2.2855331748723984, "learning_rate": 5.390263777506953e-05, "loss": 0.0344, "num_tokens": 16086969.0, "reward": 0.657551646232605, "reward_std": 0.7166868448257446, "rewards/rollout_reward_func/mean": 0.657551646232605, "rewards/rollout_reward_func/std": 0.69100421667099, "sampling/importance_sampling_ratio/max": 1.530936360359192, "sampling/importance_sampling_ratio/mean": 1.0118107795715332, "sampling/importance_sampling_ratio/min": 0.6884041428565979, "sampling/sampling_logp_difference/max": 0.36009860038757324, "sampling/sampling_logp_difference/mean": 0.01780262030661106, "step": 370, "step_time": 11.443769710997913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 525.03125, "completions/mean_terminated_length": 525.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.23204800952225924, "epoch": 0.00742, "frac_reward_zero_std": 0.0, "grad_norm": 0.17332705855369568, "kl": 1.6928104013204575, "learning_rate": 5.3806623736605144e-05, "loss": 0.0101, "num_tokens": 16133950.0, "reward": 0.46984362602233887, "reward_std": 0.8246922492980957, "rewards/rollout_reward_func/mean": 0.46984362602233887, "rewards/rollout_reward_func/std": 0.8334593772888184, "sampling/importance_sampling_ratio/max": 1.2764415740966797, "sampling/importance_sampling_ratio/mean": 1.0003893375396729, "sampling/importance_sampling_ratio/min": 0.7712336778640747, "sampling/sampling_logp_difference/max": 0.21071696281433105, "sampling/sampling_logp_difference/mean": 0.011267402209341526, "step": 371, "step_time": 10.74922608900033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2234.0, "completions/max_terminated_length": 2234.0, "completions/mean_length": 624.5625, "completions/mean_terminated_length": 624.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2759281247854233, "epoch": 0.00744, "frac_reward_zero_std": 0.0, "grad_norm": 0.16062955558300018, "kl": 1.933619499206543, "learning_rate": 5.371044054940152e-05, "loss": -0.0067, "num_tokens": 16183997.0, "reward": 0.7135738134384155, "reward_std": 0.6444463729858398, "rewards/rollout_reward_func/mean": 0.7135738134384155, "rewards/rollout_reward_func/std": 0.6364851593971252, "sampling/importance_sampling_ratio/max": 1.4378172159194946, "sampling/importance_sampling_ratio/mean": 1.0111042261123657, "sampling/importance_sampling_ratio/min": 0.8307021260261536, "sampling/sampling_logp_difference/max": 0.2975924611091614, "sampling/sampling_logp_difference/mean": 0.012432577088475227, "step": 372, "step_time": 12.475940202001766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 508.8125, "completions/mean_terminated_length": 508.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.34908811934292316, "epoch": 0.00746, "frac_reward_zero_std": 0.0, "grad_norm": 0.22763662040233612, "kl": 2.6357754319906235, "learning_rate": 5.361408959143637e-05, "loss": -0.0035, "num_tokens": 16231116.0, "reward": 0.3723641335964203, "reward_std": 0.8783254623413086, "rewards/rollout_reward_func/mean": 0.3723641335964203, "rewards/rollout_reward_func/std": 0.8622666597366333, "sampling/importance_sampling_ratio/max": 1.244858980178833, "sampling/importance_sampling_ratio/mean": 0.9991692304611206, "sampling/importance_sampling_ratio/min": 0.8305888175964355, "sampling/sampling_logp_difference/max": 0.13214898109436035, "sampling/sampling_logp_difference/mean": 0.013413630425930023, "step": 373, "step_time": 10.66228239999964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 429.65625, "completions/mean_terminated_length": 429.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.30482333712279797, "epoch": 0.00748, "frac_reward_zero_std": 0.0, "grad_norm": 0.28419116139411926, "kl": 1.9293970838189125, "learning_rate": 5.3517572243091084e-05, "loss": -0.01, "num_tokens": 16274033.0, "reward": 0.440564900636673, "reward_std": 0.8759573698043823, "rewards/rollout_reward_func/mean": 0.440564900636673, "rewards/rollout_reward_func/std": 0.8685812950134277, "sampling/importance_sampling_ratio/max": 1.1348769664764404, "sampling/importance_sampling_ratio/mean": 1.0000994205474854, "sampling/importance_sampling_ratio/min": 0.8309101462364197, "sampling/sampling_logp_difference/max": 0.1809018850326538, "sampling/sampling_logp_difference/mean": 0.01667284406721592, "step": 374, "step_time": 11.225290269999277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 313.8125, "completions/mean_terminated_length": 313.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17983679100871086, "epoch": 0.0075, "frac_reward_zero_std": 0.0, "grad_norm": 0.18342749774456024, "kl": 1.5149718597531319, "learning_rate": 5.342088988713077e-05, "loss": -0.0011, "num_tokens": 16312899.0, "reward": 0.5808793306350708, "reward_std": 0.6459633708000183, "rewards/rollout_reward_func/mean": 0.5808793306350708, "rewards/rollout_reward_func/std": 0.7195429801940918, "sampling/importance_sampling_ratio/max": 1.2549628019332886, "sampling/importance_sampling_ratio/mean": 1.011016607284546, "sampling/importance_sampling_ratio/min": 0.8621881604194641, "sampling/sampling_logp_difference/max": 0.12050771713256836, "sampling/sampling_logp_difference/mean": 0.009227455593645573, "step": 375, "step_time": 9.787745397000435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 338.0625, "completions/mean_terminated_length": 338.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22705092234537005, "epoch": 0.00752, "frac_reward_zero_std": 0.0, "grad_norm": 0.22067661583423615, "kl": 1.6980096846818924, "learning_rate": 5.332404390868455e-05, "loss": 0.0037, "num_tokens": 16352598.0, "reward": 0.4926507771015167, "reward_std": 0.7736526131629944, "rewards/rollout_reward_func/mean": 0.4926507771015167, "rewards/rollout_reward_func/std": 0.8030753135681152, "sampling/importance_sampling_ratio/max": 1.3299237489700317, "sampling/importance_sampling_ratio/mean": 1.0072020292282104, "sampling/importance_sampling_ratio/min": 0.8018565773963928, "sampling/sampling_logp_difference/max": 0.31852054595947266, "sampling/sampling_logp_difference/mean": 0.015216195024549961, "step": 376, "step_time": 8.02306848399985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 641.1875, "completions/mean_terminated_length": 641.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.19849132373929024, "epoch": 0.00754, "frac_reward_zero_std": 0.25, "grad_norm": 0.16620299220085144, "kl": 2.220570892095566, "learning_rate": 5.3227035695225784e-05, "loss": 0.0024, "num_tokens": 16404189.0, "reward": 0.7409209609031677, "reward_std": 0.49733152985572815, "rewards/rollout_reward_func/mean": 0.7409209609031677, "rewards/rollout_reward_func/std": 0.5756663680076599, "sampling/importance_sampling_ratio/max": 1.044629454612732, "sampling/importance_sampling_ratio/mean": 0.9805995225906372, "sampling/importance_sampling_ratio/min": 0.8013394474983215, "sampling/sampling_logp_difference/max": 0.2189011573791504, "sampling/sampling_logp_difference/mean": 0.0073119839653372765, "step": 377, "step_time": 11.551375933001509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 495.09375, "completions/mean_terminated_length": 495.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.20116573572158813, "epoch": 0.00756, "frac_reward_zero_std": 0.25, "grad_norm": 0.18684422969818115, "kl": 2.6616673171520233, "learning_rate": 5.312986663655199e-05, "loss": 0.0046, "num_tokens": 16449283.0, "reward": 0.6276858448982239, "reward_std": 0.6398844718933105, "rewards/rollout_reward_func/mean": 0.6276858448982239, "rewards/rollout_reward_func/std": 0.7375863790512085, "sampling/importance_sampling_ratio/max": 1.0612633228302002, "sampling/importance_sampling_ratio/mean": 0.9907390475273132, "sampling/importance_sampling_ratio/min": 0.7811134457588196, "sampling/sampling_logp_difference/max": 0.2476201057434082, "sampling/sampling_logp_difference/mean": 0.0065937042236328125, "step": 378, "step_time": 10.76980765800181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 552.53125, "completions/mean_terminated_length": 552.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1445841370150447, "epoch": 0.00758, "frac_reward_zero_std": 0.25, "grad_norm": 0.1105053722858429, "kl": 2.1076736450195312, "learning_rate": 5.3032538124765136e-05, "loss": 0.0111, "num_tokens": 16497934.0, "reward": 0.7137702703475952, "reward_std": 0.4719046354293823, "rewards/rollout_reward_func/mean": 0.7137702703475952, "rewards/rollout_reward_func/std": 0.5842422246932983, "sampling/importance_sampling_ratio/max": 1.1322194337844849, "sampling/importance_sampling_ratio/mean": 0.9968186616897583, "sampling/importance_sampling_ratio/min": 0.8887686729431152, "sampling/sampling_logp_difference/max": 0.1250920295715332, "sampling/sampling_logp_difference/mean": 0.0044670384377241135, "step": 379, "step_time": 11.403103627999371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 534.59375, "completions/mean_terminated_length": 534.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17791003547608852, "epoch": 0.0076, "frac_reward_zero_std": 0.0, "grad_norm": 0.22216714918613434, "kl": 2.5138309746980667, "learning_rate": 5.293505155425156e-05, "loss": -0.004, "num_tokens": 16546607.0, "reward": 0.5966638326644897, "reward_std": 0.8199155330657959, "rewards/rollout_reward_func/mean": 0.5966638326644897, "rewards/rollout_reward_func/std": 0.7910171151161194, "sampling/importance_sampling_ratio/max": 1.100141167640686, "sampling/importance_sampling_ratio/mean": 0.994981586933136, "sampling/importance_sampling_ratio/min": 0.8106642961502075, "sampling/sampling_logp_difference/max": 0.21000003814697266, "sampling/sampling_logp_difference/mean": 0.008211307227611542, "step": 380, "step_time": 10.806284592000338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 606.78125, "completions/mean_terminated_length": 606.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17060753889381886, "epoch": 0.00762, "frac_reward_zero_std": 0.25, "grad_norm": 0.18209855258464813, "kl": 1.9273536652326584, "learning_rate": 5.283740832166208e-05, "loss": -0.0083, "num_tokens": 16596632.0, "reward": 0.625102162361145, "reward_std": 0.6270291805267334, "rewards/rollout_reward_func/mean": 0.625102162361145, "rewards/rollout_reward_func/std": 0.7443488240242004, "sampling/importance_sampling_ratio/max": 1.2072134017944336, "sampling/importance_sampling_ratio/mean": 1.0118136405944824, "sampling/importance_sampling_ratio/min": 0.9275223016738892, "sampling/sampling_logp_difference/max": 0.11366796493530273, "sampling/sampling_logp_difference/mean": 0.00655343709513545, "step": 381, "step_time": 12.84639911099839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 536.9375, "completions/mean_terminated_length": 536.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.21167331538163126, "epoch": 0.00764, "frac_reward_zero_std": 0.25, "grad_norm": 0.3247912526130676, "kl": 2.160668596625328, "learning_rate": 5.2739609825891917e-05, "loss": 0.0203, "num_tokens": 16643386.0, "reward": 0.5853642821311951, "reward_std": 0.5972873568534851, "rewards/rollout_reward_func/mean": 0.5853642821311951, "rewards/rollout_reward_func/std": 0.7131071090698242, "sampling/importance_sampling_ratio/max": 1.1217405796051025, "sampling/importance_sampling_ratio/mean": 0.9966710209846497, "sampling/importance_sampling_ratio/min": 0.8561088442802429, "sampling/sampling_logp_difference/max": 0.2253129482269287, "sampling/sampling_logp_difference/mean": 0.009671906940639019, "step": 382, "step_time": 10.815089351000097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 599.4375, "completions/mean_terminated_length": 599.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.14781636651605368, "epoch": 0.00766, "frac_reward_zero_std": 0.0, "grad_norm": 0.1309104859828949, "kl": 1.8711841702461243, "learning_rate": 5.264165746806071e-05, "loss": 0.0055, "num_tokens": 16693628.0, "reward": 0.7115412950515747, "reward_std": 0.5681352615356445, "rewards/rollout_reward_func/mean": 0.7115412950515747, "rewards/rollout_reward_func/std": 0.5859503149986267, "sampling/importance_sampling_ratio/max": 1.101539134979248, "sampling/importance_sampling_ratio/mean": 1.0003762245178223, "sampling/importance_sampling_ratio/min": 0.8118770122528076, "sampling/sampling_logp_difference/max": 0.20527887344360352, "sampling/sampling_logp_difference/mean": 0.005052180495113134, "step": 383, "step_time": 11.433702741996967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 400.28125, "completions/mean_terminated_length": 400.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09306072117760777, "epoch": 0.00768, "frac_reward_zero_std": 0.0, "grad_norm": 0.21978360414505005, "kl": 2.1223390698432922, "learning_rate": 5.2543552651492394e-05, "loss": 0.0147, "num_tokens": 16735973.0, "reward": 0.8044307231903076, "reward_std": 0.48861363530158997, "rewards/rollout_reward_func/mean": 0.8044307231903076, "rewards/rollout_reward_func/std": 0.4829864799976349, "sampling/importance_sampling_ratio/max": 1.0457457304000854, "sampling/importance_sampling_ratio/mean": 0.998588502407074, "sampling/importance_sampling_ratio/min": 0.9508208632469177, "sampling/sampling_logp_difference/max": 0.054337143898010254, "sampling/sampling_logp_difference/mean": 0.002319393213838339, "step": 384, "step_time": 10.868102234001526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 411.875, "completions/mean_terminated_length": 411.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05354924825951457, "epoch": 0.0077, "frac_reward_zero_std": 0.5, "grad_norm": 0.03983747586607933, "kl": 1.9543060809373856, "learning_rate": 5.244529678169513e-05, "loss": 0.012, "num_tokens": 16778978.0, "reward": 0.8659685850143433, "reward_std": 0.2482631802558899, "rewards/rollout_reward_func/mean": 0.8659685850143433, "rewards/rollout_reward_func/std": 0.3604085147380829, "sampling/importance_sampling_ratio/max": 1.1476041078567505, "sampling/importance_sampling_ratio/mean": 1.003883957862854, "sampling/importance_sampling_ratio/min": 0.9666488170623779, "sampling/sampling_logp_difference/max": 0.1377893090248108, "sampling/sampling_logp_difference/mean": 0.0018278828356415033, "step": 385, "step_time": 10.900852307000605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 509.1875, "completions/mean_terminated_length": 509.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09046693565323949, "epoch": 0.00772, "frac_reward_zero_std": 0.0, "grad_norm": 0.07293256372213364, "kl": 1.9507473036646843, "learning_rate": 5.2346891266341196e-05, "loss": 0.004, "num_tokens": 16825848.0, "reward": 0.7092801332473755, "reward_std": 0.5516120195388794, "rewards/rollout_reward_func/mean": 0.7092801332473755, "rewards/rollout_reward_func/std": 0.5915241241455078, "sampling/importance_sampling_ratio/max": 1.0240585803985596, "sampling/importance_sampling_ratio/mean": 0.9811479449272156, "sampling/importance_sampling_ratio/min": 0.650489330291748, "sampling/sampling_logp_difference/max": 0.4302060604095459, "sampling/sampling_logp_difference/mean": 0.006097988225519657, "step": 386, "step_time": 10.890585995000038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 295.28125, "completions/mean_terminated_length": 295.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11651570326648653, "epoch": 0.00774, "frac_reward_zero_std": 0.0, "grad_norm": 0.39590510725975037, "kl": 1.8577609434723854, "learning_rate": 5.22483375152467e-05, "loss": -0.0129, "num_tokens": 16864667.0, "reward": 0.5239696502685547, "reward_std": 0.7297723293304443, "rewards/rollout_reward_func/mean": 0.5239696502685547, "rewards/rollout_reward_func/std": 0.7605187892913818, "sampling/importance_sampling_ratio/max": 1.187969446182251, "sampling/importance_sampling_ratio/mean": 0.9867372512817383, "sampling/importance_sampling_ratio/min": 0.648615837097168, "sampling/sampling_logp_difference/max": 0.43290209770202637, "sampling/sampling_logp_difference/mean": 0.013611191883683205, "step": 387, "step_time": 7.541108244999123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 447.78125, "completions/mean_terminated_length": 447.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06868334556929767, "epoch": 0.00776, "frac_reward_zero_std": 0.0, "grad_norm": 0.13021554052829742, "kl": 2.0371861904859543, "learning_rate": 5.214963694035153e-05, "loss": 0.0079, "num_tokens": 16909026.0, "reward": 0.7973017692565918, "reward_std": 0.4419345259666443, "rewards/rollout_reward_func/mean": 0.7973017692565918, "rewards/rollout_reward_func/std": 0.4287479817867279, "sampling/importance_sampling_ratio/max": 1.0613411664962769, "sampling/importance_sampling_ratio/mean": 0.978864312171936, "sampling/importance_sampling_ratio/min": 0.5608057379722595, "sampling/sampling_logp_difference/max": 0.5771946907043457, "sampling/sampling_logp_difference/mean": 0.00859585776925087, "step": 388, "step_time": 9.566711003999444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 425.125, "completions/mean_terminated_length": 425.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04940441995859146, "epoch": 0.00778, "frac_reward_zero_std": 0.25, "grad_norm": 0.07168085128068924, "kl": 2.6058423668146133, "learning_rate": 5.205079095569905e-05, "loss": 0.0088, "num_tokens": 16953398.0, "reward": 0.7714732885360718, "reward_std": 0.40988850593566895, "rewards/rollout_reward_func/mean": 0.7714732885360718, "rewards/rollout_reward_func/std": 0.5032768845558167, "sampling/importance_sampling_ratio/max": 1.0293927192687988, "sampling/importance_sampling_ratio/mean": 1.0006637573242188, "sampling/importance_sampling_ratio/min": 0.9871603846549988, "sampling/sampling_logp_difference/max": 0.041791677474975586, "sampling/sampling_logp_difference/mean": 0.0009094587294384837, "step": 389, "step_time": 8.968553534999955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 483.0625, "completions/mean_terminated_length": 483.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.056709158699959517, "epoch": 0.0078, "frac_reward_zero_std": 0.25, "grad_norm": 0.22398704290390015, "kl": 2.1525469571352005, "learning_rate": 5.195180097741581e-05, "loss": 0.0119, "num_tokens": 17000238.0, "reward": 0.8056702613830566, "reward_std": 0.3804699778556824, "rewards/rollout_reward_func/mean": 0.8056702613830566, "rewards/rollout_reward_func/std": 0.48140308260917664, "sampling/importance_sampling_ratio/max": 1.1265255212783813, "sampling/importance_sampling_ratio/mean": 0.9981716275215149, "sampling/importance_sampling_ratio/min": 0.7988142371177673, "sampling/sampling_logp_difference/max": 0.22452545166015625, "sampling/sampling_logp_difference/mean": 0.003168161027133465, "step": 390, "step_time": 8.674602408000283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 419.71875, "completions/mean_terminated_length": 419.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05602260585874319, "epoch": 0.00782, "frac_reward_zero_std": 0.5, "grad_norm": 0.5230777263641357, "kl": 2.0062749087810516, "learning_rate": 5.1852668423691327e-05, "loss": 0.0064, "num_tokens": 17043186.0, "reward": 0.811331033706665, "reward_std": 0.3689987063407898, "rewards/rollout_reward_func/mean": 0.811331033706665, "rewards/rollout_reward_func/std": 0.5317072868347168, "sampling/importance_sampling_ratio/max": 1.1069998741149902, "sampling/importance_sampling_ratio/mean": 1.00136399269104, "sampling/importance_sampling_ratio/min": 0.9172760844230652, "sampling/sampling_logp_difference/max": 0.1003572940826416, "sampling/sampling_logp_difference/mean": 0.0017778960755094886, "step": 391, "step_time": 9.852701918001003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 483.3125, "completions/mean_terminated_length": 483.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05285251350142062, "epoch": 0.00784, "frac_reward_zero_std": 0.5, "grad_norm": 0.033940963447093964, "kl": 2.8637658953666687, "learning_rate": 5.175339471475777e-05, "loss": 0.0103, "num_tokens": 17087230.0, "reward": 0.8087888956069946, "reward_std": 0.3108760714530945, "rewards/rollout_reward_func/mean": 0.8087888956069946, "rewards/rollout_reward_func/std": 0.4758809804916382, "sampling/importance_sampling_ratio/max": 1.0030180215835571, "sampling/importance_sampling_ratio/mean": 0.9967939853668213, "sampling/importance_sampling_ratio/min": 0.9342618584632874, "sampling/sampling_logp_difference/max": 0.06729337573051453, "sampling/sampling_logp_difference/mean": 0.001022449927404523, "step": 392, "step_time": 11.088060791998942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 318.5625, "completions/mean_terminated_length": 318.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03639633631973993, "epoch": 0.00786, "frac_reward_zero_std": 0.25, "grad_norm": 0.4340112805366516, "kl": 2.1029968708753586, "learning_rate": 5.1653981272869535e-05, "loss": 0.004, "num_tokens": 17125371.0, "reward": 0.8658812046051025, "reward_std": 0.31437692046165466, "rewards/rollout_reward_func/mean": 0.8658812046051025, "rewards/rollout_reward_func/std": 0.36067435145378113, "sampling/importance_sampling_ratio/max": 1.0676647424697876, "sampling/importance_sampling_ratio/mean": 0.981918454170227, "sampling/importance_sampling_ratio/min": 0.537032961845398, "sampling/sampling_logp_difference/max": 0.7375483512878418, "sampling/sampling_logp_difference/mean": 0.009889071807265282, "step": 393, "step_time": 10.03844457699961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 386.75, "completions/mean_terminated_length": 386.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04168717376887798, "epoch": 0.00788, "frac_reward_zero_std": 0.25, "grad_norm": 0.028928883373737335, "kl": 2.696560651063919, "learning_rate": 5.155442952228292e-05, "loss": 0.0026, "num_tokens": 17167334.0, "reward": 0.8359758853912354, "reward_std": 0.39634260535240173, "rewards/rollout_reward_func/mean": 0.8359758853912354, "rewards/rollout_reward_func/std": 0.4587079882621765, "sampling/importance_sampling_ratio/max": 1.004974126815796, "sampling/importance_sampling_ratio/mean": 1.0002007484436035, "sampling/importance_sampling_ratio/min": 0.9977675080299377, "sampling/sampling_logp_difference/max": 0.0035619735717773438, "sampling/sampling_logp_difference/mean": 0.00017922441475093365, "step": 394, "step_time": 10.769969393998508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 459.09375, "completions/mean_terminated_length": 459.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03920790692791343, "epoch": 0.0079, "frac_reward_zero_std": 0.0, "grad_norm": 0.054728589951992035, "kl": 3.260742574930191, "learning_rate": 5.145474088923578e-05, "loss": 0.0102, "num_tokens": 17210629.0, "reward": 0.7471879124641418, "reward_std": 0.5483604669570923, "rewards/rollout_reward_func/mean": 0.7471879124641418, "rewards/rollout_reward_func/std": 0.5684433579444885, "sampling/importance_sampling_ratio/max": 1.0035241842269897, "sampling/importance_sampling_ratio/mean": 0.9952626824378967, "sampling/importance_sampling_ratio/min": 0.8827452659606934, "sampling/sampling_logp_difference/max": 0.12460803985595703, "sampling/sampling_logp_difference/mean": 0.0014425909612327814, "step": 395, "step_time": 10.712669459000608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 622.84375, "completions/mean_terminated_length": 622.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05399816553108394, "epoch": 0.00792, "frac_reward_zero_std": 0.5, "grad_norm": 0.05563470721244812, "kl": 2.883115381002426, "learning_rate": 5.1354916801926914e-05, "loss": 0.0164, "num_tokens": 17260873.0, "reward": 0.8401602506637573, "reward_std": 0.27836984395980835, "rewards/rollout_reward_func/mean": 0.8401602506637573, "rewards/rollout_reward_func/std": 0.45207148790359497, "sampling/importance_sampling_ratio/max": 1.1973391771316528, "sampling/importance_sampling_ratio/mean": 1.0020536184310913, "sampling/importance_sampling_ratio/min": 0.9340715408325195, "sampling/sampling_logp_difference/max": 0.181077241897583, "sampling/sampling_logp_difference/mean": 0.002426608931273222, "step": 396, "step_time": 10.690794087998256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 435.65625, "completions/mean_terminated_length": 435.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03011506819166243, "epoch": 0.00794, "frac_reward_zero_std": 0.75, "grad_norm": 0.09031786024570465, "kl": 1.9850663393735886, "learning_rate": 5.125495869049581e-05, "loss": 0.0075, "num_tokens": 17304011.0, "reward": 0.9315446615219116, "reward_std": 0.1267746239900589, "rewards/rollout_reward_func/mean": 0.9315446615219116, "rewards/rollout_reward_func/std": 0.2694025933742523, "sampling/importance_sampling_ratio/max": 1.1141501665115356, "sampling/importance_sampling_ratio/mean": 1.0034511089324951, "sampling/importance_sampling_ratio/min": 0.9927395582199097, "sampling/sampling_logp_difference/max": 0.10680627822875977, "sampling/sampling_logp_difference/mean": 0.0009597503812983632, "step": 397, "step_time": 9.383985855999526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 494.5, "completions/mean_terminated_length": 494.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04148736922070384, "epoch": 0.00796, "frac_reward_zero_std": 0.0, "grad_norm": 0.04616577923297882, "kl": 2.7397646605968475, "learning_rate": 5.115486798700204e-05, "loss": 0.0095, "num_tokens": 17350080.0, "reward": 0.784192681312561, "reward_std": 0.6103953123092651, "rewards/rollout_reward_func/mean": 0.784192681312561, "rewards/rollout_reward_func/std": 0.5953936576843262, "sampling/importance_sampling_ratio/max": 1.001572608947754, "sampling/importance_sampling_ratio/mean": 0.998132050037384, "sampling/importance_sampling_ratio/min": 0.9659138917922974, "sampling/sampling_logp_difference/max": 0.033608585596084595, "sampling/sampling_logp_difference/mean": 0.0005739482585340738, "step": 398, "step_time": 10.350869323998268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 411.03125, "completions/mean_terminated_length": 411.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04595591555698775, "epoch": 0.00798, "frac_reward_zero_std": 0.25, "grad_norm": 0.039590101689100266, "kl": 2.74935744702816, "learning_rate": 5.1054646125404776e-05, "loss": -0.0005, "num_tokens": 17392819.0, "reward": 0.5239495038986206, "reward_std": 0.6289160251617432, "rewards/rollout_reward_func/mean": 0.5239495038986206, "rewards/rollout_reward_func/std": 0.7626776099205017, "sampling/importance_sampling_ratio/max": 1.0366166830062866, "sampling/importance_sampling_ratio/mean": 0.9964383840560913, "sampling/importance_sampling_ratio/min": 0.9313643574714661, "sampling/sampling_logp_difference/max": 0.07104992866516113, "sampling/sampling_logp_difference/mean": 0.0017079797107726336, "step": 399, "step_time": 9.400446548000218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 566.71875, "completions/mean_terminated_length": 566.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05824896041303873, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.14086604118347168, "kl": 4.452244237065315, "learning_rate": 5.095429454154221e-05, "loss": 0.0097, "num_tokens": 17442423.0, "reward": 0.3994544446468353, "reward_std": 0.7716719508171082, "rewards/rollout_reward_func/mean": 0.3994544446468353, "rewards/rollout_reward_func/std": 0.7919886708259583, "sampling/importance_sampling_ratio/max": 1.43679678440094, "sampling/importance_sampling_ratio/mean": 1.0093997716903687, "sampling/importance_sampling_ratio/min": 0.9130212068557739, "sampling/sampling_logp_difference/max": 0.3615899085998535, "sampling/sampling_logp_difference/mean": 0.004695264622569084, "step": 400, "step_time": 10.60830179399909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 482.28125, "completions/mean_terminated_length": 482.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06891977449413389, "epoch": 0.00802, "frac_reward_zero_std": 0.0, "grad_norm": 0.1627579927444458, "kl": 3.6301970332860947, "learning_rate": 5.085381467311107e-05, "loss": 0.0042, "num_tokens": 17488421.0, "reward": 0.5661377906799316, "reward_std": 0.7971882224082947, "rewards/rollout_reward_func/mean": 0.5661377906799316, "rewards/rollout_reward_func/std": 0.7895565032958984, "sampling/importance_sampling_ratio/max": 1.2804324626922607, "sampling/importance_sampling_ratio/mean": 1.0090514421463013, "sampling/importance_sampling_ratio/min": 0.9282717108726501, "sampling/sampling_logp_difference/max": 0.2426772117614746, "sampling/sampling_logp_difference/mean": 0.004869070835411549, "step": 401, "step_time": 10.244721981999646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2301.0, "completions/max_terminated_length": 2301.0, "completions/mean_length": 537.3125, "completions/mean_terminated_length": 537.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.049134614382637665, "epoch": 0.00804, "frac_reward_zero_std": 0.5, "grad_norm": 0.12666580080986023, "kl": 3.574688032269478, "learning_rate": 5.0753207959645896e-05, "loss": 0.0168, "num_tokens": 17534593.0, "reward": 0.763367772102356, "reward_std": 0.43845945596694946, "rewards/rollout_reward_func/mean": 0.763367772102356, "rewards/rollout_reward_func/std": 0.6364938616752625, "sampling/importance_sampling_ratio/max": 1.0500099658966064, "sampling/importance_sampling_ratio/mean": 1.0017461776733398, "sampling/importance_sampling_ratio/min": 0.9850899577140808, "sampling/sampling_logp_difference/max": 0.04883575439453125, "sampling/sampling_logp_difference/mean": 0.001343111740425229, "step": 402, "step_time": 12.147635143997832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 420.3125, "completions/mean_terminated_length": 420.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05475027102511376, "epoch": 0.00806, "frac_reward_zero_std": 0.25, "grad_norm": 0.16998182237148285, "kl": 3.353247657418251, "learning_rate": 5.0652475842498524e-05, "loss": 0.0056, "num_tokens": 17576340.0, "reward": 0.7537018060684204, "reward_std": 0.5761569142341614, "rewards/rollout_reward_func/mean": 0.7537018060684204, "rewards/rollout_reward_func/std": 0.6620924472808838, "sampling/importance_sampling_ratio/max": 1.1177271604537964, "sampling/importance_sampling_ratio/mean": 1.000673770904541, "sampling/importance_sampling_ratio/min": 0.9412984848022461, "sampling/sampling_logp_difference/max": 0.12233161926269531, "sampling/sampling_logp_difference/mean": 0.002113380003720522, "step": 403, "step_time": 13.30420248000246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 531.90625, "completions/mean_terminated_length": 531.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04453609534539282, "epoch": 0.00808, "frac_reward_zero_std": 0.5, "grad_norm": 0.15495382249355316, "kl": 2.506533160805702, "learning_rate": 5.0551619764817406e-05, "loss": 0.011, "num_tokens": 17624161.0, "reward": 0.9060662984848022, "reward_std": 0.26568469405174255, "rewards/rollout_reward_func/mean": 0.9060662984848022, "rewards/rollout_reward_func/std": 0.3877139091491699, "sampling/importance_sampling_ratio/max": 1.0028026103973389, "sampling/importance_sampling_ratio/mean": 0.9951510429382324, "sampling/importance_sampling_ratio/min": 0.9186420440673828, "sampling/sampling_logp_difference/max": 0.08579421043395996, "sampling/sampling_logp_difference/mean": 0.0012623604852706194, "step": 404, "step_time": 10.387093815000298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 568.75, "completions/mean_terminated_length": 568.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07428141112904996, "epoch": 0.0081, "frac_reward_zero_std": 0.25, "grad_norm": 0.0668555423617363, "kl": 3.4521894603967667, "learning_rate": 5.045064117152691e-05, "loss": 0.02, "num_tokens": 17672066.0, "reward": 0.6329009532928467, "reward_std": 0.5616664886474609, "rewards/rollout_reward_func/mean": 0.6329009532928467, "rewards/rollout_reward_func/std": 0.7261850237846375, "sampling/importance_sampling_ratio/max": 1.053938865661621, "sampling/importance_sampling_ratio/mean": 1.000809907913208, "sampling/importance_sampling_ratio/min": 0.9556564092636108, "sampling/sampling_logp_difference/max": 0.045360952615737915, "sampling/sampling_logp_difference/mean": 0.0012809421168640256, "step": 405, "step_time": 10.997084028000245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 543.25, "completions/mean_terminated_length": 543.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05168547644279897, "epoch": 0.00812, "frac_reward_zero_std": 0.5, "grad_norm": 0.008325683884322643, "kl": 2.9207785725593567, "learning_rate": 5.0349541509306635e-05, "loss": 0.0027, "num_tokens": 17719819.0, "reward": 0.8452123403549194, "reward_std": 0.35655277967453003, "rewards/rollout_reward_func/mean": 0.8452123403549194, "rewards/rollout_reward_func/std": 0.506984531879425, "sampling/importance_sampling_ratio/max": 1.1287031173706055, "sampling/importance_sampling_ratio/mean": 0.9985804557800293, "sampling/importance_sampling_ratio/min": 0.9090589284896851, "sampling/sampling_logp_difference/max": 0.12104225158691406, "sampling/sampling_logp_difference/mean": 0.002230389742180705, "step": 406, "step_time": 10.214301088001775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1703.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 603.1875, "completions/mean_terminated_length": 603.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05644200398819521, "epoch": 0.00814, "frac_reward_zero_std": 0.25, "grad_norm": 0.052547357976436615, "kl": 3.183659017086029, "learning_rate": 5.024832222657068e-05, "loss": -0.0041, "num_tokens": 17769553.0, "reward": 0.6898912191390991, "reward_std": 0.5804780721664429, "rewards/rollout_reward_func/mean": 0.6898912191390991, "rewards/rollout_reward_func/std": 0.6814175844192505, "sampling/importance_sampling_ratio/max": 1.028568983078003, "sampling/importance_sampling_ratio/mean": 0.9956703186035156, "sampling/importance_sampling_ratio/min": 0.8838797211647034, "sampling/sampling_logp_difference/max": 0.12351095676422119, "sampling/sampling_logp_difference/mean": 0.0017295870929956436, "step": 407, "step_time": 10.935625626000729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 521.875, "completions/mean_terminated_length": 521.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.13349051028490067, "epoch": 0.00816, "frac_reward_zero_std": 0.5, "grad_norm": 0.3847309947013855, "kl": 2.8007897809147835, "learning_rate": 5.0146984773446935e-05, "loss": 0.0178, "num_tokens": 17815885.0, "reward": 0.7825921773910522, "reward_std": 0.4114615321159363, "rewards/rollout_reward_func/mean": 0.7825921773910522, "rewards/rollout_reward_func/std": 0.5987290143966675, "sampling/importance_sampling_ratio/max": 1.5589492321014404, "sampling/importance_sampling_ratio/mean": 1.0031771659851074, "sampling/importance_sampling_ratio/min": 0.731090247631073, "sampling/sampling_logp_difference/max": 0.24788999557495117, "sampling/sampling_logp_difference/mean": 0.00780561612918973, "step": 408, "step_time": 13.05891415699898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1754.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 532.75, "completions/mean_terminated_length": 532.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03323867090512067, "epoch": 0.00818, "frac_reward_zero_std": 0.0, "grad_norm": 0.08420190960168839, "kl": 2.733386591076851, "learning_rate": 5.004553060175623e-05, "loss": 0.0082, "num_tokens": 17862323.0, "reward": 0.7779996395111084, "reward_std": 0.5471538305282593, "rewards/rollout_reward_func/mean": 0.7779996395111084, "rewards/rollout_reward_func/std": 0.5501754879951477, "sampling/importance_sampling_ratio/max": 1.0599555969238281, "sampling/importance_sampling_ratio/mean": 1.001774549484253, "sampling/importance_sampling_ratio/min": 0.9912304878234863, "sampling/sampling_logp_difference/max": 0.05834603309631348, "sampling/sampling_logp_difference/mean": 0.000770647544413805, "step": 409, "step_time": 11.148350217998996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 529.375, "completions/mean_terminated_length": 529.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.046081166714429855, "epoch": 0.0082, "frac_reward_zero_std": 0.0, "grad_norm": 0.06420991569757462, "kl": 3.234129950404167, "learning_rate": 4.9943961164991605e-05, "loss": 0.0002, "num_tokens": 17910580.0, "reward": 0.6315737962722778, "reward_std": 0.7627012133598328, "rewards/rollout_reward_func/mean": 0.6315737962722778, "rewards/rollout_reward_func/std": 0.734076976776123, "sampling/importance_sampling_ratio/max": 1.0961934328079224, "sampling/importance_sampling_ratio/mean": 1.001449704170227, "sampling/importance_sampling_ratio/min": 0.9445170164108276, "sampling/sampling_logp_difference/max": 0.09253978729248047, "sampling/sampling_logp_difference/mean": 0.0013627447187900543, "step": 410, "step_time": 11.407127365000633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 483.53125, "completions/mean_terminated_length": 483.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04266157676465809, "epoch": 0.00822, "frac_reward_zero_std": 0.0, "grad_norm": 0.19061382114887238, "kl": 3.1843847185373306, "learning_rate": 4.98422779182974e-05, "loss": -0.0014, "num_tokens": 17956454.0, "reward": 0.5017520189285278, "reward_std": 0.823966920375824, "rewards/rollout_reward_func/mean": 0.5017520189285278, "rewards/rollout_reward_func/std": 0.8323329091072083, "sampling/importance_sampling_ratio/max": 1.2702239751815796, "sampling/importance_sampling_ratio/mean": 1.004717469215393, "sampling/importance_sampling_ratio/min": 0.8982879519462585, "sampling/sampling_logp_difference/max": 0.23906564712524414, "sampling/sampling_logp_difference/mean": 0.0035672462545335293, "step": 411, "step_time": 11.034947409000779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 453.09375, "completions/mean_terminated_length": 453.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03937590476562036, "epoch": 0.00824, "frac_reward_zero_std": 0.5, "grad_norm": 0.04477764293551445, "kl": 2.8682167381048203, "learning_rate": 4.974048231844853e-05, "loss": 0.0129, "num_tokens": 18000065.0, "reward": 0.9054579734802246, "reward_std": 0.2674051821231842, "rewards/rollout_reward_func/mean": 0.9054579734802246, "rewards/rollout_reward_func/std": 0.3884824812412262, "sampling/importance_sampling_ratio/max": 1.0560126304626465, "sampling/importance_sampling_ratio/mean": 0.9999554753303528, "sampling/importance_sampling_ratio/min": 0.9531592130661011, "sampling/sampling_logp_difference/max": 0.054205894470214844, "sampling/sampling_logp_difference/mean": 0.0008945210138335824, "step": 412, "step_time": 10.938593682999453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 549.59375, "completions/mean_terminated_length": 549.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03823628631653264, "epoch": 0.00826, "frac_reward_zero_std": 0.25, "grad_norm": 0.36858147382736206, "kl": 2.65264855325222, "learning_rate": 4.963857582382952e-05, "loss": 0.0094, "num_tokens": 18048510.0, "reward": 0.870212197303772, "reward_std": 0.36709535121917725, "rewards/rollout_reward_func/mean": 0.870212197303772, "rewards/rollout_reward_func/std": 0.431301474571228, "sampling/importance_sampling_ratio/max": 1.2578123807907104, "sampling/importance_sampling_ratio/mean": 1.0115666389465332, "sampling/importance_sampling_ratio/min": 0.9936754107475281, "sampling/sampling_logp_difference/max": 0.22953081130981445, "sampling/sampling_logp_difference/mean": 0.003178003942593932, "step": 413, "step_time": 11.845692206999956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 520.71875, "completions/mean_terminated_length": 520.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.040113069699145854, "epoch": 0.00828, "frac_reward_zero_std": 0.25, "grad_norm": 0.04158441722393036, "kl": 3.2289950996637344, "learning_rate": 4.953655989441365e-05, "loss": 0.0042, "num_tokens": 18094100.0, "reward": 0.6831515431404114, "reward_std": 0.5123870372772217, "rewards/rollout_reward_func/mean": 0.6831515431404114, "rewards/rollout_reward_func/std": 0.643218457698822, "sampling/importance_sampling_ratio/max": 1.0045183897018433, "sampling/importance_sampling_ratio/mean": 0.9995185136795044, "sampling/importance_sampling_ratio/min": 0.9850382804870605, "sampling/sampling_logp_difference/max": 0.013384461402893066, "sampling/sampling_logp_difference/mean": 0.00028201824170537293, "step": 414, "step_time": 11.335754293000718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 472.59375, "completions/mean_terminated_length": 472.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.042063588596647605, "epoch": 0.0083, "frac_reward_zero_std": 0.0, "grad_norm": 0.10922200232744217, "kl": 3.1381163001060486, "learning_rate": 4.9434435991742e-05, "loss": 0.0155, "num_tokens": 18138941.0, "reward": 0.5602551102638245, "reward_std": 0.6574921607971191, "rewards/rollout_reward_func/mean": 0.5602551102638245, "rewards/rollout_reward_func/std": 0.7077726721763611, "sampling/importance_sampling_ratio/max": 1.0559992790222168, "sampling/importance_sampling_ratio/mean": 1.0013502836227417, "sampling/importance_sampling_ratio/min": 0.9567334055900574, "sampling/sampling_logp_difference/max": 0.05621218681335449, "sampling/sampling_logp_difference/mean": 0.0011692214757204056, "step": 415, "step_time": 10.775895928001773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 572.5, "completions/mean_terminated_length": 572.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05806278460659087, "epoch": 0.00832, "frac_reward_zero_std": 0.0, "grad_norm": 0.17052417993545532, "kl": 2.5742068588733673, "learning_rate": 4.933220557890258e-05, "loss": 0.0165, "num_tokens": 18188267.0, "reward": 0.5104025602340698, "reward_std": 0.6343930959701538, "rewards/rollout_reward_func/mean": 0.5104025602340698, "rewards/rollout_reward_func/std": 0.632972002029419, "sampling/importance_sampling_ratio/max": 1.068711280822754, "sampling/importance_sampling_ratio/mean": 0.9924192428588867, "sampling/importance_sampling_ratio/min": 0.6811312437057495, "sampling/sampling_logp_difference/max": 0.3846132755279541, "sampling/sampling_logp_difference/mean": 0.004100108984857798, "step": 416, "step_time": 11.054924111999753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 616.15625, "completions/mean_terminated_length": 616.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03868865536423982, "epoch": 0.00834, "frac_reward_zero_std": 0.25, "grad_norm": 0.07223250716924667, "kl": 2.6622893139719963, "learning_rate": 4.9229870120509314e-05, "loss": 0.018, "num_tokens": 18238036.0, "reward": 0.8754406571388245, "reward_std": 0.35230696201324463, "rewards/rollout_reward_func/mean": 0.8754406571388245, "rewards/rollout_reward_func/std": 0.4131568968296051, "sampling/importance_sampling_ratio/max": 1.0744590759277344, "sampling/importance_sampling_ratio/mean": 1.0039677619934082, "sampling/importance_sampling_ratio/min": 0.995751142501831, "sampling/sampling_logp_difference/max": 0.07116180658340454, "sampling/sampling_logp_difference/mean": 0.0009496605489403009, "step": 417, "step_time": 14.343221156001164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 516.25, "completions/mean_terminated_length": 516.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.02814579417463392, "epoch": 0.00836, "frac_reward_zero_std": 0.25, "grad_norm": 0.07847806066274643, "kl": 2.758747398853302, "learning_rate": 4.9127431082681025e-05, "loss": 0.0036, "num_tokens": 18285537.0, "reward": 0.7700939178466797, "reward_std": 0.4092877507209778, "rewards/rollout_reward_func/mean": 0.7700939178466797, "rewards/rollout_reward_func/std": 0.5051968693733215, "sampling/importance_sampling_ratio/max": 1.0822787284851074, "sampling/importance_sampling_ratio/mean": 1.0013763904571533, "sampling/importance_sampling_ratio/min": 0.9713001847267151, "sampling/sampling_logp_difference/max": 0.07936012744903564, "sampling/sampling_logp_difference/mean": 0.0011164476163685322, "step": 418, "step_time": 11.580372232997433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 552.8125, "completions/mean_terminated_length": 552.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04476395878009498, "epoch": 0.00838, "frac_reward_zero_std": 0.25, "grad_norm": 0.10174679756164551, "kl": 3.537159949541092, "learning_rate": 4.902488993302056e-05, "loss": -0.0002, "num_tokens": 18332645.0, "reward": 0.6872170567512512, "reward_std": 0.5793100595474243, "rewards/rollout_reward_func/mean": 0.6872170567512512, "rewards/rollout_reward_func/std": 0.6852321624755859, "sampling/importance_sampling_ratio/max": 1.269108533859253, "sampling/importance_sampling_ratio/mean": 1.0035206079483032, "sampling/importance_sampling_ratio/min": 0.9079466462135315, "sampling/sampling_logp_difference/max": 0.2402191162109375, "sampling/sampling_logp_difference/mean": 0.0031376080587506294, "step": 419, "step_time": 12.961348481000641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 522.0625, "completions/mean_terminated_length": 522.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.051717766327783465, "epoch": 0.0084, "frac_reward_zero_std": 0.25, "grad_norm": 0.15665334463119507, "kl": 3.5025094151496887, "learning_rate": 4.8922248140593614e-05, "loss": 0.0124, "num_tokens": 18379120.0, "reward": 0.6749807000160217, "reward_std": 0.5148226022720337, "rewards/rollout_reward_func/mean": 0.6749807000160217, "rewards/rollout_reward_func/std": 0.6040694713592529, "sampling/importance_sampling_ratio/max": 1.1329504251480103, "sampling/importance_sampling_ratio/mean": 1.011215329170227, "sampling/importance_sampling_ratio/min": 0.9833350777626038, "sampling/sampling_logp_difference/max": 0.12460994720458984, "sampling/sampling_logp_difference/mean": 0.003464988898485899, "step": 420, "step_time": 10.555516138999337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3204.0, "completions/max_terminated_length": 3204.0, "completions/mean_length": 530.15625, "completions/mean_terminated_length": 530.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05435227998532355, "epoch": 0.00842, "frac_reward_zero_std": 0.0, "grad_norm": 0.1098443940281868, "kl": 2.8946159332990646, "learning_rate": 4.8819507175907765e-05, "loss": -0.0058, "num_tokens": 18426174.0, "reward": 0.6513627767562866, "reward_std": 0.7165068984031677, "rewards/rollout_reward_func/mean": 0.6513627767562866, "rewards/rollout_reward_func/std": 0.7009936571121216, "sampling/importance_sampling_ratio/max": 1.077221155166626, "sampling/importance_sampling_ratio/mean": 0.9887630939483643, "sampling/importance_sampling_ratio/min": 0.648464560508728, "sampling/sampling_logp_difference/max": 0.3488302230834961, "sampling/sampling_logp_difference/mean": 0.005786615889519453, "step": 421, "step_time": 14.78014258299936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1677.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 566.84375, "completions/mean_terminated_length": 566.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03524791239760816, "epoch": 0.00844, "frac_reward_zero_std": 0.0, "grad_norm": 0.058017633855342865, "kl": 3.468432128429413, "learning_rate": 4.871666851089143e-05, "loss": 0.011, "num_tokens": 18475483.0, "reward": 0.6263853311538696, "reward_std": 0.6214660406112671, "rewards/rollout_reward_func/mean": 0.6263853311538696, "rewards/rollout_reward_func/std": 0.6942214369773865, "sampling/importance_sampling_ratio/max": 1.1307822465896606, "sampling/importance_sampling_ratio/mean": 1.0041851997375488, "sampling/importance_sampling_ratio/min": 0.9453223943710327, "sampling/sampling_logp_difference/max": 0.12291097640991211, "sampling/sampling_logp_difference/mean": 0.0018906937912106514, "step": 422, "step_time": 11.67521885899805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 420.09375, "completions/mean_terminated_length": 420.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03484233655035496, "epoch": 0.00846, "frac_reward_zero_std": 0.25, "grad_norm": 0.08803056180477142, "kl": 2.9549446254968643, "learning_rate": 4.861373361887271e-05, "loss": 0.0063, "num_tokens": 18517397.0, "reward": 0.6308823823928833, "reward_std": 0.6078281998634338, "rewards/rollout_reward_func/mean": 0.6308823823928833, "rewards/rollout_reward_func/std": 0.730872392654419, "sampling/importance_sampling_ratio/max": 1.0358366966247559, "sampling/importance_sampling_ratio/mean": 0.9936826229095459, "sampling/importance_sampling_ratio/min": 0.8559401631355286, "sampling/sampling_logp_difference/max": 0.15558326244354248, "sampling/sampling_logp_difference/mean": 0.0026444962713867426, "step": 423, "step_time": 10.865016606000609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 582.15625, "completions/mean_terminated_length": 582.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03812119702342898, "epoch": 0.00848, "frac_reward_zero_std": 0.5, "grad_norm": 0.3214189112186432, "kl": 3.1614930331707, "learning_rate": 4.851070397455828e-05, "loss": 0.0139, "num_tokens": 18566042.0, "reward": 0.8706567287445068, "reward_std": 0.28255900740623474, "rewards/rollout_reward_func/mean": 0.8706567287445068, "rewards/rollout_reward_func/std": 0.43029284477233887, "sampling/importance_sampling_ratio/max": 1.1306122541427612, "sampling/importance_sampling_ratio/mean": 1.007481336593628, "sampling/importance_sampling_ratio/min": 0.9789738059043884, "sampling/sampling_logp_difference/max": 0.12067723274230957, "sampling/sampling_logp_difference/mean": 0.00218680570833385, "step": 424, "step_time": 12.439325779999308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 252.59375, "completions/mean_terminated_length": 252.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.030778798507526517, "epoch": 0.0085, "frac_reward_zero_std": 0.0, "grad_norm": 0.13232366740703583, "kl": 2.1830327808856964, "learning_rate": 4.8407581054012345e-05, "loss": -0.0041, "num_tokens": 18601880.0, "reward": 0.7456193566322327, "reward_std": 0.6520976424217224, "rewards/rollout_reward_func/mean": 0.7456193566322327, "rewards/rollout_reward_func/std": 0.6246916055679321, "sampling/importance_sampling_ratio/max": 1.0018792152404785, "sampling/importance_sampling_ratio/mean": 0.9954652786254883, "sampling/importance_sampling_ratio/min": 0.8497623801231384, "sampling/sampling_logp_difference/max": 0.16280317306518555, "sampling/sampling_logp_difference/mean": 0.0015985585050657392, "step": 425, "step_time": 9.038265468002464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 477.96875, "completions/mean_terminated_length": 477.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.030544159177225083, "epoch": 0.00852, "frac_reward_zero_std": 0.25, "grad_norm": 0.047146275639534, "kl": 2.986989066004753, "learning_rate": 4.830436633463543e-05, "loss": 0.0045, "num_tokens": 18646260.0, "reward": 0.6620129346847534, "reward_std": 0.5865254402160645, "rewards/rollout_reward_func/mean": 0.6620129346847534, "rewards/rollout_reward_func/std": 0.7274832725524902, "sampling/importance_sampling_ratio/max": 1.0152007341384888, "sampling/importance_sampling_ratio/mean": 0.9923912286758423, "sampling/importance_sampling_ratio/min": 0.8534053564071655, "sampling/sampling_logp_difference/max": 0.15852922201156616, "sampling/sampling_logp_difference/mean": 0.0024876855313777924, "step": 426, "step_time": 10.79181594900001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2491.0, "completions/max_terminated_length": 2491.0, "completions/mean_length": 596.3125, "completions/mean_terminated_length": 596.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03716866980539635, "epoch": 0.00854, "frac_reward_zero_std": 0.0, "grad_norm": 0.14214979112148285, "kl": 2.5316568166017532, "learning_rate": 4.82010612951432e-05, "loss": 0.0073, "num_tokens": 18695685.0, "reward": 0.6298165321350098, "reward_std": 0.531517505645752, "rewards/rollout_reward_func/mean": 0.6298165321350098, "rewards/rollout_reward_func/std": 0.5201346278190613, "sampling/importance_sampling_ratio/max": 1.2336033582687378, "sampling/importance_sampling_ratio/mean": 1.0095372200012207, "sampling/importance_sampling_ratio/min": 0.8670953512191772, "sampling/sampling_logp_difference/max": 0.21078252792358398, "sampling/sampling_logp_difference/mean": 0.003911780659109354, "step": 427, "step_time": 14.101740054999937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 596.28125, "completions/mean_terminated_length": 596.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.02542822720715776, "epoch": 0.00856, "frac_reward_zero_std": 0.5, "grad_norm": 0.12764447927474976, "kl": 3.2227740585803986, "learning_rate": 4.809766741554531e-05, "loss": 0.0104, "num_tokens": 18744827.0, "reward": 0.8703343868255615, "reward_std": 0.2824818789958954, "rewards/rollout_reward_func/mean": 0.8703343868255615, "rewards/rollout_reward_func/std": 0.4296819269657135, "sampling/importance_sampling_ratio/max": 1.0827027559280396, "sampling/importance_sampling_ratio/mean": 1.002096176147461, "sampling/importance_sampling_ratio/min": 0.9758991599082947, "sampling/sampling_logp_difference/max": 0.07962322235107422, "sampling/sampling_logp_difference/mean": 0.0008393897442147136, "step": 428, "step_time": 10.91370998399998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 399.90625, "completions/mean_terminated_length": 399.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03370183450897457, "epoch": 0.00858, "frac_reward_zero_std": 0.0, "grad_norm": 0.11236058175563812, "kl": 2.73375540971756, "learning_rate": 4.799418617712422e-05, "loss": 0.0071, "num_tokens": 18786321.0, "reward": 0.764726459980011, "reward_std": 0.46985486149787903, "rewards/rollout_reward_func/mean": 0.764726459980011, "rewards/rollout_reward_func/std": 0.4523528814315796, "sampling/importance_sampling_ratio/max": 1.159703254699707, "sampling/importance_sampling_ratio/mean": 0.9987660646438599, "sampling/importance_sampling_ratio/min": 0.7789628505706787, "sampling/sampling_logp_difference/max": 0.2517436742782593, "sampling/sampling_logp_difference/mean": 0.0033441116102039814, "step": 429, "step_time": 11.019094012000096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 420.875, "completions/mean_terminated_length": 420.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06790748133789748, "epoch": 0.0086, "frac_reward_zero_std": 0.0, "grad_norm": 0.27044451236724854, "kl": 2.726489320397377, "learning_rate": 4.789061906241391e-05, "loss": 0.0042, "num_tokens": 18830149.0, "reward": 0.44548195600509644, "reward_std": 0.8931037187576294, "rewards/rollout_reward_func/mean": 0.44548195600509644, "rewards/rollout_reward_func/std": 0.8605241179466248, "sampling/importance_sampling_ratio/max": 1.5061999559402466, "sampling/importance_sampling_ratio/mean": 1.001663327217102, "sampling/importance_sampling_ratio/min": 0.7956005334854126, "sampling/sampling_logp_difference/max": 0.41135549545288086, "sampling/sampling_logp_difference/mean": 0.010585549287497997, "step": 430, "step_time": 9.907745999999861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2214.0, "completions/max_terminated_length": 2214.0, "completions/mean_length": 578.34375, "completions/mean_terminated_length": 578.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03581615863367915, "epoch": 0.00862, "frac_reward_zero_std": 0.25, "grad_norm": 0.02808118239045143, "kl": 3.1205672323703766, "learning_rate": 4.778696755517868e-05, "loss": 0.006, "num_tokens": 18879869.0, "reward": 0.7083966732025146, "reward_std": 0.4993551969528198, "rewards/rollout_reward_func/mean": 0.7083966732025146, "rewards/rollout_reward_func/std": 0.5899167656898499, "sampling/importance_sampling_ratio/max": 1.0056155920028687, "sampling/importance_sampling_ratio/mean": 0.9969238638877869, "sampling/importance_sampling_ratio/min": 0.9445633292198181, "sampling/sampling_logp_difference/max": 0.05856245756149292, "sampling/sampling_logp_difference/mean": 0.0008248141384683549, "step": 431, "step_time": 12.989676088000124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2387.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 514.78125, "completions/mean_terminated_length": 514.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.023147215833887458, "epoch": 0.00864, "frac_reward_zero_std": 0.5, "grad_norm": 0.0016452278941869736, "kl": 2.6944378688931465, "learning_rate": 4.7683233140391904e-05, "loss": 0.0086, "num_tokens": 18926017.0, "reward": 0.8974900245666504, "reward_std": 0.2237691581249237, "rewards/rollout_reward_func/mean": 0.8974900245666504, "rewards/rollout_reward_func/std": 0.32385650277137756, "sampling/importance_sampling_ratio/max": 1.0184943675994873, "sampling/importance_sampling_ratio/mean": 1.0008652210235596, "sampling/importance_sampling_ratio/min": 0.9834959506988525, "sampling/sampling_logp_difference/max": 0.016917962580919266, "sampling/sampling_logp_difference/mean": 0.0005482233827933669, "step": 432, "step_time": 12.769611531000919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2654.0, "completions/max_terminated_length": 2654.0, "completions/mean_length": 395.0, "completions/mean_terminated_length": 395.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03291955997701734, "epoch": 0.00866, "frac_reward_zero_std": 0.25, "grad_norm": 0.13838225603103638, "kl": 2.05167418718338, "learning_rate": 4.757941730421472e-05, "loss": -0.0028, "num_tokens": 18968228.0, "reward": 0.6453250050544739, "reward_std": 0.5224488973617554, "rewards/rollout_reward_func/mean": 0.6453250050544739, "rewards/rollout_reward_func/std": 0.6570422053337097, "sampling/importance_sampling_ratio/max": 1.0547699928283691, "sampling/importance_sampling_ratio/mean": 1.0004850625991821, "sampling/importance_sampling_ratio/min": 0.963299036026001, "sampling/sampling_logp_difference/max": 0.1190037727355957, "sampling/sampling_logp_difference/mean": 0.001750977709889412, "step": 433, "step_time": 12.890579271000206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 682.15625, "completions/mean_terminated_length": 682.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04121532663702965, "epoch": 0.00868, "frac_reward_zero_std": 0.25, "grad_norm": 0.13351118564605713, "kl": 3.024275779724121, "learning_rate": 4.747552153397478e-05, "loss": 0.0016, "num_tokens": 19021539.0, "reward": 0.8348984718322754, "reward_std": 0.39990177750587463, "rewards/rollout_reward_func/mean": 0.8348984718322754, "rewards/rollout_reward_func/std": 0.4636007249355316, "sampling/importance_sampling_ratio/max": 1.095992088317871, "sampling/importance_sampling_ratio/mean": 1.0014103651046753, "sampling/importance_sampling_ratio/min": 0.9537106156349182, "sampling/sampling_logp_difference/max": 0.09151935577392578, "sampling/sampling_logp_difference/mean": 0.0016988199204206467, "step": 434, "step_time": 12.804109894999783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 376.0, "completions/mean_terminated_length": 376.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03257650852901861, "epoch": 0.0087, "frac_reward_zero_std": 0.25, "grad_norm": 0.0528523214161396, "kl": 2.9649127423763275, "learning_rate": 4.7371547318144884e-05, "loss": -0.0015, "num_tokens": 19063085.0, "reward": 0.6094141602516174, "reward_std": 0.5629338026046753, "rewards/rollout_reward_func/mean": 0.6094141602516174, "rewards/rollout_reward_func/std": 0.6724923849105835, "sampling/importance_sampling_ratio/max": 1.0673671960830688, "sampling/importance_sampling_ratio/mean": 0.9932001233100891, "sampling/importance_sampling_ratio/min": 0.8756542205810547, "sampling/sampling_logp_difference/max": 0.13279080390930176, "sampling/sampling_logp_difference/mean": 0.0028956341557204723, "step": 435, "step_time": 9.858207871001468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 330.34375, "completions/mean_terminated_length": 330.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.027102177555207163, "epoch": 0.00872, "frac_reward_zero_std": 0.25, "grad_norm": 0.10245528072118759, "kl": 2.533746838569641, "learning_rate": 4.726749614632175e-05, "loss": 0.0126, "num_tokens": 19102121.0, "reward": 0.8053264617919922, "reward_std": 0.4018818438053131, "rewards/rollout_reward_func/mean": 0.8053264617919922, "rewards/rollout_reward_func/std": 0.48116177320480347, "sampling/importance_sampling_ratio/max": 1.0624687671661377, "sampling/importance_sampling_ratio/mean": 0.9994924664497375, "sampling/importance_sampling_ratio/min": 0.9394923448562622, "sampling/sampling_logp_difference/max": 0.0802384614944458, "sampling/sampling_logp_difference/mean": 0.001435074838809669, "step": 436, "step_time": 10.457038900999578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 510.34375, "completions/mean_terminated_length": 510.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04157943022437394, "epoch": 0.00874, "frac_reward_zero_std": 0.25, "grad_norm": 0.20627737045288086, "kl": 3.1010220497846603, "learning_rate": 4.716336950920453e-05, "loss": 0.0129, "num_tokens": 19148882.0, "reward": 0.7370699644088745, "reward_std": 0.44672614336013794, "rewards/rollout_reward_func/mean": 0.7370699644088745, "rewards/rollout_reward_func/std": 0.5245174765586853, "sampling/importance_sampling_ratio/max": 1.253645420074463, "sampling/importance_sampling_ratio/mean": 1.009002685546875, "sampling/importance_sampling_ratio/min": 0.9031577706336975, "sampling/sampling_logp_difference/max": 0.22610974311828613, "sampling/sampling_logp_difference/mean": 0.004202447831630707, "step": 437, "step_time": 11.215141176002362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2112.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 846.875, "completions/mean_terminated_length": 846.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.042712511494755745, "epoch": 0.00876, "frac_reward_zero_std": 0.25, "grad_norm": 0.09678380191326141, "kl": 4.250350624322891, "learning_rate": 4.705916889857362e-05, "loss": 0.0232, "num_tokens": 19208269.0, "reward": 0.6882401704788208, "reward_std": 0.5260798931121826, "rewards/rollout_reward_func/mean": 0.6882401704788208, "rewards/rollout_reward_func/std": 0.6332024931907654, "sampling/importance_sampling_ratio/max": 1.123752236366272, "sampling/importance_sampling_ratio/mean": 0.9821652173995972, "sampling/importance_sampling_ratio/min": 0.3737512528896332, "sampling/sampling_logp_difference/max": 0.9833517074584961, "sampling/sampling_logp_difference/mean": 0.007951472885906696, "step": 438, "step_time": 12.491636318000928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 564.0625, "completions/mean_terminated_length": 564.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07348601729609072, "epoch": 0.00878, "frac_reward_zero_std": 0.0, "grad_norm": 0.4050951898097992, "kl": 2.9341110289096832, "learning_rate": 4.695489580726914e-05, "loss": 0.018, "num_tokens": 19256504.0, "reward": 0.7451245784759521, "reward_std": 0.5545591115951538, "rewards/rollout_reward_func/mean": 0.7451245784759521, "rewards/rollout_reward_func/std": 0.5711086392402649, "sampling/importance_sampling_ratio/max": 1.1542307138442993, "sampling/importance_sampling_ratio/mean": 0.9957526922225952, "sampling/importance_sampling_ratio/min": 0.8767642974853516, "sampling/sampling_logp_difference/max": 0.14452219009399414, "sampling/sampling_logp_difference/mean": 0.003196367295458913, "step": 439, "step_time": 10.674699662000421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2695.0, "completions/max_terminated_length": 2695.0, "completions/mean_length": 630.125, "completions/mean_terminated_length": 630.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04846879933029413, "epoch": 0.0088, "frac_reward_zero_std": 0.0, "grad_norm": 0.2613735795021057, "kl": 3.76412770152092, "learning_rate": 4.6850551729169655e-05, "loss": 0.01, "num_tokens": 19307029.0, "reward": 0.5630495548248291, "reward_std": 0.7139593362808228, "rewards/rollout_reward_func/mean": 0.5630495548248291, "rewards/rollout_reward_func/std": 0.7489292025566101, "sampling/importance_sampling_ratio/max": 1.0243101119995117, "sampling/importance_sampling_ratio/mean": 0.9958807229995728, "sampling/importance_sampling_ratio/min": 0.8826023936271667, "sampling/sampling_logp_difference/max": 0.17634260654449463, "sampling/sampling_logp_difference/mean": 0.0028486924711614847, "step": 440, "step_time": 14.086478408999938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 532.0625, "completions/mean_terminated_length": 532.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.040492522763088346, "epoch": 0.00882, "frac_reward_zero_std": 0.25, "grad_norm": 0.11826809495687485, "kl": 2.266867935657501, "learning_rate": 4.674613815917071e-05, "loss": 0.0138, "num_tokens": 19354394.0, "reward": 0.8392568826675415, "reward_std": 0.3704586625099182, "rewards/rollout_reward_func/mean": 0.8392568826675415, "rewards/rollout_reward_func/std": 0.4547467827796936, "sampling/importance_sampling_ratio/max": 1.224727749824524, "sampling/importance_sampling_ratio/mean": 1.006321907043457, "sampling/importance_sampling_ratio/min": 0.9508136510848999, "sampling/sampling_logp_difference/max": 0.19948458671569824, "sampling/sampling_logp_difference/mean": 0.003916104789823294, "step": 441, "step_time": 11.628359573002854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1682.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 545.78125, "completions/mean_terminated_length": 545.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07084107887931168, "epoch": 0.00884, "frac_reward_zero_std": 0.0, "grad_norm": 0.2894895374774933, "kl": 2.952515572309494, "learning_rate": 4.664165659316343e-05, "loss": 0.0072, "num_tokens": 19401690.0, "reward": 0.7437584400177002, "reward_std": 0.54161137342453, "rewards/rollout_reward_func/mean": 0.7437584400177002, "rewards/rollout_reward_func/std": 0.5714100003242493, "sampling/importance_sampling_ratio/max": 1.2089804410934448, "sampling/importance_sampling_ratio/mean": 0.9921427369117737, "sampling/importance_sampling_ratio/min": 0.7944735884666443, "sampling/sampling_logp_difference/max": 0.22868013381958008, "sampling/sampling_logp_difference/mean": 0.00997035764157772, "step": 442, "step_time": 11.26166115900287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1754.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 523.59375, "completions/mean_terminated_length": 523.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05476558559166733, "epoch": 0.00886, "frac_reward_zero_std": 0.5, "grad_norm": 0.1766548454761505, "kl": 2.1453279554843903, "learning_rate": 4.65371085280131e-05, "loss": -0.0063, "num_tokens": 19448270.0, "reward": 0.8128572702407837, "reward_std": 0.3465689718723297, "rewards/rollout_reward_func/mean": 0.8128572702407837, "rewards/rollout_reward_func/std": 0.52559494972229, "sampling/importance_sampling_ratio/max": 1.3357081413269043, "sampling/importance_sampling_ratio/mean": 1.0073113441467285, "sampling/importance_sampling_ratio/min": 0.8282195925712585, "sampling/sampling_logp_difference/max": 0.29009830951690674, "sampling/sampling_logp_difference/mean": 0.005993885453790426, "step": 443, "step_time": 10.734189949999745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 377.65625, "completions/mean_terminated_length": 377.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05128135666018352, "epoch": 0.00888, "frac_reward_zero_std": 0.5, "grad_norm": 0.26152274012565613, "kl": 1.4497839212417603, "learning_rate": 4.643249546153771e-05, "loss": -0.0041, "num_tokens": 19489185.0, "reward": 0.9039286375045776, "reward_std": 0.2717309296131134, "rewards/rollout_reward_func/mean": 0.9039286375045776, "rewards/rollout_reward_func/std": 0.3931479752063751, "sampling/importance_sampling_ratio/max": 1.2647076845169067, "sampling/importance_sampling_ratio/mean": 1.0079543590545654, "sampling/importance_sampling_ratio/min": 0.8031744360923767, "sampling/sampling_logp_difference/max": 0.23494291305541992, "sampling/sampling_logp_difference/mean": 0.010297583416104317, "step": 444, "step_time": 10.085315805000391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 591.375, "completions/mean_terminated_length": 591.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05394103121943772, "epoch": 0.0089, "frac_reward_zero_std": 0.0, "grad_norm": 0.3614806532859802, "kl": 1.1950536286458373, "learning_rate": 4.63278188924865e-05, "loss": -0.0035, "num_tokens": 19538411.0, "reward": 0.8321544528007507, "reward_std": 0.410014271736145, "rewards/rollout_reward_func/mean": 0.8321544528007507, "rewards/rollout_reward_func/std": 0.3965170979499817, "sampling/importance_sampling_ratio/max": 1.2081315517425537, "sampling/importance_sampling_ratio/mean": 1.0028882026672363, "sampling/importance_sampling_ratio/min": 0.8023422360420227, "sampling/sampling_logp_difference/max": 0.22015905380249023, "sampling/sampling_logp_difference/mean": 0.005315280519425869, "step": 445, "step_time": 10.269891207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1645.0, "completions/max_terminated_length": 1645.0, "completions/mean_length": 540.15625, "completions/mean_terminated_length": 540.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04681752878241241, "epoch": 0.00892, "frac_reward_zero_std": 0.25, "grad_norm": 0.09813644737005234, "kl": 1.079335793852806, "learning_rate": 4.6223080320518467e-05, "loss": -0.0002, "num_tokens": 19586437.0, "reward": 0.6579041481018066, "reward_std": 0.5317248702049255, "rewards/rollout_reward_func/mean": 0.6579041481018066, "rewards/rollout_reward_func/std": 0.6902310252189636, "sampling/importance_sampling_ratio/max": 1.0397406816482544, "sampling/importance_sampling_ratio/mean": 0.9971652626991272, "sampling/importance_sampling_ratio/min": 0.8976814150810242, "sampling/sampling_logp_difference/max": 0.10673189163208008, "sampling/sampling_logp_difference/mean": 0.0014899014495313168, "step": 446, "step_time": 10.880862658000297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 513.59375, "completions/mean_terminated_length": 513.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05258015776053071, "epoch": 0.00894, "frac_reward_zero_std": 0.0, "grad_norm": 0.058455564081668854, "kl": 0.7446163855493069, "learning_rate": 4.611828124618093e-05, "loss": 0.0061, "num_tokens": 19631976.0, "reward": 0.7771439552307129, "reward_std": 0.547958254814148, "rewards/rollout_reward_func/mean": 0.7771439552307129, "rewards/rollout_reward_func/std": 0.5528903007507324, "sampling/importance_sampling_ratio/max": 1.0424870252609253, "sampling/importance_sampling_ratio/mean": 1.0021772384643555, "sampling/importance_sampling_ratio/min": 0.991209089756012, "sampling/sampling_logp_difference/max": 0.04903930425643921, "sampling/sampling_logp_difference/mean": 0.0015120597090572119, "step": 447, "step_time": 11.277590744999543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2480.0, "completions/max_terminated_length": 2480.0, "completions/mean_length": 570.46875, "completions/mean_terminated_length": 570.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.059363452543038875, "epoch": 0.00896, "frac_reward_zero_std": 0.25, "grad_norm": 0.09564101696014404, "kl": 0.6808913173153996, "learning_rate": 4.6013423170887974e-05, "loss": -0.0036, "num_tokens": 19679862.0, "reward": 0.8707038164138794, "reward_std": 0.36570489406585693, "rewards/rollout_reward_func/mean": 0.8707038164138794, "rewards/rollout_reward_func/std": 0.42753541469573975, "sampling/importance_sampling_ratio/max": 1.1304982900619507, "sampling/importance_sampling_ratio/mean": 1.0041030645370483, "sampling/importance_sampling_ratio/min": 0.9824432134628296, "sampling/sampling_logp_difference/max": 0.12254929542541504, "sampling/sampling_logp_difference/mean": 0.001655379543080926, "step": 448, "step_time": 12.816155157998764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 442.53125, "completions/mean_terminated_length": 442.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06621772167272866, "epoch": 0.00898, "frac_reward_zero_std": 0.0, "grad_norm": 0.20012924075126648, "kl": 0.9015501439571381, "learning_rate": 4.5908507596898977e-05, "loss": 0.0036, "num_tokens": 19722862.0, "reward": 0.6156980991363525, "reward_std": 0.7197387218475342, "rewards/rollout_reward_func/mean": 0.6156980991363525, "rewards/rollout_reward_func/std": 0.712922990322113, "sampling/importance_sampling_ratio/max": 1.0139589309692383, "sampling/importance_sampling_ratio/mean": 0.9979403018951416, "sampling/importance_sampling_ratio/min": 0.9318453073501587, "sampling/sampling_logp_difference/max": 0.10208463668823242, "sampling/sampling_logp_difference/mean": 0.0019357723649591208, "step": 449, "step_time": 10.485580194998875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 368.5625, "completions/mean_terminated_length": 368.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03894550749100745, "epoch": 0.009, "frac_reward_zero_std": 0.25, "grad_norm": 0.15397819876670837, "kl": 0.911491833627224, "learning_rate": 4.580353602729709e-05, "loss": 0.0027, "num_tokens": 19763296.0, "reward": 0.7412374019622803, "reward_std": 0.4650149941444397, "rewards/rollout_reward_func/mean": 0.7412374019622803, "rewards/rollout_reward_func/std": 0.5800607204437256, "sampling/importance_sampling_ratio/max": 1.002912163734436, "sampling/importance_sampling_ratio/mean": 0.9917923212051392, "sampling/importance_sampling_ratio/min": 0.879520058631897, "sampling/sampling_logp_difference/max": 0.11936306953430176, "sampling/sampling_logp_difference/mean": 0.002339588478207588, "step": 450, "step_time": 10.20936751300178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 399.9375, "completions/mean_terminated_length": 399.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.048645659524481744, "epoch": 0.00902, "frac_reward_zero_std": 0.75, "grad_norm": 0.031889691948890686, "kl": 0.8166829664260149, "learning_rate": 4.569850996596766e-05, "loss": 0.0083, "num_tokens": 19804689.0, "reward": 0.9331518411636353, "reward_std": 0.1239093616604805, "rewards/rollout_reward_func/mean": 0.9331518411636353, "rewards/rollout_reward_func/std": 0.2632666230201721, "sampling/importance_sampling_ratio/max": 1.0495954751968384, "sampling/importance_sampling_ratio/mean": 0.9968407154083252, "sampling/importance_sampling_ratio/min": 0.8962895274162292, "sampling/sampling_logp_difference/max": 0.1096000075340271, "sampling/sampling_logp_difference/mean": 0.00236195488832891, "step": 451, "step_time": 10.548095308999109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2335.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 682.375, "completions/mean_terminated_length": 682.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11167646758258343, "epoch": 0.00904, "frac_reward_zero_std": 0.5, "grad_norm": 0.1885819286108017, "kl": 0.43576287291944027, "learning_rate": 4.559343091757675e-05, "loss": -0.0039, "num_tokens": 19859000.0, "reward": 0.8767566680908203, "reward_std": 0.34858471155166626, "rewards/rollout_reward_func/mean": 0.8767566680908203, "rewards/rollout_reward_func/std": 0.48495855927467346, "sampling/importance_sampling_ratio/max": 1.3641117811203003, "sampling/importance_sampling_ratio/mean": 1.002948522567749, "sampling/importance_sampling_ratio/min": 0.8636539578437805, "sampling/sampling_logp_difference/max": 0.31068968772888184, "sampling/sampling_logp_difference/mean": 0.007139761000871658, "step": 452, "step_time": 13.339513056000214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 377.25, "completions/mean_terminated_length": 377.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07558402750873938, "epoch": 0.00906, "frac_reward_zero_std": 0.25, "grad_norm": 0.154552161693573, "kl": 0.7853328566998243, "learning_rate": 4.548830038754953e-05, "loss": 0.0075, "num_tokens": 19899513.0, "reward": 0.6599950194358826, "reward_std": 0.6116593480110168, "rewards/rollout_reward_func/mean": 0.6599950194358826, "rewards/rollout_reward_func/std": 0.7323248386383057, "sampling/importance_sampling_ratio/max": 1.0156491994857788, "sampling/importance_sampling_ratio/mean": 0.9928690791130066, "sampling/importance_sampling_ratio/min": 0.868992030620575, "sampling/sampling_logp_difference/max": 0.14630264043807983, "sampling/sampling_logp_difference/mean": 0.002492745639756322, "step": 453, "step_time": 10.334614526001133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 414.0625, "completions/mean_terminated_length": 414.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07970124506391585, "epoch": 0.00908, "frac_reward_zero_std": 0.25, "grad_norm": 0.16483508050441742, "kl": 0.7097156122326851, "learning_rate": 4.538311988204872e-05, "loss": 0.0007, "num_tokens": 19942498.0, "reward": 0.7421607971191406, "reward_std": 0.4472953677177429, "rewards/rollout_reward_func/mean": 0.7421607971191406, "rewards/rollout_reward_func/std": 0.5755265951156616, "sampling/importance_sampling_ratio/max": 1.024273157119751, "sampling/importance_sampling_ratio/mean": 0.989867091178894, "sampling/importance_sampling_ratio/min": 0.8004573583602905, "sampling/sampling_logp_difference/max": 0.16967105865478516, "sampling/sampling_logp_difference/mean": 0.004003660753369331, "step": 454, "step_time": 9.410338307001439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2381.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 518.78125, "completions/mean_terminated_length": 518.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06871705676894635, "epoch": 0.0091, "frac_reward_zero_std": 0.25, "grad_norm": 0.22554893791675568, "kl": 0.8576799295842648, "learning_rate": 4.527789090795303e-05, "loss": 0.0012, "num_tokens": 19988563.0, "reward": 0.8431378602981567, "reward_std": 0.44367319345474243, "rewards/rollout_reward_func/mean": 0.8431378602981567, "rewards/rollout_reward_func/std": 0.5115151405334473, "sampling/importance_sampling_ratio/max": 1.0265225172042847, "sampling/importance_sampling_ratio/mean": 0.999423623085022, "sampling/importance_sampling_ratio/min": 0.9565703272819519, "sampling/sampling_logp_difference/max": 0.05483981966972351, "sampling/sampling_logp_difference/mean": 0.001979121472686529, "step": 455, "step_time": 12.846686357999715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 635.78125, "completions/mean_terminated_length": 635.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08831836027093232, "epoch": 0.00912, "frac_reward_zero_std": 0.0, "grad_norm": 0.5101639628410339, "kl": 0.92097147856839, "learning_rate": 4.5172614972835555e-05, "loss": 0.0263, "num_tokens": 20039470.0, "reward": 0.6580727100372314, "reward_std": 0.7042951583862305, "rewards/rollout_reward_func/mean": 0.6580727100372314, "rewards/rollout_reward_func/std": 0.6918373703956604, "sampling/importance_sampling_ratio/max": 1.0364017486572266, "sampling/importance_sampling_ratio/mean": 0.9983530044555664, "sampling/importance_sampling_ratio/min": 0.9214058518409729, "sampling/sampling_logp_difference/max": 0.08315789699554443, "sampling/sampling_logp_difference/mean": 0.003105662064626813, "step": 456, "step_time": 13.399771433999376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 471.25, "completions/mean_terminated_length": 471.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08495781291276217, "epoch": 0.00914, "frac_reward_zero_std": 0.0, "grad_norm": 0.29524722695350647, "kl": 0.7026688978075981, "learning_rate": 4.5067293584942185e-05, "loss": 0.0173, "num_tokens": 20082952.0, "reward": 0.682660698890686, "reward_std": 0.667336642742157, "rewards/rollout_reward_func/mean": 0.682660698890686, "rewards/rollout_reward_func/std": 0.6439881324768066, "sampling/importance_sampling_ratio/max": 1.3004999160766602, "sampling/importance_sampling_ratio/mean": 1.0021476745605469, "sampling/importance_sampling_ratio/min": 0.9515664577484131, "sampling/sampling_logp_difference/max": 0.1647193431854248, "sampling/sampling_logp_difference/mean": 0.0039277952164411545, "step": 457, "step_time": 10.12861464999969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 405.3125, "completions/mean_terminated_length": 405.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08503864426165819, "epoch": 0.00916, "frac_reward_zero_std": 0.25, "grad_norm": 0.14366890490055084, "kl": 1.0521269738674164, "learning_rate": 4.496192825316998e-05, "loss": 0.0061, "num_tokens": 20124779.0, "reward": 0.6218923330307007, "reward_std": 0.5169287323951721, "rewards/rollout_reward_func/mean": 0.6218923330307007, "rewards/rollout_reward_func/std": 0.7511652708053589, "sampling/importance_sampling_ratio/max": 1.0355340242385864, "sampling/importance_sampling_ratio/mean": 0.9912394285202026, "sampling/importance_sampling_ratio/min": 0.830745279788971, "sampling/sampling_logp_difference/max": 0.18311083316802979, "sampling/sampling_logp_difference/mean": 0.0030086846090853214, "step": 458, "step_time": 9.896104102000209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 499.84375, "completions/mean_terminated_length": 499.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08641208335757256, "epoch": 0.00918, "frac_reward_zero_std": 0.0, "grad_norm": 0.09843409806489944, "kl": 0.6633150763809681, "learning_rate": 4.485652048704559e-05, "loss": -0.0176, "num_tokens": 20171498.0, "reward": 0.6818546056747437, "reward_std": 0.6559575796127319, "rewards/rollout_reward_func/mean": 0.6818546056747437, "rewards/rollout_reward_func/std": 0.6984158158302307, "sampling/importance_sampling_ratio/max": 1.2529888153076172, "sampling/importance_sampling_ratio/mean": 1.000744342803955, "sampling/importance_sampling_ratio/min": 0.9263259172439575, "sampling/sampling_logp_difference/max": 0.18814617395401, "sampling/sampling_logp_difference/mean": 0.003507649526000023, "step": 459, "step_time": 10.606538591001481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 489.625, "completions/mean_terminated_length": 489.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1349673424847424, "epoch": 0.0092, "frac_reward_zero_std": 0.25, "grad_norm": 0.14855368435382843, "kl": 1.016698518767953, "learning_rate": 4.4751071796703605e-05, "loss": 0.0084, "num_tokens": 20216410.0, "reward": 0.7412111163139343, "reward_std": 0.4193772077560425, "rewards/rollout_reward_func/mean": 0.7412111163139343, "rewards/rollout_reward_func/std": 0.5192373394966125, "sampling/importance_sampling_ratio/max": 1.0160987377166748, "sampling/importance_sampling_ratio/mean": 0.9881390929222107, "sampling/importance_sampling_ratio/min": 0.9084139466285706, "sampling/sampling_logp_difference/max": 0.19873344898223877, "sampling/sampling_logp_difference/mean": 0.007775420323014259, "step": 460, "step_time": 11.539437224001631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 324.46875, "completions/mean_terminated_length": 324.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1271245675161481, "epoch": 0.00922, "frac_reward_zero_std": 0.5, "grad_norm": 0.2825014591217041, "kl": 1.0057089775800705, "learning_rate": 4.464558369286488e-05, "loss": 0.0032, "num_tokens": 20254565.0, "reward": 0.8120763301849365, "reward_std": 0.36285728216171265, "rewards/rollout_reward_func/mean": 0.8120763301849365, "rewards/rollout_reward_func/std": 0.5322986245155334, "sampling/importance_sampling_ratio/max": 1.2246557474136353, "sampling/importance_sampling_ratio/mean": 1.0000500679016113, "sampling/importance_sampling_ratio/min": 0.8280803561210632, "sampling/sampling_logp_difference/max": 0.18845391273498535, "sampling/sampling_logp_difference/mean": 0.011560920625925064, "step": 461, "step_time": 9.958762075000777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 490.4375, "completions/mean_terminated_length": 490.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17441062396392226, "epoch": 0.00924, "frac_reward_zero_std": 0.0, "grad_norm": 0.31229889392852783, "kl": 0.9118440598249435, "learning_rate": 4.454005768681496e-05, "loss": 0.0153, "num_tokens": 20300546.0, "reward": 0.6513845920562744, "reward_std": 0.6911287307739258, "rewards/rollout_reward_func/mean": 0.6513845920562744, "rewards/rollout_reward_func/std": 0.7033176422119141, "sampling/importance_sampling_ratio/max": 1.086592197418213, "sampling/importance_sampling_ratio/mean": 0.9826399087905884, "sampling/importance_sampling_ratio/min": 0.8425159454345703, "sampling/sampling_logp_difference/max": 0.24264931678771973, "sampling/sampling_logp_difference/mean": 0.011080155149102211, "step": 462, "step_time": 12.222469536999597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 404.0625, "completions/mean_terminated_length": 404.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.14892594900447875, "epoch": 0.00926, "frac_reward_zero_std": 0.0, "grad_norm": 0.17933820188045502, "kl": 0.8277010470628738, "learning_rate": 4.443449529038243e-05, "loss": -0.0088, "num_tokens": 20343493.0, "reward": 0.8065080642700195, "reward_std": 0.5472779273986816, "rewards/rollout_reward_func/mean": 0.8065080642700195, "rewards/rollout_reward_func/std": 0.5428571105003357, "sampling/importance_sampling_ratio/max": 1.3034034967422485, "sampling/importance_sampling_ratio/mean": 1.0143320560455322, "sampling/importance_sampling_ratio/min": 0.9156749844551086, "sampling/sampling_logp_difference/max": 0.19332313537597656, "sampling/sampling_logp_difference/mean": 0.00996656809002161, "step": 463, "step_time": 9.884585330001755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 367.34375, "completions/mean_terminated_length": 367.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07843527430668473, "epoch": 0.00928, "frac_reward_zero_std": 0.25, "grad_norm": 0.11167420446872711, "kl": 0.8541043512523174, "learning_rate": 4.432889801591715e-05, "loss": 0.0093, "num_tokens": 20384497.0, "reward": 0.7175522446632385, "reward_std": 0.46927642822265625, "rewards/rollout_reward_func/mean": 0.7175522446632385, "rewards/rollout_reward_func/std": 0.6272379159927368, "sampling/importance_sampling_ratio/max": 1.0006084442138672, "sampling/importance_sampling_ratio/mean": 0.9888176321983337, "sampling/importance_sampling_ratio/min": 0.8707840442657471, "sampling/sampling_logp_difference/max": 0.08294987678527832, "sampling/sampling_logp_difference/mean": 0.0026873364113271236, "step": 464, "step_time": 10.281002003999674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2442.0, "completions/max_terminated_length": 2442.0, "completions/mean_length": 666.90625, "completions/mean_terminated_length": 666.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08327567105880007, "epoch": 0.0093, "frac_reward_zero_std": 0.25, "grad_norm": 0.1004444807767868, "kl": 0.8053998313844204, "learning_rate": 4.4223267376268736e-05, "loss": -0.0036, "num_tokens": 20437225.0, "reward": 0.7821990251541138, "reward_std": 0.49636831879615784, "rewards/rollout_reward_func/mean": 0.7821990251541138, "rewards/rollout_reward_func/std": 0.6006209850311279, "sampling/importance_sampling_ratio/max": 1.199526071548462, "sampling/importance_sampling_ratio/mean": 0.9886412620544434, "sampling/importance_sampling_ratio/min": 0.7070308923721313, "sampling/sampling_logp_difference/max": 0.24860620498657227, "sampling/sampling_logp_difference/mean": 0.008429847657680511, "step": 465, "step_time": 13.254150205001679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2386.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 435.53125, "completions/mean_terminated_length": 435.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11336538987234235, "epoch": 0.00932, "frac_reward_zero_std": 0.25, "grad_norm": 0.26791125535964966, "kl": 1.1861447766423225, "learning_rate": 4.411760488476479e-05, "loss": 0.0181, "num_tokens": 20480282.0, "reward": 0.7515416145324707, "reward_std": 0.5014573931694031, "rewards/rollout_reward_func/mean": 0.7515416145324707, "rewards/rollout_reward_func/std": 0.610092282295227, "sampling/importance_sampling_ratio/max": 1.4816343784332275, "sampling/importance_sampling_ratio/mean": 1.0239752531051636, "sampling/importance_sampling_ratio/min": 0.8363040685653687, "sampling/sampling_logp_difference/max": 0.30392986536026, "sampling/sampling_logp_difference/mean": 0.011568987742066383, "step": 466, "step_time": 12.15597289499874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 545.375, "completions/mean_terminated_length": 545.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.13647353881970048, "epoch": 0.00934, "frac_reward_zero_std": 0.25, "grad_norm": 0.3742064833641052, "kl": 0.9225295335054398, "learning_rate": 4.40119120551892e-05, "loss": 0.0234, "num_tokens": 20527742.0, "reward": 0.6638165712356567, "reward_std": 0.60538649559021, "rewards/rollout_reward_func/mean": 0.6638165712356567, "rewards/rollout_reward_func/std": 0.7256990671157837, "sampling/importance_sampling_ratio/max": 1.3648496866226196, "sampling/importance_sampling_ratio/mean": 1.0127131938934326, "sampling/importance_sampling_ratio/min": 0.902861475944519, "sampling/sampling_logp_difference/max": 0.23546743392944336, "sampling/sampling_logp_difference/mean": 0.008326444774866104, "step": 467, "step_time": 10.566178803999719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 391.71875, "completions/mean_terminated_length": 391.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1156360594322905, "epoch": 0.00936, "frac_reward_zero_std": 0.25, "grad_norm": 0.3254666328430176, "kl": 0.7284389548003674, "learning_rate": 4.390619040176059e-05, "loss": -0.0123, "num_tokens": 20570243.0, "reward": 0.5539218187332153, "reward_std": 0.6382086277008057, "rewards/rollout_reward_func/mean": 0.5539218187332153, "rewards/rollout_reward_func/std": 0.7627818584442139, "sampling/importance_sampling_ratio/max": 1.0974334478378296, "sampling/importance_sampling_ratio/mean": 0.9807085990905762, "sampling/importance_sampling_ratio/min": 0.800949215888977, "sampling/sampling_logp_difference/max": 0.22240877151489258, "sampling/sampling_logp_difference/mean": 0.010116509161889553, "step": 468, "step_time": 10.165360018000683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1852.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 541.75, "completions/mean_terminated_length": 541.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11949147749692202, "epoch": 0.00938, "frac_reward_zero_std": 0.0, "grad_norm": 1.618021011352539, "kl": 0.8162462546024472, "learning_rate": 4.380044143911043e-05, "loss": 0.0696, "num_tokens": 20617752.0, "reward": 0.7794264554977417, "reward_std": 0.5596599578857422, "rewards/rollout_reward_func/mean": 0.7794264554977417, "rewards/rollout_reward_func/std": 0.5497846603393555, "sampling/importance_sampling_ratio/max": 2.067485809326172, "sampling/importance_sampling_ratio/mean": 1.0350780487060547, "sampling/importance_sampling_ratio/min": 0.9833405017852783, "sampling/sampling_logp_difference/max": 0.2769789695739746, "sampling/sampling_logp_difference/mean": 0.009765563532710075, "step": 469, "step_time": 11.845611650000137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 379.6875, "completions/mean_terminated_length": 379.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10285447398200631, "epoch": 0.0094, "frac_reward_zero_std": 0.0, "grad_norm": 0.37412920594215393, "kl": 2.335883565247059, "learning_rate": 4.369466668226148e-05, "loss": 0.0156, "num_tokens": 20659105.0, "reward": 0.6266819834709167, "reward_std": 0.7570593357086182, "rewards/rollout_reward_func/mean": 0.6266819834709167, "rewards/rollout_reward_func/std": 0.7398858666419983, "sampling/importance_sampling_ratio/max": 1.0085550546646118, "sampling/importance_sampling_ratio/mean": 0.9755420088768005, "sampling/importance_sampling_ratio/min": 0.7608059644699097, "sampling/sampling_logp_difference/max": 0.29126250743865967, "sampling/sampling_logp_difference/mean": 0.008459947071969509, "step": 470, "step_time": 9.578650646000824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 359.78125, "completions/mean_terminated_length": 359.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.035714212572202086, "epoch": 0.00942, "frac_reward_zero_std": 0.25, "grad_norm": 0.2755154073238373, "kl": 0.9676204979477916, "learning_rate": 4.3588867646606075e-05, "loss": 0.0091, "num_tokens": 20699752.0, "reward": 0.7713315486907959, "reward_std": 0.4077182710170746, "rewards/rollout_reward_func/mean": 0.7713315486907959, "rewards/rollout_reward_func/std": 0.5077093243598938, "sampling/importance_sampling_ratio/max": 1.0020225048065186, "sampling/importance_sampling_ratio/mean": 0.9924885034561157, "sampling/importance_sampling_ratio/min": 0.892275869846344, "sampling/sampling_logp_difference/max": 0.11368846893310547, "sampling/sampling_logp_difference/mean": 0.0019995879847556353, "step": 471, "step_time": 9.781082270000297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 420.53125, "completions/mean_terminated_length": 411.2257995605469, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11375703592784703, "epoch": 0.00944, "frac_reward_zero_std": 0.0, "grad_norm": 0.6749041676521301, "kl": 1.0295426696538925, "learning_rate": 4.348304584788433e-05, "loss": 0.1106, "num_tokens": 20742956.0, "reward": 0.5201226472854614, "reward_std": 0.7339253425598145, "rewards/rollout_reward_func/mean": 0.5201226472854614, "rewards/rollout_reward_func/std": 0.7189366817474365, "sampling/importance_sampling_ratio/max": 1.7458113431930542, "sampling/importance_sampling_ratio/mean": 1.0291683673858643, "sampling/importance_sampling_ratio/min": 0.8782824277877808, "sampling/sampling_logp_difference/max": 0.32213258743286133, "sampling/sampling_logp_difference/mean": 0.010540695860981941, "step": 472, "step_time": 10.307577231000323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 421.78125, "completions/mean_terminated_length": 421.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10192345781251788, "epoch": 0.00946, "frac_reward_zero_std": 0.0, "grad_norm": 0.15385037660598755, "kl": 1.1971377320587635, "learning_rate": 4.337720280216251e-05, "loss": 0.0062, "num_tokens": 20785035.0, "reward": 0.8059319257736206, "reward_std": 0.4647321403026581, "rewards/rollout_reward_func/mean": 0.8059319257736206, "rewards/rollout_reward_func/std": 0.4771122336387634, "sampling/importance_sampling_ratio/max": 1.2470922470092773, "sampling/importance_sampling_ratio/mean": 0.9879426956176758, "sampling/importance_sampling_ratio/min": 0.7052823305130005, "sampling/sampling_logp_difference/max": 0.3578951358795166, "sampling/sampling_logp_difference/mean": 0.007669444661587477, "step": 473, "step_time": 10.266129868001372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 471.25, "completions/mean_terminated_length": 471.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07627592817880213, "epoch": 0.00948, "frac_reward_zero_std": 0.0, "grad_norm": 0.04373486712574959, "kl": 1.1992175728082657, "learning_rate": 4.327134002581127e-05, "loss": 0.0106, "num_tokens": 20828827.0, "reward": 0.7075757384300232, "reward_std": 0.5851081013679504, "rewards/rollout_reward_func/mean": 0.7075757384300232, "rewards/rollout_reward_func/std": 0.5901199579238892, "sampling/importance_sampling_ratio/max": 1.1036490201950073, "sampling/importance_sampling_ratio/mean": 1.002841830253601, "sampling/importance_sampling_ratio/min": 0.9177454113960266, "sampling/sampling_logp_difference/max": 0.08578610420227051, "sampling/sampling_logp_difference/mean": 0.0028636979404836893, "step": 474, "step_time": 10.892037075999724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 496.0, "completions/mean_terminated_length": 496.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.14856369514018297, "epoch": 0.0095, "frac_reward_zero_std": 0.5, "grad_norm": 0.09522901475429535, "kl": 1.3827891796827316, "learning_rate": 4.316545903548394e-05, "loss": 0.0086, "num_tokens": 20874845.0, "reward": 0.8394955396652222, "reward_std": 0.3070661127567291, "rewards/rollout_reward_func/mean": 0.8394955396652222, "rewards/rollout_reward_func/std": 0.4534715414047241, "sampling/importance_sampling_ratio/max": 1.2209835052490234, "sampling/importance_sampling_ratio/mean": 0.9796254634857178, "sampling/importance_sampling_ratio/min": 0.783122181892395, "sampling/sampling_logp_difference/max": 0.3450040817260742, "sampling/sampling_logp_difference/mean": 0.01301491167396307, "step": 475, "step_time": 11.072724538999864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 544.875, "completions/mean_terminated_length": 544.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11438385327346623, "epoch": 0.00952, "frac_reward_zero_std": 0.0, "grad_norm": 0.17891010642051697, "kl": 1.2257879972457886, "learning_rate": 4.305956134809477e-05, "loss": -0.0144, "num_tokens": 20922215.0, "reward": 0.8034529089927673, "reward_std": 0.49144816398620605, "rewards/rollout_reward_func/mean": 0.8034529089927673, "rewards/rollout_reward_func/std": 0.48399457335472107, "sampling/importance_sampling_ratio/max": 1.094618558883667, "sampling/importance_sampling_ratio/mean": 0.9921758770942688, "sampling/importance_sampling_ratio/min": 0.829417884349823, "sampling/sampling_logp_difference/max": 0.18404173851013184, "sampling/sampling_logp_difference/mean": 0.005722466390579939, "step": 476, "step_time": 10.780266768000729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 472.5, "completions/mean_terminated_length": 472.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.16611045459285378, "epoch": 0.00954, "frac_reward_zero_std": 0.0, "grad_norm": 0.4236607253551483, "kl": 1.0403286553919315, "learning_rate": 4.295364848079726e-05, "loss": 0.0159, "num_tokens": 20968006.0, "reward": 0.652174174785614, "reward_std": 0.6869129538536072, "rewards/rollout_reward_func/mean": 0.652174174785614, "rewards/rollout_reward_func/std": 0.6981610059738159, "sampling/importance_sampling_ratio/max": 1.463969111442566, "sampling/importance_sampling_ratio/mean": 1.0099821090698242, "sampling/importance_sampling_ratio/min": 0.8639673590660095, "sampling/sampling_logp_difference/max": 0.19669294357299805, "sampling/sampling_logp_difference/mean": 0.009079836308956146, "step": 477, "step_time": 9.94282332000057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3342.0, "completions/max_terminated_length": 3342.0, "completions/mean_length": 770.53125, "completions/mean_terminated_length": 770.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.20122343208640814, "epoch": 0.00956, "frac_reward_zero_std": 0.25, "grad_norm": 0.3489304482936859, "kl": 0.8516473986092024, "learning_rate": 4.284772195096236e-05, "loss": 0.0123, "num_tokens": 21025134.0, "reward": 0.7838442325592041, "reward_std": 0.4926740229129791, "rewards/rollout_reward_func/mean": 0.7838442325592041, "rewards/rollout_reward_func/std": 0.5964462161064148, "sampling/importance_sampling_ratio/max": 1.3705592155456543, "sampling/importance_sampling_ratio/mean": 1.0211753845214844, "sampling/importance_sampling_ratio/min": 0.8994253873825073, "sampling/sampling_logp_difference/max": 0.2526369094848633, "sampling/sampling_logp_difference/mean": 0.008423288352787495, "step": 478, "step_time": 15.950710173999141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1717.0, "completions/max_terminated_length": 1717.0, "completions/mean_length": 694.375, "completions/mean_terminated_length": 694.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.15390216372907162, "epoch": 0.00958, "frac_reward_zero_std": 0.5, "grad_norm": 0.11987709254026413, "kl": 0.8030732870101929, "learning_rate": 4.274178327615677e-05, "loss": -0.0014, "num_tokens": 21077660.0, "reward": 0.8696112632751465, "reward_std": 0.24153712391853333, "rewards/rollout_reward_func/mean": 0.8696112632751465, "rewards/rollout_reward_func/std": 0.35062703490257263, "sampling/importance_sampling_ratio/max": 1.0259126424789429, "sampling/importance_sampling_ratio/mean": 0.9858798980712891, "sampling/importance_sampling_ratio/min": 0.7162445187568665, "sampling/sampling_logp_difference/max": 0.3380228281021118, "sampling/sampling_logp_difference/mean": 0.01045995019376278, "step": 479, "step_time": 12.035725636999814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 559.40625, "completions/mean_terminated_length": 559.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1807458510156721, "epoch": 0.0096, "frac_reward_zero_std": 0.5, "grad_norm": 0.14663177728652954, "kl": 0.9291217438876629, "learning_rate": 4.263583397412117e-05, "loss": -0.0058, "num_tokens": 21125580.0, "reward": 0.8714296817779541, "reward_std": 0.28003719449043274, "rewards/rollout_reward_func/mean": 0.8714296817779541, "rewards/rollout_reward_func/std": 0.4238206148147583, "sampling/importance_sampling_ratio/max": 1.4838334321975708, "sampling/importance_sampling_ratio/mean": 1.0216410160064697, "sampling/importance_sampling_ratio/min": 0.8950276374816895, "sampling/sampling_logp_difference/max": 0.23094534873962402, "sampling/sampling_logp_difference/mean": 0.009554342366755009, "step": 480, "step_time": 11.753171077999468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 430.78125, "completions/mean_terminated_length": 430.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1780693712644279, "epoch": 0.00962, "frac_reward_zero_std": 0.25, "grad_norm": 0.24251112341880798, "kl": 0.9835023172199726, "learning_rate": 4.252987556274854e-05, "loss": 0.05, "num_tokens": 21168578.0, "reward": 0.682228147983551, "reward_std": 0.5318856239318848, "rewards/rollout_reward_func/mean": 0.682228147983551, "rewards/rollout_reward_func/std": 0.6392444968223572, "sampling/importance_sampling_ratio/max": 1.5229785442352295, "sampling/importance_sampling_ratio/mean": 1.0503311157226562, "sampling/importance_sampling_ratio/min": 0.9376240968704224, "sampling/sampling_logp_difference/max": 0.2744598388671875, "sampling/sampling_logp_difference/mean": 0.012079544365406036, "step": 481, "step_time": 10.9992033239987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2412.0, "completions/max_terminated_length": 2412.0, "completions/mean_length": 656.0, "completions/mean_terminated_length": 656.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1382370158098638, "epoch": 0.00964, "frac_reward_zero_std": 0.75, "grad_norm": 0.2922344505786896, "kl": 0.9374846741557121, "learning_rate": 4.242390956006229e-05, "loss": 0.0285, "num_tokens": 21219017.0, "reward": 0.8382992744445801, "reward_std": 0.18538571894168854, "rewards/rollout_reward_func/mean": 0.8382992744445801, "rewards/rollout_reward_func/std": 0.45292332768440247, "sampling/importance_sampling_ratio/max": 1.3362618684768677, "sampling/importance_sampling_ratio/mean": 1.00322687625885, "sampling/importance_sampling_ratio/min": 0.8920287489891052, "sampling/sampling_logp_difference/max": 0.23374104499816895, "sampling/sampling_logp_difference/mean": 0.011347031220793724, "step": 482, "step_time": 13.041375768000762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 482.9375, "completions/mean_terminated_length": 497.4838562011719, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.18117137253284454, "epoch": 0.00966, "frac_reward_zero_std": 0.0, "grad_norm": 0.6315069198608398, "kl": 1.0906357988715172, "learning_rate": 4.231793748419464e-05, "loss": 0.1383, "num_tokens": 21264068.0, "reward": 0.6569750905036926, "reward_std": 0.6822764873504639, "rewards/rollout_reward_func/mean": 0.6569750905036926, "rewards/rollout_reward_func/std": 0.6914200782775879, "sampling/importance_sampling_ratio/max": 1.1045833826065063, "sampling/importance_sampling_ratio/mean": 0.9935373067855835, "sampling/importance_sampling_ratio/min": 0.9231139421463013, "sampling/sampling_logp_difference/max": 0.19250774383544922, "sampling/sampling_logp_difference/mean": 0.0114359799772501, "step": 483, "step_time": 11.516561299999921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 494.28125, "completions/mean_terminated_length": 494.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14930287533206865, "epoch": 0.00968, "frac_reward_zero_std": 0.0, "grad_norm": 0.18626542389392853, "kl": 1.0260459631681442, "learning_rate": 4.2211960853364816e-05, "loss": 0.0034, "num_tokens": 21310127.0, "reward": 0.5964707136154175, "reward_std": 0.7217289209365845, "rewards/rollout_reward_func/mean": 0.5964707136154175, "rewards/rollout_reward_func/std": 0.7450999617576599, "sampling/importance_sampling_ratio/max": 1.2025771141052246, "sampling/importance_sampling_ratio/mean": 1.0040358304977417, "sampling/importance_sampling_ratio/min": 0.7680273652076721, "sampling/sampling_logp_difference/max": 0.23184514045715332, "sampling/sampling_logp_difference/mean": 0.0074826739728450775, "step": 484, "step_time": 10.201417691000643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 598.71875, "completions/mean_terminated_length": 598.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.13743195496499538, "epoch": 0.0097, "frac_reward_zero_std": 0.0, "grad_norm": 0.1688639223575592, "kl": 1.1845230236649513, "learning_rate": 4.210598118585727e-05, "loss": 0.0243, "num_tokens": 21359631.0, "reward": 0.5976054668426514, "reward_std": 0.756737470626831, "rewards/rollout_reward_func/mean": 0.5976054668426514, "rewards/rollout_reward_func/std": 0.7436534762382507, "sampling/importance_sampling_ratio/max": 1.1216233968734741, "sampling/importance_sampling_ratio/mean": 0.9922677278518677, "sampling/importance_sampling_ratio/min": 0.8731721043586731, "sampling/sampling_logp_difference/max": 0.11841106414794922, "sampling/sampling_logp_difference/mean": 0.005156847648322582, "step": 485, "step_time": 11.43912453700068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1398.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 430.59375, "completions/mean_terminated_length": 430.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.13652128633111715, "epoch": 0.00972, "frac_reward_zero_std": 0.0, "grad_norm": 0.11653725057840347, "kl": 0.5727855525910854, "learning_rate": 4.2000000000000004e-05, "loss": 0.0016, "num_tokens": 21404758.0, "reward": 0.719502329826355, "reward_std": 0.6733185052871704, "rewards/rollout_reward_func/mean": 0.719502329826355, "rewards/rollout_reward_func/std": 0.677751362323761, "sampling/importance_sampling_ratio/max": 1.0093036890029907, "sampling/importance_sampling_ratio/mean": 0.991510272026062, "sampling/importance_sampling_ratio/min": 0.8487319350242615, "sampling/sampling_logp_difference/max": 0.11891388893127441, "sampling/sampling_logp_difference/mean": 0.005705517716705799, "step": 486, "step_time": 10.095237816999543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 607.65625, "completions/mean_terminated_length": 607.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11966344434767962, "epoch": 0.00974, "frac_reward_zero_std": 0.0, "grad_norm": 0.29918214678764343, "kl": 0.8550918195396662, "learning_rate": 4.189401881414272e-05, "loss": 0.0381, "num_tokens": 21455144.0, "reward": 0.8423151969909668, "reward_std": 0.44600000977516174, "rewards/rollout_reward_func/mean": 0.8423151969909668, "rewards/rollout_reward_func/std": 0.44765400886535645, "sampling/importance_sampling_ratio/max": 1.248997449874878, "sampling/importance_sampling_ratio/mean": 0.9899729490280151, "sampling/importance_sampling_ratio/min": 0.7437707185745239, "sampling/sampling_logp_difference/max": 0.2850757837295532, "sampling/sampling_logp_difference/mean": 0.006408435292541981, "step": 487, "step_time": 11.170960809999997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1437.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 553.21875, "completions/mean_terminated_length": 553.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0757484482601285, "epoch": 0.00976, "frac_reward_zero_std": 0.75, "grad_norm": 0.04360786825418472, "kl": 0.7532531879842281, "learning_rate": 4.178803914663519e-05, "loss": 0.0039, "num_tokens": 21504324.0, "reward": 0.9662500023841858, "reward_std": 0.09545941650867462, "rewards/rollout_reward_func/mean": 0.9662500023841858, "rewards/rollout_reward_func/std": 0.19091881811618805, "sampling/importance_sampling_ratio/max": 1.1977804899215698, "sampling/importance_sampling_ratio/mean": 1.0102603435516357, "sampling/importance_sampling_ratio/min": 0.9621528387069702, "sampling/sampling_logp_difference/max": 0.18131548166275024, "sampling/sampling_logp_difference/mean": 0.003415257204324007, "step": 488, "step_time": 10.884426213001461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 562.03125, "completions/mean_terminated_length": 562.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10897684702649713, "epoch": 0.00978, "frac_reward_zero_std": 0.5, "grad_norm": 0.15243735909461975, "kl": 1.22078837454319, "learning_rate": 4.168206251580537e-05, "loss": 0.0066, "num_tokens": 21552667.0, "reward": 0.8740178346633911, "reward_std": 0.29320162534713745, "rewards/rollout_reward_func/mean": 0.8740178346633911, "rewards/rollout_reward_func/std": 0.42059940099716187, "sampling/importance_sampling_ratio/max": 1.0490648746490479, "sampling/importance_sampling_ratio/mean": 0.9983556866645813, "sampling/importance_sampling_ratio/min": 0.8889144062995911, "sampling/sampling_logp_difference/max": 0.0942840576171875, "sampling/sampling_logp_difference/mean": 0.002200655173510313, "step": 489, "step_time": 11.501640307999878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 490.5625, "completions/mean_terminated_length": 490.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0902994244825095, "epoch": 0.0098, "frac_reward_zero_std": 0.0, "grad_norm": 0.2041340321302414, "kl": 0.9708405956625938, "learning_rate": 4.157609043993772e-05, "loss": 0.007, "num_tokens": 21598553.0, "reward": 0.5573501586914062, "reward_std": 0.7499261498451233, "rewards/rollout_reward_func/mean": 0.5573501586914062, "rewards/rollout_reward_func/std": 0.7157748937606812, "sampling/importance_sampling_ratio/max": 1.1010081768035889, "sampling/importance_sampling_ratio/mean": 1.001507043838501, "sampling/importance_sampling_ratio/min": 0.8917485475540161, "sampling/sampling_logp_difference/max": 0.09686458110809326, "sampling/sampling_logp_difference/mean": 0.0033968016505241394, "step": 490, "step_time": 11.725343363998945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 392.78125, "completions/mean_terminated_length": 392.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07797722821123898, "epoch": 0.00982, "frac_reward_zero_std": 0.0, "grad_norm": 0.11360481381416321, "kl": 0.8157632872462273, "learning_rate": 4.147012443725147e-05, "loss": -0.0135, "num_tokens": 21641493.0, "reward": 0.7027812004089355, "reward_std": 0.5969122052192688, "rewards/rollout_reward_func/mean": 0.7027812004089355, "rewards/rollout_reward_func/std": 0.601826548576355, "sampling/importance_sampling_ratio/max": 1.2114982604980469, "sampling/importance_sampling_ratio/mean": 1.0147638320922852, "sampling/importance_sampling_ratio/min": 0.9600318670272827, "sampling/sampling_logp_difference/max": 0.177947998046875, "sampling/sampling_logp_difference/mean": 0.004218671470880508, "step": 491, "step_time": 10.475962013999379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 478.75, "completions/mean_terminated_length": 478.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05180919531267136, "epoch": 0.00984, "frac_reward_zero_std": 0.0, "grad_norm": 0.0696905255317688, "kl": 0.6131577678024769, "learning_rate": 4.136416602587882e-05, "loss": 0.0251, "num_tokens": 21687321.0, "reward": 0.8049613237380981, "reward_std": 0.4692973494529724, "rewards/rollout_reward_func/mean": 0.8049613237380981, "rewards/rollout_reward_func/std": 0.4801423251628876, "sampling/importance_sampling_ratio/max": 1.0695058107376099, "sampling/importance_sampling_ratio/mean": 0.9987416863441467, "sampling/importance_sampling_ratio/min": 0.9507795572280884, "sampling/sampling_logp_difference/max": 0.08395123481750488, "sampling/sampling_logp_difference/mean": 0.0016274088993668556, "step": 492, "step_time": 10.089187919999858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 467.375, "completions/mean_terminated_length": 467.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08265486313030124, "epoch": 0.00986, "frac_reward_zero_std": 0.5, "grad_norm": 0.1280883252620697, "kl": 0.9173768572509289, "learning_rate": 4.125821672384323e-05, "loss": 0.0084, "num_tokens": 21732349.0, "reward": 0.7548463344573975, "reward_std": 0.45398229360580444, "rewards/rollout_reward_func/mean": 0.7548463344573975, "rewards/rollout_reward_func/std": 0.6590513586997986, "sampling/importance_sampling_ratio/max": 1.1355280876159668, "sampling/importance_sampling_ratio/mean": 0.9936286211013794, "sampling/importance_sampling_ratio/min": 0.8541780710220337, "sampling/sampling_logp_difference/max": 0.1074373722076416, "sampling/sampling_logp_difference/mean": 0.005605650134384632, "step": 493, "step_time": 11.067834136999409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1681.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 443.75, "completions/mean_terminated_length": 443.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06589517166139558, "epoch": 0.00988, "frac_reward_zero_std": 0.25, "grad_norm": 0.07820002734661102, "kl": 0.9809021249529906, "learning_rate": 4.1152278049037647e-05, "loss": 0.0138, "num_tokens": 21776737.0, "reward": 0.8034026622772217, "reward_std": 0.407662034034729, "rewards/rollout_reward_func/mean": 0.8034026622772217, "rewards/rollout_reward_func/std": 0.4872661530971527, "sampling/importance_sampling_ratio/max": 1.2076959609985352, "sampling/importance_sampling_ratio/mean": 0.9939943552017212, "sampling/importance_sampling_ratio/min": 0.8142198324203491, "sampling/sampling_logp_difference/max": 0.19893062114715576, "sampling/sampling_logp_difference/mean": 0.0054200030863285065, "step": 494, "step_time": 10.883576180000091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 339.59375, "completions/mean_terminated_length": 339.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.059448995627462864, "epoch": 0.0099, "frac_reward_zero_std": 0.5, "grad_norm": 0.05765138566493988, "kl": 1.2464230358600616, "learning_rate": 4.104635151920274e-05, "loss": 0.01, "num_tokens": 21815983.0, "reward": 0.8354934453964233, "reward_std": 0.28896060585975647, "rewards/rollout_reward_func/mean": 0.8354934453964233, "rewards/rollout_reward_func/std": 0.4644348919391632, "sampling/importance_sampling_ratio/max": 1.2317893505096436, "sampling/importance_sampling_ratio/mean": 1.0071722269058228, "sampling/importance_sampling_ratio/min": 0.9182206392288208, "sampling/sampling_logp_difference/max": 0.20687556266784668, "sampling/sampling_logp_difference/mean": 0.0037219184450805187, "step": 495, "step_time": 10.235724447998109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 548.21875, "completions/mean_terminated_length": 548.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06742370151914656, "epoch": 0.00992, "frac_reward_zero_std": 0.25, "grad_norm": 0.15820346772670746, "kl": 0.5895766140893102, "learning_rate": 4.094043865190523e-05, "loss": -0.0122, "num_tokens": 21864170.0, "reward": 0.7751697301864624, "reward_std": 0.48525679111480713, "rewards/rollout_reward_func/mean": 0.7751697301864624, "rewards/rollout_reward_func/std": 0.5570719242095947, "sampling/importance_sampling_ratio/max": 1.0206085443496704, "sampling/importance_sampling_ratio/mean": 0.9919231534004211, "sampling/importance_sampling_ratio/min": 0.8324635624885559, "sampling/sampling_logp_difference/max": 0.20400452613830566, "sampling/sampling_logp_difference/mean": 0.00430405605584383, "step": 496, "step_time": 10.16583785399962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 508.59375, "completions/mean_terminated_length": 508.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06964499555760995, "epoch": 0.00994, "frac_reward_zero_std": 0.0, "grad_norm": 0.17510947585105896, "kl": 1.2203457951545715, "learning_rate": 4.0834540964516064e-05, "loss": -0.0096, "num_tokens": 21908738.0, "reward": 0.7420090436935425, "reward_std": 0.560528039932251, "rewards/rollout_reward_func/mean": 0.7420090436935425, "rewards/rollout_reward_func/std": 0.5723789930343628, "sampling/importance_sampling_ratio/max": 1.0947972536087036, "sampling/importance_sampling_ratio/mean": 0.9835255742073059, "sampling/importance_sampling_ratio/min": 0.7273269891738892, "sampling/sampling_logp_difference/max": 0.22966480255126953, "sampling/sampling_logp_difference/mean": 0.006120853126049042, "step": 497, "step_time": 10.878331473998514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1716.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 391.875, "completions/mean_terminated_length": 391.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.057995552429929376, "epoch": 0.00996, "frac_reward_zero_std": 0.25, "grad_norm": 0.10284340381622314, "kl": 1.1110782772302628, "learning_rate": 4.072865997418874e-05, "loss": 0.0155, "num_tokens": 21951424.0, "reward": 0.6821558475494385, "reward_std": 0.5163916349411011, "rewards/rollout_reward_func/mean": 0.6821558475494385, "rewards/rollout_reward_func/std": 0.6458978056907654, "sampling/importance_sampling_ratio/max": 1.1218260526657104, "sampling/importance_sampling_ratio/mean": 1.0077598094940186, "sampling/importance_sampling_ratio/min": 0.9498429298400879, "sampling/sampling_logp_difference/max": 0.11494731903076172, "sampling/sampling_logp_difference/mean": 0.0026006398256868124, "step": 498, "step_time": 11.395621328000743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 425.0, "completions/mean_terminated_length": 425.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04323529277462512, "epoch": 0.00998, "frac_reward_zero_std": 0.25, "grad_norm": 0.046052511781454086, "kl": 1.1712440252304077, "learning_rate": 4.0622797197837486e-05, "loss": 0.0205, "num_tokens": 21994476.0, "reward": 0.7436999082565308, "reward_std": 0.46817320585250854, "rewards/rollout_reward_func/mean": 0.7436999082565308, "rewards/rollout_reward_func/std": 0.5753507018089294, "sampling/importance_sampling_ratio/max": 1.0481750965118408, "sampling/importance_sampling_ratio/mean": 0.9975406527519226, "sampling/importance_sampling_ratio/min": 0.9202783703804016, "sampling/sampling_logp_difference/max": 0.0828857421875, "sampling/sampling_logp_difference/mean": 0.0015834070509299636, "step": 499, "step_time": 11.59773083699929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 454.1875, "completions/mean_terminated_length": 454.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05083105922676623, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.12082856148481369, "kl": 1.2271523475646973, "learning_rate": 4.051695415211568e-05, "loss": -0.0035, "num_tokens": 22039991.0, "reward": 0.58428955078125, "reward_std": 0.7305554151535034, "rewards/rollout_reward_func/mean": 0.58428955078125, "rewards/rollout_reward_func/std": 0.7157592177391052, "sampling/importance_sampling_ratio/max": 1.1955643892288208, "sampling/importance_sampling_ratio/mean": 1.005871057510376, "sampling/importance_sampling_ratio/min": 0.8955443501472473, "sampling/sampling_logp_difference/max": 0.1126253604888916, "sampling/sampling_logp_difference/mean": 0.0031339083798229694, "step": 500, "step_time": 11.049340204001055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 451.34375, "completions/mean_terminated_length": 451.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03135061217471957, "epoch": 0.01002, "frac_reward_zero_std": 0.25, "grad_norm": 0.10598837584257126, "kl": 0.938065692782402, "learning_rate": 4.0411132353393934e-05, "loss": 0.0075, "num_tokens": 22083850.0, "reward": 0.8000115156173706, "reward_std": 0.33046185970306396, "rewards/rollout_reward_func/mean": 0.8000115156173706, "rewards/rollout_reward_func/std": 0.42327141761779785, "sampling/importance_sampling_ratio/max": 1.0565725564956665, "sampling/importance_sampling_ratio/mean": 0.9984250068664551, "sampling/importance_sampling_ratio/min": 0.9035040140151978, "sampling/sampling_logp_difference/max": 0.10139322280883789, "sampling/sampling_logp_difference/mean": 0.0013769750948995352, "step": 501, "step_time": 10.514925406999282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1755.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 551.5, "completions/mean_terminated_length": 551.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.034519605920650065, "epoch": 0.01004, "frac_reward_zero_std": 0.0, "grad_norm": 0.060829635709524155, "kl": 0.9796556103974581, "learning_rate": 4.030533331773851e-05, "loss": 0.0231, "num_tokens": 22131966.0, "reward": 0.6859860420227051, "reward_std": 0.6464661955833435, "rewards/rollout_reward_func/mean": 0.6859860420227051, "rewards/rollout_reward_func/std": 0.6413649916648865, "sampling/importance_sampling_ratio/max": 1.0427836179733276, "sampling/importance_sampling_ratio/mean": 0.9997246265411377, "sampling/importance_sampling_ratio/min": 0.9577890634536743, "sampling/sampling_logp_difference/max": 0.04529210925102234, "sampling/sampling_logp_difference/mean": 0.001048503560014069, "step": 502, "step_time": 11.51904865099823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 235.9375, "completions/mean_terminated_length": 235.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.01680125559505541, "epoch": 0.01006, "frac_reward_zero_std": 0.5, "grad_norm": 0.0020468251314014196, "kl": 1.1588670834898949, "learning_rate": 4.019955856088958e-05, "loss": 0.0003, "num_tokens": 22166983.0, "reward": 0.9035841226577759, "reward_std": 0.2727053165435791, "rewards/rollout_reward_func/mean": 0.9035841226577759, "rewards/rollout_reward_func/std": 0.3943967819213867, "sampling/importance_sampling_ratio/max": 1.0207539796829224, "sampling/importance_sampling_ratio/mean": 1.0005617141723633, "sampling/importance_sampling_ratio/min": 0.9926008582115173, "sampling/sampling_logp_difference/max": 0.01048542931675911, "sampling/sampling_logp_difference/mean": 0.00035174289951100945, "step": 503, "step_time": 7.6459593879999375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 617.1875, "completions/mean_terminated_length": 617.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03420832473784685, "epoch": 0.01008, "frac_reward_zero_std": 0.75, "grad_norm": 0.3015705347061157, "kl": 0.8226663991808891, "learning_rate": 4.009380959823941e-05, "loss": 0.0126, "num_tokens": 22217978.0, "reward": 0.9672363996505737, "reward_std": 0.0926695317029953, "rewards/rollout_reward_func/mean": 0.9672363996505737, "rewards/rollout_reward_func/std": 0.1853390485048294, "sampling/importance_sampling_ratio/max": 1.2533165216445923, "sampling/importance_sampling_ratio/mean": 1.0091850757598877, "sampling/importance_sampling_ratio/min": 0.9805535674095154, "sampling/sampling_logp_difference/max": 0.22866535186767578, "sampling/sampling_logp_difference/mean": 0.0022933015134185553, "step": 504, "step_time": 12.005458384999656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1899.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 481.25, "completions/mean_terminated_length": 481.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03404964134097099, "epoch": 0.0101, "frac_reward_zero_std": 0.0, "grad_norm": 0.4169735312461853, "kl": 2.2265808936208487, "learning_rate": 3.998808794481079e-05, "loss": 0.0206, "num_tokens": 22262464.0, "reward": 0.6187294721603394, "reward_std": 0.6439776420593262, "rewards/rollout_reward_func/mean": 0.6187294721603394, "rewards/rollout_reward_func/std": 0.661870002746582, "sampling/importance_sampling_ratio/max": 1.4046202898025513, "sampling/importance_sampling_ratio/mean": 1.0136603116989136, "sampling/importance_sampling_ratio/min": 0.9237033724784851, "sampling/sampling_logp_difference/max": 0.3370075225830078, "sampling/sampling_logp_difference/mean": 0.003800370264798403, "step": 505, "step_time": 11.380447475999972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1809.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 556.84375, "completions/mean_terminated_length": 556.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.02327341859927401, "epoch": 0.01012, "frac_reward_zero_std": 0.0, "grad_norm": 0.051659755408763885, "kl": 0.7774786297231913, "learning_rate": 3.988239511523521e-05, "loss": 0.0137, "num_tokens": 22310759.0, "reward": 0.7818125486373901, "reward_std": 0.5520204305648804, "rewards/rollout_reward_func/mean": 0.7818125486373901, "rewards/rollout_reward_func/std": 0.5407187342643738, "sampling/importance_sampling_ratio/max": 1.0028462409973145, "sampling/importance_sampling_ratio/mean": 0.9955629110336304, "sampling/importance_sampling_ratio/min": 0.8841311931610107, "sampling/sampling_logp_difference/max": 0.12315988540649414, "sampling/sampling_logp_difference/mean": 0.0010846301447600126, "step": 506, "step_time": 11.323048752999966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 379.96875, "completions/mean_terminated_length": 379.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.02136528641131008, "epoch": 0.01014, "frac_reward_zero_std": 0.25, "grad_norm": 0.11239355802536011, "kl": 1.233220400288701, "learning_rate": 3.977673262373125e-05, "loss": 0.0016, "num_tokens": 22351464.0, "reward": 0.8312892317771912, "reward_std": 0.3451900780200958, "rewards/rollout_reward_func/mean": 0.8312892317771912, "rewards/rollout_reward_func/std": 0.39840009808540344, "sampling/importance_sampling_ratio/max": 1.0144060850143433, "sampling/importance_sampling_ratio/mean": 0.9902513027191162, "sampling/importance_sampling_ratio/min": 0.6937952041625977, "sampling/sampling_logp_difference/max": 0.36482763290405273, "sampling/sampling_logp_difference/mean": 0.003503193613141775, "step": 507, "step_time": 10.789526354998998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 524.25, "completions/mean_terminated_length": 524.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.049914170522242785, "epoch": 0.01016, "frac_reward_zero_std": 0.25, "grad_norm": 0.24250341951847076, "kl": 1.1094746142625809, "learning_rate": 3.967110198408284e-05, "loss": 0.0148, "num_tokens": 22398281.0, "reward": 0.7415827512741089, "reward_std": 0.44241762161254883, "rewards/rollout_reward_func/mean": 0.7415827512741089, "rewards/rollout_reward_func/std": 0.5184012651443481, "sampling/importance_sampling_ratio/max": 1.0101706981658936, "sampling/importance_sampling_ratio/mean": 0.9925948977470398, "sampling/importance_sampling_ratio/min": 0.8886620402336121, "sampling/sampling_logp_difference/max": 0.12332534790039062, "sampling/sampling_logp_difference/mean": 0.0018256003968417645, "step": 508, "step_time": 10.797887164999338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2558.0, "completions/max_terminated_length": 2558.0, "completions/mean_length": 732.65625, "completions/mean_terminated_length": 732.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07613262860104442, "epoch": 0.01018, "frac_reward_zero_std": 0.0, "grad_norm": 0.11984723061323166, "kl": 1.170874372124672, "learning_rate": 3.9565504709617576e-05, "loss": -0.0021, "num_tokens": 22453285.0, "reward": 0.38084492087364197, "reward_std": 0.8399746417999268, "rewards/rollout_reward_func/mean": 0.38084492087364197, "rewards/rollout_reward_func/std": 0.8512788414955139, "sampling/importance_sampling_ratio/max": 1.1354351043701172, "sampling/importance_sampling_ratio/mean": 1.0012376308441162, "sampling/importance_sampling_ratio/min": 0.9374959468841553, "sampling/sampling_logp_difference/max": 0.12116432189941406, "sampling/sampling_logp_difference/mean": 0.002764175646007061, "step": 509, "step_time": 14.154839641000763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 567.15625, "completions/mean_terminated_length": 567.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08044843538664281, "epoch": 0.0102, "frac_reward_zero_std": 0.25, "grad_norm": 0.38198861479759216, "kl": 1.8111945688724518, "learning_rate": 3.945994231318503e-05, "loss": 0.0013, "num_tokens": 22502034.0, "reward": 0.7463120222091675, "reward_std": 0.5106428265571594, "rewards/rollout_reward_func/mean": 0.7463120222091675, "rewards/rollout_reward_func/std": 0.6254424452781677, "sampling/importance_sampling_ratio/max": 1.0896629095077515, "sampling/importance_sampling_ratio/mean": 0.9891909956932068, "sampling/importance_sampling_ratio/min": 0.7078548669815063, "sampling/sampling_logp_difference/max": 0.35415029525756836, "sampling/sampling_logp_difference/mean": 0.006025128066539764, "step": 510, "step_time": 11.058366153000861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3372.0, "completions/max_terminated_length": 3372.0, "completions/mean_length": 612.15625, "completions/mean_terminated_length": 612.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09538603574037552, "epoch": 0.01022, "frac_reward_zero_std": 0.0, "grad_norm": 0.8409005403518677, "kl": 1.4058475196361542, "learning_rate": 3.9354416307135123e-05, "loss": -0.0081, "num_tokens": 22551961.0, "reward": 0.49655020236968994, "reward_std": 0.7399753332138062, "rewards/rollout_reward_func/mean": 0.49655020236968994, "rewards/rollout_reward_func/std": 0.7545689940452576, "sampling/importance_sampling_ratio/max": 1.12163245677948, "sampling/importance_sampling_ratio/mean": 0.9589322805404663, "sampling/importance_sampling_ratio/min": 4.551801101836117e-13, "sampling/sampling_logp_difference/max": 27.884180068969727, "sampling/sampling_logp_difference/mean": 0.15730509161949158, "step": 511, "step_time": 16.033734293000634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 654.8125, "completions/mean_terminated_length": 654.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10432835202664137, "epoch": 0.01024, "frac_reward_zero_std": 0.25, "grad_norm": 0.1831570714712143, "kl": 1.120298407971859, "learning_rate": 3.92489282032964e-05, "loss": 0.0005, "num_tokens": 22603412.0, "reward": 0.781509518623352, "reward_std": 0.44887155294418335, "rewards/rollout_reward_func/mean": 0.781509518623352, "rewards/rollout_reward_func/std": 0.5431646704673767, "sampling/importance_sampling_ratio/max": 1.420763611793518, "sampling/importance_sampling_ratio/mean": 1.0138649940490723, "sampling/importance_sampling_ratio/min": 0.87284916639328, "sampling/sampling_logp_difference/max": 0.372980535030365, "sampling/sampling_logp_difference/mean": 0.0072586676105856895, "step": 512, "step_time": 11.948811046000628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 592.0625, "completions/mean_terminated_length": 592.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07605438842438161, "epoch": 0.01026, "frac_reward_zero_std": 0.25, "grad_norm": 0.12347698956727982, "kl": 0.8066644771024585, "learning_rate": 3.91434795129544e-05, "loss": 0.0267, "num_tokens": 22652885.0, "reward": 0.7810208201408386, "reward_std": 0.4745109975337982, "rewards/rollout_reward_func/mean": 0.7810208201408386, "rewards/rollout_reward_func/std": 0.5477237701416016, "sampling/importance_sampling_ratio/max": 1.115304946899414, "sampling/importance_sampling_ratio/mean": 1.0039174556732178, "sampling/importance_sampling_ratio/min": 0.9900262355804443, "sampling/sampling_logp_difference/max": 0.10195615887641907, "sampling/sampling_logp_difference/mean": 0.0012508833315223455, "step": 513, "step_time": 11.765432899998814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 372.59375, "completions/mean_terminated_length": 372.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08419219451025128, "epoch": 0.01028, "frac_reward_zero_std": 0.0, "grad_norm": 0.11732294410467148, "kl": 1.2736047133803368, "learning_rate": 3.903807174683001e-05, "loss": -0.0044, "num_tokens": 22693988.0, "reward": 0.5589157938957214, "reward_std": 0.7505031228065491, "rewards/rollout_reward_func/mean": 0.5589157938957214, "rewards/rollout_reward_func/std": 0.7537054419517517, "sampling/importance_sampling_ratio/max": 1.0845943689346313, "sampling/importance_sampling_ratio/mean": 1.0000195503234863, "sampling/importance_sampling_ratio/min": 0.8875836730003357, "sampling/sampling_logp_difference/max": 0.1197361946105957, "sampling/sampling_logp_difference/mean": 0.003345753997564316, "step": 514, "step_time": 10.180601101998946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 478.3125, "completions/mean_terminated_length": 478.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07674759184010327, "epoch": 0.0103, "frac_reward_zero_std": 0.0, "grad_norm": 0.24825896322727203, "kl": 0.8189521059393883, "learning_rate": 3.893270641505782e-05, "loss": 0.0045, "num_tokens": 22739539.0, "reward": 0.6814586520195007, "reward_std": 0.6099026799201965, "rewards/rollout_reward_func/mean": 0.6814586520195007, "rewards/rollout_reward_func/std": 0.646129846572876, "sampling/importance_sampling_ratio/max": 1.2896918058395386, "sampling/importance_sampling_ratio/mean": 1.0159671306610107, "sampling/importance_sampling_ratio/min": 0.9526877403259277, "sampling/sampling_logp_difference/max": 0.13000333309173584, "sampling/sampling_logp_difference/mean": 0.004260492045432329, "step": 515, "step_time": 10.66617848900205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1687.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 409.625, "completions/mean_terminated_length": 409.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09069445903878659, "epoch": 0.01032, "frac_reward_zero_std": 0.0, "grad_norm": 0.1465643048286438, "kl": 0.9317524703219533, "learning_rate": 3.882738502716445e-05, "loss": -0.0037, "num_tokens": 22781979.0, "reward": 0.7110027074813843, "reward_std": 0.6489017605781555, "rewards/rollout_reward_func/mean": 0.7110027074813843, "rewards/rollout_reward_func/std": 0.6434390544891357, "sampling/importance_sampling_ratio/max": 1.0487313270568848, "sampling/importance_sampling_ratio/mean": 0.994596004486084, "sampling/importance_sampling_ratio/min": 0.8980613946914673, "sampling/sampling_logp_difference/max": 0.10751962661743164, "sampling/sampling_logp_difference/mean": 0.0038530954625457525, "step": 516, "step_time": 11.339727284002038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1497.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 361.15625, "completions/mean_terminated_length": 361.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07671989026130177, "epoch": 0.01034, "frac_reward_zero_std": 0.25, "grad_norm": 0.07108905911445618, "kl": 0.9943260326981544, "learning_rate": 3.872210909204696e-05, "loss": 0.0063, "num_tokens": 22821404.0, "reward": 0.6984992027282715, "reward_std": 0.6179108619689941, "rewards/rollout_reward_func/mean": 0.6984992027282715, "rewards/rollout_reward_func/std": 0.7119293212890625, "sampling/importance_sampling_ratio/max": 1.0674667358398438, "sampling/importance_sampling_ratio/mean": 1.0011849403381348, "sampling/importance_sampling_ratio/min": 0.9669804573059082, "sampling/sampling_logp_difference/max": 0.06662255525588989, "sampling/sampling_logp_difference/mean": 0.001575256115756929, "step": 517, "step_time": 9.916020503002073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 499.8125, "completions/mean_terminated_length": 499.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0576839204877615, "epoch": 0.01036, "frac_reward_zero_std": 0.5, "grad_norm": 0.07108048349618912, "kl": 0.7941818619146943, "learning_rate": 3.861688011795128e-05, "loss": 0.0142, "num_tokens": 22866573.0, "reward": 0.9350903034210205, "reward_std": 0.18359236419200897, "rewards/rollout_reward_func/mean": 0.9350903034210205, "rewards/rollout_reward_func/std": 0.25542908906936646, "sampling/importance_sampling_ratio/max": 1.052558183670044, "sampling/importance_sampling_ratio/mean": 0.993338942527771, "sampling/importance_sampling_ratio/min": 0.9079891443252563, "sampling/sampling_logp_difference/max": 0.09648752212524414, "sampling/sampling_logp_difference/mean": 0.0023361104540526867, "step": 518, "step_time": 11.465338561000863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 366.0625, "completions/mean_terminated_length": 366.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08495193347334862, "epoch": 0.01038, "frac_reward_zero_std": 0.25, "grad_norm": 0.1308465301990509, "kl": 1.2989885248243809, "learning_rate": 3.8511699612450476e-05, "loss": 0.0093, "num_tokens": 22906746.0, "reward": 0.7490164041519165, "reward_std": 0.5244844555854797, "rewards/rollout_reward_func/mean": 0.7490164041519165, "rewards/rollout_reward_func/std": 0.618692934513092, "sampling/importance_sampling_ratio/max": 1.1017502546310425, "sampling/importance_sampling_ratio/mean": 1.0037178993225098, "sampling/importance_sampling_ratio/min": 0.9365481734275818, "sampling/sampling_logp_difference/max": 0.09602105617523193, "sampling/sampling_logp_difference/mean": 0.0023925043642520905, "step": 519, "step_time": 9.708121038001082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 441.25, "completions/mean_terminated_length": 441.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08589412528090179, "epoch": 0.0104, "frac_reward_zero_std": 0.25, "grad_norm": 0.1131943017244339, "kl": 1.2652635872364044, "learning_rate": 3.840656908242324e-05, "loss": -0.0048, "num_tokens": 22950923.0, "reward": 0.5504136085510254, "reward_std": 0.6078059077262878, "rewards/rollout_reward_func/mean": 0.5504136085510254, "rewards/rollout_reward_func/std": 0.7216536998748779, "sampling/importance_sampling_ratio/max": 1.1027555465698242, "sampling/importance_sampling_ratio/mean": 1.004356861114502, "sampling/importance_sampling_ratio/min": 0.9681494235992432, "sampling/sampling_logp_difference/max": 0.09781599044799805, "sampling/sampling_logp_difference/mean": 0.0019257338717579842, "step": 520, "step_time": 9.573828287000651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 463.3125, "completions/mean_terminated_length": 463.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09211243479512632, "epoch": 0.01042, "frac_reward_zero_std": 0.25, "grad_norm": 0.41930943727493286, "kl": 1.4152699708938599, "learning_rate": 3.830149003403234e-05, "loss": 0.0054, "num_tokens": 22996268.0, "reward": 0.5979372262954712, "reward_std": 0.6666386127471924, "rewards/rollout_reward_func/mean": 0.5979372262954712, "rewards/rollout_reward_func/std": 0.7860245704650879, "sampling/importance_sampling_ratio/max": 1.196882963180542, "sampling/importance_sampling_ratio/mean": 1.00369131565094, "sampling/importance_sampling_ratio/min": 0.8532478213310242, "sampling/sampling_logp_difference/max": 0.1797161102294922, "sampling/sampling_logp_difference/mean": 0.005319025367498398, "step": 521, "step_time": 11.087668094999572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1637.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 547.96875, "completions/mean_terminated_length": 547.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08192298281937838, "epoch": 0.01044, "frac_reward_zero_std": 0.25, "grad_norm": 0.05310691148042679, "kl": 0.8873818498104811, "learning_rate": 3.8196463972702914e-05, "loss": -0.001, "num_tokens": 23043116.0, "reward": 0.8092901110649109, "reward_std": 0.4575382173061371, "rewards/rollout_reward_func/mean": 0.8092901110649109, "rewards/rollout_reward_func/std": 0.5375297665596008, "sampling/importance_sampling_ratio/max": 1.0963793992996216, "sampling/importance_sampling_ratio/mean": 0.9984251856803894, "sampling/importance_sampling_ratio/min": 0.9435007572174072, "sampling/sampling_logp_difference/max": 0.0920565128326416, "sampling/sampling_logp_difference/mean": 0.001776629826053977, "step": 522, "step_time": 10.37864657999944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 426.875, "completions/mean_terminated_length": 426.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06768591282889247, "epoch": 0.01046, "frac_reward_zero_std": 0.0, "grad_norm": 0.08715008944272995, "kl": 0.7169635184109211, "learning_rate": 3.809149240310102e-05, "loss": 0.0041, "num_tokens": 23086236.0, "reward": 0.5649412870407104, "reward_std": 0.6830838918685913, "rewards/rollout_reward_func/mean": 0.5649412870407104, "rewards/rollout_reward_func/std": 0.7904418110847473, "sampling/importance_sampling_ratio/max": 1.0108884572982788, "sampling/importance_sampling_ratio/mean": 0.9999282956123352, "sampling/importance_sampling_ratio/min": 0.9879587292671204, "sampling/sampling_logp_difference/max": 0.01083691418170929, "sampling/sampling_logp_difference/mean": 0.000571709533687681, "step": 523, "step_time": 9.531492074001108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2571.0, "completions/max_terminated_length": 2571.0, "completions/mean_length": 598.5625, "completions/mean_terminated_length": 598.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08703542809234932, "epoch": 0.01048, "frac_reward_zero_std": 0.0, "grad_norm": 0.08490276336669922, "kl": 1.2478143200278282, "learning_rate": 3.798657682911203e-05, "loss": 0.022, "num_tokens": 23135125.0, "reward": 0.49186643958091736, "reward_std": 0.7990166544914246, "rewards/rollout_reward_func/mean": 0.49186643958091736, "rewards/rollout_reward_func/std": 0.7621797323226929, "sampling/importance_sampling_ratio/max": 1.2579545974731445, "sampling/importance_sampling_ratio/mean": 1.0084024667739868, "sampling/importance_sampling_ratio/min": 0.9391183257102966, "sampling/sampling_logp_difference/max": 0.22948741912841797, "sampling/sampling_logp_difference/mean": 0.0034019406884908676, "step": 524, "step_time": 12.801590966000731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 543.84375, "completions/mean_terminated_length": 543.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07401161268353462, "epoch": 0.0105, "frac_reward_zero_std": 0.0, "grad_norm": 0.18894262611865997, "kl": 0.5503964982926846, "learning_rate": 3.788171875381907e-05, "loss": 0.0149, "num_tokens": 23182089.0, "reward": 0.7194868326187134, "reward_std": 0.6322088241577148, "rewards/rollout_reward_func/mean": 0.7194868326187134, "rewards/rollout_reward_func/std": 0.6245484948158264, "sampling/importance_sampling_ratio/max": 1.1161991357803345, "sampling/importance_sampling_ratio/mean": 1.0019376277923584, "sampling/importance_sampling_ratio/min": 0.9150695204734802, "sampling/sampling_logp_difference/max": 0.09087705612182617, "sampling/sampling_logp_difference/mean": 0.002180443610996008, "step": 525, "step_time": 10.44984992099944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 542.15625, "completions/mean_terminated_length": 542.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06169994850642979, "epoch": 0.01052, "frac_reward_zero_std": 0.0, "grad_norm": 0.07849369943141937, "kl": 0.7413357943296432, "learning_rate": 3.7776919679481536e-05, "loss": -0.0032, "num_tokens": 23229219.0, "reward": 0.6794017553329468, "reward_std": 0.6186172962188721, "rewards/rollout_reward_func/mean": 0.6794017553329468, "rewards/rollout_reward_func/std": 0.5992524027824402, "sampling/importance_sampling_ratio/max": 1.11381196975708, "sampling/importance_sampling_ratio/mean": 1.0065619945526123, "sampling/importance_sampling_ratio/min": 0.9357106685638428, "sampling/sampling_logp_difference/max": 0.10562968254089355, "sampling/sampling_logp_difference/mean": 0.0026286328211426735, "step": 526, "step_time": 10.885161679000703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 533.4375, "completions/mean_terminated_length": 533.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11208318607532419, "epoch": 0.01054, "frac_reward_zero_std": 0.5, "grad_norm": 0.09920431673526764, "kl": 1.4536795150488615, "learning_rate": 3.767218110751351e-05, "loss": 0.0001, "num_tokens": 23275970.0, "reward": 0.6571357846260071, "reward_std": 0.41957128047943115, "rewards/rollout_reward_func/mean": 0.6571357846260071, "rewards/rollout_reward_func/std": 0.7388449907302856, "sampling/importance_sampling_ratio/max": 1.1207969188690186, "sampling/importance_sampling_ratio/mean": 0.9940973520278931, "sampling/importance_sampling_ratio/min": 0.885270357131958, "sampling/sampling_logp_difference/max": 0.11435890197753906, "sampling/sampling_logp_difference/mean": 0.0038540856912732124, "step": 527, "step_time": 10.841344360000221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 488.5, "completions/mean_terminated_length": 488.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.050286242680158466, "epoch": 0.01056, "frac_reward_zero_std": 0.25, "grad_norm": 0.032455045729875565, "kl": 0.9610765911638737, "learning_rate": 3.756750453846229e-05, "loss": 0.0058, "num_tokens": 23321321.0, "reward": 0.8992905020713806, "reward_std": 0.28484952449798584, "rewards/rollout_reward_func/mean": 0.8992905020713806, "rewards/rollout_reward_func/std": 0.31813016533851624, "sampling/importance_sampling_ratio/max": 1.0659321546554565, "sampling/importance_sampling_ratio/mean": 1.001264214515686, "sampling/importance_sampling_ratio/min": 0.9518321752548218, "sampling/sampling_logp_difference/max": 0.061849117279052734, "sampling/sampling_logp_difference/mean": 0.0013898095348849893, "step": 528, "step_time": 10.04284817900134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 480.3125, "completions/mean_terminated_length": 480.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03836047067306936, "epoch": 0.01058, "frac_reward_zero_std": 0.25, "grad_norm": 0.02927936241030693, "kl": 0.6131521947681904, "learning_rate": 3.7462891471986896e-05, "loss": 0.0082, "num_tokens": 23366895.0, "reward": 0.8727348446846008, "reward_std": 0.3599601686000824, "rewards/rollout_reward_func/mean": 0.8727348446846008, "rewards/rollout_reward_func/std": 0.42574217915534973, "sampling/importance_sampling_ratio/max": 1.0123018026351929, "sampling/importance_sampling_ratio/mean": 0.999829888343811, "sampling/importance_sampling_ratio/min": 0.975979208946228, "sampling/sampling_logp_difference/max": 0.02439779043197632, "sampling/sampling_logp_difference/mean": 0.00042736297473311424, "step": 529, "step_time": 10.461976789997607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 479.59375, "completions/mean_terminated_length": 479.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08037288812920451, "epoch": 0.0106, "frac_reward_zero_std": 0.0, "grad_norm": 0.07950285077095032, "kl": 0.8778408411890268, "learning_rate": 3.7358343406836574e-05, "loss": -0.0036, "num_tokens": 23411407.0, "reward": 0.7402489185333252, "reward_std": 0.5604634881019592, "rewards/rollout_reward_func/mean": 0.7402489185333252, "rewards/rollout_reward_func/std": 0.5792297720909119, "sampling/importance_sampling_ratio/max": 1.056509017944336, "sampling/importance_sampling_ratio/mean": 0.9955013394355774, "sampling/importance_sampling_ratio/min": 0.8875249028205872, "sampling/sampling_logp_difference/max": 0.12177228927612305, "sampling/sampling_logp_difference/mean": 0.0035653486847877502, "step": 530, "step_time": 10.67849502900026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 442.84375, "completions/mean_terminated_length": 442.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0705157108604908, "epoch": 0.01062, "frac_reward_zero_std": 0.25, "grad_norm": 0.11938540637493134, "kl": 0.9154520630836487, "learning_rate": 3.72538618408293e-05, "loss": -0.0015, "num_tokens": 23455483.0, "reward": 0.59360671043396, "reward_std": 0.643538773059845, "rewards/rollout_reward_func/mean": 0.59360671043396, "rewards/rollout_reward_func/std": 0.7493613958358765, "sampling/importance_sampling_ratio/max": 1.1069860458374023, "sampling/importance_sampling_ratio/mean": 0.9961196780204773, "sampling/importance_sampling_ratio/min": 0.8638603091239929, "sampling/sampling_logp_difference/max": 0.10870361328125, "sampling/sampling_logp_difference/mean": 0.0027833597268909216, "step": 531, "step_time": 10.127674736999325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 440.3125, "completions/mean_terminated_length": 440.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08149609714746475, "epoch": 0.01064, "frac_reward_zero_std": 0.5, "grad_norm": 0.10706121474504471, "kl": 0.6005466911010444, "learning_rate": 3.714944827083034e-05, "loss": 0.0059, "num_tokens": 23497925.0, "reward": 0.7788539528846741, "reward_std": 0.37198197841644287, "rewards/rollout_reward_func/mean": 0.7788539528846741, "rewards/rollout_reward_func/std": 0.5501177906990051, "sampling/importance_sampling_ratio/max": 1.0928115844726562, "sampling/importance_sampling_ratio/mean": 0.9988470673561096, "sampling/importance_sampling_ratio/min": 0.9047716856002808, "sampling/sampling_logp_difference/max": 0.12327957153320312, "sampling/sampling_logp_difference/mean": 0.0034255674108862877, "step": 532, "step_time": 10.179340233000403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1682.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 490.96875, "completions/mean_terminated_length": 490.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09096663491800427, "epoch": 0.01066, "frac_reward_zero_std": 0.0, "grad_norm": 0.11062154918909073, "kl": 0.8979768678545952, "learning_rate": 3.704510419273086e-05, "loss": -0.0038, "num_tokens": 23544161.0, "reward": 0.5577291250228882, "reward_std": 0.7685167193412781, "rewards/rollout_reward_func/mean": 0.5577291250228882, "rewards/rollout_reward_func/std": 0.7587558627128601, "sampling/importance_sampling_ratio/max": 1.118683934211731, "sampling/importance_sampling_ratio/mean": 1.001147747039795, "sampling/importance_sampling_ratio/min": 0.9011716246604919, "sampling/sampling_logp_difference/max": 0.11208462715148926, "sampling/sampling_logp_difference/mean": 0.004593024495989084, "step": 533, "step_time": 10.441497401999186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 443.4375, "completions/mean_terminated_length": 443.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09307856997475028, "epoch": 0.01068, "frac_reward_zero_std": 0.25, "grad_norm": 0.08935537189245224, "kl": 0.8264538124203682, "learning_rate": 3.694083110142639e-05, "loss": -0.0039, "num_tokens": 23588197.0, "reward": 0.658811092376709, "reward_std": 0.6445352435112, "rewards/rollout_reward_func/mean": 0.658811092376709, "rewards/rollout_reward_func/std": 0.7399898767471313, "sampling/importance_sampling_ratio/max": 1.0599919557571411, "sampling/importance_sampling_ratio/mean": 0.9991565942764282, "sampling/importance_sampling_ratio/min": 0.9573670029640198, "sampling/sampling_logp_difference/max": 0.05846625566482544, "sampling/sampling_logp_difference/mean": 0.0018819629913195968, "step": 534, "step_time": 10.524678548000338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 735.875, "completions/mean_terminated_length": 735.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10358615824952722, "epoch": 0.0107, "frac_reward_zero_std": 0.0, "grad_norm": 0.09322969615459442, "kl": 0.9295036047697067, "learning_rate": 3.683663049079546e-05, "loss": 0.004, "num_tokens": 23643227.0, "reward": 0.619361162185669, "reward_std": 0.6203635931015015, "rewards/rollout_reward_func/mean": 0.619361162185669, "rewards/rollout_reward_func/std": 0.6573901772499084, "sampling/importance_sampling_ratio/max": 1.1375808715820312, "sampling/importance_sampling_ratio/mean": 0.998836874961853, "sampling/importance_sampling_ratio/min": 0.9059128761291504, "sampling/sampling_logp_difference/max": 0.12881749868392944, "sampling/sampling_logp_difference/mean": 0.0033338377252221107, "step": 535, "step_time": 11.479427648999263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 370.28125, "completions/mean_terminated_length": 370.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.050432760966941714, "epoch": 0.01072, "frac_reward_zero_std": 0.25, "grad_norm": 0.02274090237915516, "kl": 0.5466954186558723, "learning_rate": 3.6732503853678255e-05, "loss": -0.0078, "num_tokens": 23684357.0, "reward": 0.8070106506347656, "reward_std": 0.42495042085647583, "rewards/rollout_reward_func/mean": 0.8070106506347656, "rewards/rollout_reward_func/std": 0.5419858694076538, "sampling/importance_sampling_ratio/max": 1.0295512676239014, "sampling/importance_sampling_ratio/mean": 1.0011005401611328, "sampling/importance_sampling_ratio/min": 0.969044029712677, "sampling/sampling_logp_difference/max": 0.02874763309955597, "sampling/sampling_logp_difference/mean": 0.0007957536727190018, "step": 536, "step_time": 9.248562909001521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 509.46875, "completions/mean_terminated_length": 509.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.067957554012537, "epoch": 0.01074, "frac_reward_zero_std": 0.5, "grad_norm": 0.11915501952171326, "kl": 0.6763051301240921, "learning_rate": 3.6628452681855104e-05, "loss": -0.0093, "num_tokens": 23730874.0, "reward": 0.6868878602981567, "reward_std": 0.44584327936172485, "rewards/rollout_reward_func/mean": 0.6868878602981567, "rewards/rollout_reward_func/std": 0.6856384873390198, "sampling/importance_sampling_ratio/max": 1.0479577779769897, "sampling/importance_sampling_ratio/mean": 1.0005707740783691, "sampling/importance_sampling_ratio/min": 0.8968098759651184, "sampling/sampling_logp_difference/max": 0.11026835441589355, "sampling/sampling_logp_difference/mean": 0.0017416330520063639, "step": 537, "step_time": 11.00035013300112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 441.875, "completions/mean_terminated_length": 441.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06372684566304088, "epoch": 0.01076, "frac_reward_zero_std": 0.25, "grad_norm": 0.030752139165997505, "kl": 0.8988461454282515, "learning_rate": 3.652447846602522e-05, "loss": -0.0063, "num_tokens": 23774272.0, "reward": 0.7197457551956177, "reward_std": 0.5340789556503296, "rewards/rollout_reward_func/mean": 0.7197457551956177, "rewards/rollout_reward_func/std": 0.6286405920982361, "sampling/importance_sampling_ratio/max": 1.115963101387024, "sampling/importance_sampling_ratio/mean": 1.0028923749923706, "sampling/importance_sampling_ratio/min": 0.9739212989807129, "sampling/sampling_logp_difference/max": 0.1098470687866211, "sampling/sampling_logp_difference/mean": 0.0016269669868052006, "step": 538, "step_time": 9.940479218998917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 568.34375, "completions/mean_terminated_length": 568.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08513232902623713, "epoch": 0.01078, "frac_reward_zero_std": 0.25, "grad_norm": 0.09969284385442734, "kl": 0.7506486549973488, "learning_rate": 3.642058269578527e-05, "loss": 0.0009, "num_tokens": 23823690.0, "reward": 0.6886275410652161, "reward_std": 0.5765968561172485, "rewards/rollout_reward_func/mean": 0.6886275410652161, "rewards/rollout_reward_func/std": 0.6815220713615417, "sampling/importance_sampling_ratio/max": 1.0163958072662354, "sampling/importance_sampling_ratio/mean": 0.989382266998291, "sampling/importance_sampling_ratio/min": 0.8826927542686462, "sampling/sampling_logp_difference/max": 0.12479066848754883, "sampling/sampling_logp_difference/mean": 0.0031066639348864555, "step": 539, "step_time": 10.768308160999368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 425.40625, "completions/mean_terminated_length": 425.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06745803914964199, "epoch": 0.0108, "frac_reward_zero_std": 0.0, "grad_norm": 0.11450672894716263, "kl": 1.0822029784321785, "learning_rate": 3.631676685960809e-05, "loss": -0.0065, "num_tokens": 23867542.0, "reward": 0.5915122628211975, "reward_std": 0.7784448862075806, "rewards/rollout_reward_func/mean": 0.5915122628211975, "rewards/rollout_reward_func/std": 0.7519227862358093, "sampling/importance_sampling_ratio/max": 1.112187147140503, "sampling/importance_sampling_ratio/mean": 0.9949748516082764, "sampling/importance_sampling_ratio/min": 0.8973242044448853, "sampling/sampling_logp_difference/max": 0.11029195785522461, "sampling/sampling_logp_difference/mean": 0.002967415377497673, "step": 540, "step_time": 10.166667425000014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 440.1875, "completions/mean_terminated_length": 440.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04479483625618741, "epoch": 0.01082, "frac_reward_zero_std": 0.75, "grad_norm": 0.019083818420767784, "kl": 0.9599084109067917, "learning_rate": 3.621303244482132e-05, "loss": -0.001, "num_tokens": 23911384.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/rollout_reward_func/mean": 0.9375, "rewards/rollout_reward_func/std": 0.3535533845424652, "sampling/importance_sampling_ratio/max": 1.0072757005691528, "sampling/importance_sampling_ratio/mean": 0.996600866317749, "sampling/importance_sampling_ratio/min": 0.913865327835083, "sampling/sampling_logp_difference/max": 0.09268677234649658, "sampling/sampling_logp_difference/mean": 0.0011969178449362516, "step": 541, "step_time": 10.286826413998824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 556.3125, "completions/mean_terminated_length": 556.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10587966139428318, "epoch": 0.01084, "frac_reward_zero_std": 0.25, "grad_norm": 0.12761332094669342, "kl": 0.6823977790772915, "learning_rate": 3.610938093758609e-05, "loss": 0.0075, "num_tokens": 23959033.0, "reward": 0.6891840696334839, "reward_std": 0.5877520442008972, "rewards/rollout_reward_func/mean": 0.6891840696334839, "rewards/rollout_reward_func/std": 0.6829987168312073, "sampling/importance_sampling_ratio/max": 1.3058912754058838, "sampling/importance_sampling_ratio/mean": 1.0070784091949463, "sampling/importance_sampling_ratio/min": 0.853683590888977, "sampling/sampling_logp_difference/max": 0.2663536071777344, "sampling/sampling_logp_difference/mean": 0.004197621718049049, "step": 542, "step_time": 10.986845143998835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 535.0, "completions/mean_terminated_length": 535.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08305445406585932, "epoch": 0.01086, "frac_reward_zero_std": 0.25, "grad_norm": 0.33937588334083557, "kl": 2.7145333625376225, "learning_rate": 3.600581382287577e-05, "loss": 0.0058, "num_tokens": 24006171.0, "reward": 0.6485474705696106, "reward_std": 0.5646494626998901, "rewards/rollout_reward_func/mean": 0.6485474705696106, "rewards/rollout_reward_func/std": 0.6551475524902344, "sampling/importance_sampling_ratio/max": 1.1280384063720703, "sampling/importance_sampling_ratio/mean": 1.0011396408081055, "sampling/importance_sampling_ratio/min": 0.8813457489013672, "sampling/sampling_logp_difference/max": 0.13014185428619385, "sampling/sampling_logp_difference/mean": 0.004103389102965593, "step": 543, "step_time": 10.655309847998979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1427.0, "completions/max_terminated_length": 1427.0, "completions/mean_length": 386.71875, "completions/mean_terminated_length": 386.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.041259254299802706, "epoch": 0.01088, "frac_reward_zero_std": 1.0, "grad_norm": 0.001173701137304306, "kl": 0.5491238739341497, "learning_rate": 3.5902332584454676e-05, "loss": 0.0021, "num_tokens": 24047477.0, "reward": 1.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0232830047607422, "sampling/importance_sampling_ratio/mean": 0.9994409084320068, "sampling/importance_sampling_ratio/min": 0.9150906205177307, "sampling/sampling_logp_difference/max": 0.08855819702148438, "sampling/sampling_logp_difference/mean": 0.0011594909010455012, "step": 544, "step_time": 9.844019376000688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1625.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 562.4375, "completions/mean_terminated_length": 562.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08630534587427974, "epoch": 0.0109, "frac_reward_zero_std": 0.75, "grad_norm": 0.07855388522148132, "kl": 0.7573478296399117, "learning_rate": 3.5798938704856805e-05, "loss": 0.0013, "num_tokens": 24095384.0, "reward": 0.9044984579086304, "reward_std": 0.1877453476190567, "rewards/rollout_reward_func/mean": 0.9044984579086304, "rewards/rollout_reward_func/std": 0.3944528102874756, "sampling/importance_sampling_ratio/max": 1.1010714769363403, "sampling/importance_sampling_ratio/mean": 0.999301016330719, "sampling/importance_sampling_ratio/min": 0.8944228887557983, "sampling/sampling_logp_difference/max": 0.11547541618347168, "sampling/sampling_logp_difference/mean": 0.002995716407895088, "step": 545, "step_time": 10.322614259000147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 479.1875, "completions/mean_terminated_length": 479.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07357802242040634, "epoch": 0.01092, "frac_reward_zero_std": 0.75, "grad_norm": 0.014120426028966904, "kl": 0.6411944720894098, "learning_rate": 3.569563366536456e-05, "loss": 0.0036, "num_tokens": 24139799.0, "reward": 0.9669984579086304, "reward_std": 0.09334250539541245, "rewards/rollout_reward_func/mean": 0.9669984579086304, "rewards/rollout_reward_func/std": 0.1866850107908249, "sampling/importance_sampling_ratio/max": 1.1685829162597656, "sampling/importance_sampling_ratio/mean": 0.9999786019325256, "sampling/importance_sampling_ratio/min": 0.8881136775016785, "sampling/sampling_logp_difference/max": 0.15574324131011963, "sampling/sampling_logp_difference/mean": 0.002879334846511483, "step": 546, "step_time": 10.161477444000411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1731.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 335.90625, "completions/mean_terminated_length": 335.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07991821016184986, "epoch": 0.01094, "frac_reward_zero_std": 0.25, "grad_norm": 0.07133892178535461, "kl": 1.1740493213874288, "learning_rate": 3.559241894598764e-05, "loss": 0.008, "num_tokens": 24179386.0, "reward": 0.5991842746734619, "reward_std": 0.5831258296966553, "rewards/rollout_reward_func/mean": 0.5991842746734619, "rewards/rollout_reward_func/std": 0.782845139503479, "sampling/importance_sampling_ratio/max": 1.0795685052871704, "sampling/importance_sampling_ratio/mean": 1.0087943077087402, "sampling/importance_sampling_ratio/min": 0.9837927222251892, "sampling/sampling_logp_difference/max": 0.07680320739746094, "sampling/sampling_logp_difference/mean": 0.00285546132363379, "step": 547, "step_time": 10.27679277000152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 614.03125, "completions/mean_terminated_length": 614.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11185186338843778, "epoch": 0.01096, "frac_reward_zero_std": 0.25, "grad_norm": 0.3700685203075409, "kl": 0.8598261177539825, "learning_rate": 3.5489296025441714e-05, "loss": -0.0027, "num_tokens": 24230267.0, "reward": 0.6889480948448181, "reward_std": 0.5759731531143188, "rewards/rollout_reward_func/mean": 0.6889480948448181, "rewards/rollout_reward_func/std": 0.6827472448348999, "sampling/importance_sampling_ratio/max": 1.2381411790847778, "sampling/importance_sampling_ratio/mean": 1.0102765560150146, "sampling/importance_sampling_ratio/min": 0.88978511095047, "sampling/sampling_logp_difference/max": 0.21292471885681152, "sampling/sampling_logp_difference/mean": 0.004142694640904665, "step": 548, "step_time": 13.2966723310019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 531.34375, "completions/mean_terminated_length": 531.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1091716755181551, "epoch": 0.01098, "frac_reward_zero_std": 0.0, "grad_norm": 0.08050285279750824, "kl": 0.8655238058418036, "learning_rate": 3.5386266381127297e-05, "loss": 0.0154, "num_tokens": 24277736.0, "reward": 0.3693757653236389, "reward_std": 0.7428185343742371, "rewards/rollout_reward_func/mean": 0.3693757653236389, "rewards/rollout_reward_func/std": 0.7811435461044312, "sampling/importance_sampling_ratio/max": 1.0405900478363037, "sampling/importance_sampling_ratio/mean": 1.0013645887374878, "sampling/importance_sampling_ratio/min": 0.9230137467384338, "sampling/sampling_logp_difference/max": 0.07950127124786377, "sampling/sampling_logp_difference/mean": 0.002035804558545351, "step": 549, "step_time": 11.245523530001265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1724.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 427.46875, "completions/mean_terminated_length": 427.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06830315804108977, "epoch": 0.011, "frac_reward_zero_std": 0.0, "grad_norm": 0.12196504324674606, "kl": 1.366571482270956, "learning_rate": 3.5283331489108557e-05, "loss": 0.0043, "num_tokens": 24321105.0, "reward": 0.7708879709243774, "reward_std": 0.5184255838394165, "rewards/rollout_reward_func/mean": 0.7708879709243774, "rewards/rollout_reward_func/std": 0.5064700245857239, "sampling/importance_sampling_ratio/max": 1.1261656284332275, "sampling/importance_sampling_ratio/mean": 0.9988199472427368, "sampling/importance_sampling_ratio/min": 0.8825982809066772, "sampling/sampling_logp_difference/max": 0.12491989135742188, "sampling/sampling_logp_difference/mean": 0.003655706997960806, "step": 550, "step_time": 11.033491805999802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 286.125, "completions/mean_terminated_length": 286.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07140315137803555, "epoch": 0.01102, "frac_reward_zero_std": 0.0, "grad_norm": 0.12634344398975372, "kl": 0.9361300878226757, "learning_rate": 3.518049282409222e-05, "loss": 0.0144, "num_tokens": 24358155.0, "reward": 0.7706634402275085, "reward_std": 0.5161359906196594, "rewards/rollout_reward_func/mean": 0.7706634402275085, "rewards/rollout_reward_func/std": 0.5042467713356018, "sampling/importance_sampling_ratio/max": 1.225901484489441, "sampling/importance_sampling_ratio/mean": 1.0107979774475098, "sampling/importance_sampling_ratio/min": 0.9983251094818115, "sampling/sampling_logp_difference/max": 0.17029213905334473, "sampling/sampling_logp_difference/mean": 0.0026574251241981983, "step": 551, "step_time": 8.765841622999687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1625.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 597.78125, "completions/mean_terminated_length": 597.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09820019640028477, "epoch": 0.01104, "frac_reward_zero_std": 0.0, "grad_norm": 0.18156732618808746, "kl": 1.0535610280930996, "learning_rate": 3.5077751859406395e-05, "loss": 0.0126, "num_tokens": 24408641.0, "reward": 0.7810161113739014, "reward_std": 0.5373966693878174, "rewards/rollout_reward_func/mean": 0.7810161113739014, "rewards/rollout_reward_func/std": 0.5471912026405334, "sampling/importance_sampling_ratio/max": 1.130866289138794, "sampling/importance_sampling_ratio/mean": 1.0004465579986572, "sampling/importance_sampling_ratio/min": 0.8755868077278137, "sampling/sampling_logp_difference/max": 0.12304925918579102, "sampling/sampling_logp_difference/mean": 0.003040411975234747, "step": 552, "step_time": 10.97889220900015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 295.15625, "completions/mean_terminated_length": 295.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07453312212601304, "epoch": 0.01106, "frac_reward_zero_std": 0.25, "grad_norm": 0.05603642016649246, "kl": 1.0613846564665437, "learning_rate": 3.497511006697944e-05, "loss": -0.0042, "num_tokens": 24446228.0, "reward": 0.8015768527984619, "reward_std": 0.40978145599365234, "rewards/rollout_reward_func/mean": 0.8015768527984619, "rewards/rollout_reward_func/std": 0.4867049753665924, "sampling/importance_sampling_ratio/max": 1.106373906135559, "sampling/importance_sampling_ratio/mean": 1.0060598850250244, "sampling/importance_sampling_ratio/min": 0.9825282096862793, "sampling/sampling_logp_difference/max": 0.10099458694458008, "sampling/sampling_logp_difference/mean": 0.0019944636151194572, "step": 553, "step_time": 10.615755326998624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 559.15625, "completions/mean_terminated_length": 559.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09941067965701222, "epoch": 0.01108, "frac_reward_zero_std": 0.0, "grad_norm": 0.09402045607566833, "kl": 0.7556006871163845, "learning_rate": 3.487256891731897e-05, "loss": 0.0152, "num_tokens": 24494173.0, "reward": 0.7546025514602661, "reward_std": 0.5763430595397949, "rewards/rollout_reward_func/mean": 0.7546025514602661, "rewards/rollout_reward_func/std": 0.6053285002708435, "sampling/importance_sampling_ratio/max": 1.0283435583114624, "sampling/importance_sampling_ratio/mean": 1.0008455514907837, "sampling/importance_sampling_ratio/min": 0.947740375995636, "sampling/sampling_logp_difference/max": 0.0536273717880249, "sampling/sampling_logp_difference/mean": 0.001423984533175826, "step": 554, "step_time": 11.11397307399875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 506.40625, "completions/mean_terminated_length": 506.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08800830598920584, "epoch": 0.0111, "frac_reward_zero_std": 0.25, "grad_norm": 0.023550089448690414, "kl": 0.6954366881400347, "learning_rate": 3.4770129879490695e-05, "loss": -0.0055, "num_tokens": 24540230.0, "reward": 0.7129209041595459, "reward_std": 0.4788580536842346, "rewards/rollout_reward_func/mean": 0.7129209041595459, "rewards/rollout_reward_func/std": 0.6360123157501221, "sampling/importance_sampling_ratio/max": 1.1221601963043213, "sampling/importance_sampling_ratio/mean": 1.0029281377792358, "sampling/importance_sampling_ratio/min": 0.956020176410675, "sampling/sampling_logp_difference/max": 0.11525630950927734, "sampling/sampling_logp_difference/mean": 0.002573401667177677, "step": 555, "step_time": 10.395756402001098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 484.40625, "completions/mean_terminated_length": 484.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07824740349315107, "epoch": 0.01112, "frac_reward_zero_std": 0.25, "grad_norm": 0.09259714931249619, "kl": 0.5817960510030389, "learning_rate": 3.466779442109741e-05, "loss": 0.0058, "num_tokens": 24586492.0, "reward": 0.7782049179077148, "reward_std": 0.44496509432792664, "rewards/rollout_reward_func/mean": 0.7782049179077148, "rewards/rollout_reward_func/std": 0.5534802079200745, "sampling/importance_sampling_ratio/max": 1.1186918020248413, "sampling/importance_sampling_ratio/mean": 0.9988079071044922, "sampling/importance_sampling_ratio/min": 0.8842870593070984, "sampling/sampling_logp_difference/max": 0.12301826477050781, "sampling/sampling_logp_difference/mean": 0.002211349317803979, "step": 556, "step_time": 10.613902565999524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1728.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 533.15625, "completions/mean_terminated_length": 533.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09231396188260987, "epoch": 0.01114, "frac_reward_zero_std": 0.0, "grad_norm": 0.6306222081184387, "kl": 1.1956517398357391, "learning_rate": 3.456556400825799e-05, "loss": 0.0103, "num_tokens": 24634057.0, "reward": 0.6881695985794067, "reward_std": 0.6534982919692993, "rewards/rollout_reward_func/mean": 0.6881695985794067, "rewards/rollout_reward_func/std": 0.6326475739479065, "sampling/importance_sampling_ratio/max": 1.4814976453781128, "sampling/importance_sampling_ratio/mean": 1.0144710540771484, "sampling/importance_sampling_ratio/min": 0.8416323065757751, "sampling/sampling_logp_difference/max": 0.3940155506134033, "sampling/sampling_logp_difference/mean": 0.005072933621704578, "step": 557, "step_time": 10.61361218500133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 462.625, "completions/mean_terminated_length": 462.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08773140422999859, "epoch": 0.01116, "frac_reward_zero_std": 0.25, "grad_norm": 0.1332550048828125, "kl": 0.7398058883845806, "learning_rate": 3.446344010558635e-05, "loss": 0.0023, "num_tokens": 24679313.0, "reward": 0.7153999209403992, "reward_std": 0.5227551460266113, "rewards/rollout_reward_func/mean": 0.7153999209403992, "rewards/rollout_reward_func/std": 0.6314928531646729, "sampling/importance_sampling_ratio/max": 1.1279304027557373, "sampling/importance_sampling_ratio/mean": 0.9967726469039917, "sampling/importance_sampling_ratio/min": 0.889435350894928, "sampling/sampling_logp_difference/max": 0.12183761596679688, "sampling/sampling_logp_difference/mean": 0.003484393237158656, "step": 558, "step_time": 10.436619125998732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 392.375, "completions/mean_terminated_length": 392.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.061868378194049, "epoch": 0.01118, "frac_reward_zero_std": 0.25, "grad_norm": 0.25230902433395386, "kl": 0.7120536919683218, "learning_rate": 3.436142417617047e-05, "loss": -0.0021, "num_tokens": 24720165.0, "reward": 0.8079800605773926, "reward_std": 0.46023067831993103, "rewards/rollout_reward_func/mean": 0.8079800605773926, "rewards/rollout_reward_func/std": 0.5402814745903015, "sampling/importance_sampling_ratio/max": 1.1091960668563843, "sampling/importance_sampling_ratio/mean": 1.0039854049682617, "sampling/importance_sampling_ratio/min": 0.8868440985679626, "sampling/sampling_logp_difference/max": 0.12008428573608398, "sampling/sampling_logp_difference/mean": 0.00280478922650218, "step": 559, "step_time": 10.120074645001296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 440.53125, "completions/mean_terminated_length": 440.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0544001623056829, "epoch": 0.0112, "frac_reward_zero_std": 0.5, "grad_norm": 0.08690055459737778, "kl": 0.6723251212388277, "learning_rate": 3.4259517681551474e-05, "loss": -0.0015, "num_tokens": 24764171.0, "reward": 0.8633462190628052, "reward_std": 0.23929712176322937, "rewards/rollout_reward_func/mean": 0.8633462190628052, "rewards/rollout_reward_func/std": 0.3673882484436035, "sampling/importance_sampling_ratio/max": 1.0117974281311035, "sampling/importance_sampling_ratio/mean": 1.0005971193313599, "sampling/importance_sampling_ratio/min": 0.9946044683456421, "sampling/sampling_logp_difference/max": 0.01147422194480896, "sampling/sampling_logp_difference/mean": 0.0004988069995306432, "step": 560, "step_time": 10.057944903999669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 449.4375, "completions/mean_terminated_length": 449.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07509293779730797, "epoch": 0.01122, "frac_reward_zero_std": 0.5, "grad_norm": 0.022077307105064392, "kl": 0.7618245873600245, "learning_rate": 3.41577220817026e-05, "loss": 0.0006, "num_tokens": 24808339.0, "reward": 0.8997752666473389, "reward_std": 0.21850407123565674, "rewards/rollout_reward_func/mean": 0.8997752666473389, "rewards/rollout_reward_func/std": 0.3166113495826721, "sampling/importance_sampling_ratio/max": 1.0767594575881958, "sampling/importance_sampling_ratio/mean": 0.9997000098228455, "sampling/importance_sampling_ratio/min": 0.8887419700622559, "sampling/sampling_logp_difference/max": 0.11794805526733398, "sampling/sampling_logp_difference/mean": 0.0021239675115793943, "step": 561, "step_time": 10.2517233460012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 346.59375, "completions/mean_terminated_length": 346.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0550700374878943, "epoch": 0.01124, "frac_reward_zero_std": 0.0, "grad_norm": 0.0879012942314148, "kl": 0.889500092715025, "learning_rate": 3.4056038835008404e-05, "loss": 0.003, "num_tokens": 24848430.0, "reward": 0.7722513675689697, "reward_std": 0.495276540517807, "rewards/rollout_reward_func/mean": 0.7722513675689697, "rewards/rollout_reward_func/std": 0.5020740628242493, "sampling/importance_sampling_ratio/max": 1.0579100847244263, "sampling/importance_sampling_ratio/mean": 1.0027753114700317, "sampling/importance_sampling_ratio/min": 0.9947196841239929, "sampling/sampling_logp_difference/max": 0.05618619918823242, "sampling/sampling_logp_difference/mean": 0.0008175715338438749, "step": 562, "step_time": 9.985492644999795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 473.5625, "completions/mean_terminated_length": 473.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06071990530472249, "epoch": 0.01126, "frac_reward_zero_std": 0.25, "grad_norm": 0.04774719476699829, "kl": 0.6894225254654884, "learning_rate": 3.3954469398243765e-05, "loss": -0.0008, "num_tokens": 24892738.0, "reward": 0.8728525042533875, "reward_std": 0.359627366065979, "rewards/rollout_reward_func/mean": 0.8728525042533875, "rewards/rollout_reward_func/std": 0.42393240332603455, "sampling/importance_sampling_ratio/max": 1.0094190835952759, "sampling/importance_sampling_ratio/mean": 0.9949876070022583, "sampling/importance_sampling_ratio/min": 0.9295449256896973, "sampling/sampling_logp_difference/max": 0.06949186325073242, "sampling/sampling_logp_difference/mean": 0.0015654684975743294, "step": 563, "step_time": 11.700284592999196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1642.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 550.125, "completions/mean_terminated_length": 550.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0599458790384233, "epoch": 0.01128, "frac_reward_zero_std": 0.25, "grad_norm": 0.0395234115421772, "kl": 0.7378547787666321, "learning_rate": 3.385301522655306e-05, "loss": 0.002, "num_tokens": 24939582.0, "reward": 0.8654227256774902, "reward_std": 0.31524959206581116, "rewards/rollout_reward_func/mean": 0.8654227256774902, "rewards/rollout_reward_func/std": 0.3619850277900696, "sampling/importance_sampling_ratio/max": 1.065362572669983, "sampling/importance_sampling_ratio/mean": 1.000426173210144, "sampling/importance_sampling_ratio/min": 0.9374646544456482, "sampling/sampling_logp_difference/max": 0.09247374534606934, "sampling/sampling_logp_difference/mean": 0.0017499183304607868, "step": 564, "step_time": 10.58473867399971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 475.4375, "completions/mean_terminated_length": 475.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05030014459043741, "epoch": 0.0113, "frac_reward_zero_std": 0.5, "grad_norm": 0.04758532717823982, "kl": 0.4362086053006351, "learning_rate": 3.375167777342931e-05, "loss": -0.0039, "num_tokens": 24984145.0, "reward": 0.8153073787689209, "reward_std": 0.4021449089050293, "rewards/rollout_reward_func/mean": 0.8153073787689209, "rewards/rollout_reward_func/std": 0.5834363102912903, "sampling/importance_sampling_ratio/max": 1.005691647529602, "sampling/importance_sampling_ratio/mean": 0.9959691166877747, "sampling/importance_sampling_ratio/min": 0.884928286075592, "sampling/sampling_logp_difference/max": 0.12228155136108398, "sampling/sampling_logp_difference/mean": 0.0017600045539438725, "step": 565, "step_time": 10.307141406998198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 556.125, "completions/mean_terminated_length": 556.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06110541755333543, "epoch": 0.01132, "frac_reward_zero_std": 0.25, "grad_norm": 0.0270476546138525, "kl": 1.009704813361168, "learning_rate": 3.3650458490693373e-05, "loss": 0.0077, "num_tokens": 25031740.0, "reward": 0.8032385110855103, "reward_std": 0.410625159740448, "rewards/rollout_reward_func/mean": 0.8032385110855103, "rewards/rollout_reward_func/std": 0.4878070652484894, "sampling/importance_sampling_ratio/max": 1.0130436420440674, "sampling/importance_sampling_ratio/mean": 0.9980212450027466, "sampling/importance_sampling_ratio/min": 0.9720539450645447, "sampling/sampling_logp_difference/max": 0.022143036127090454, "sampling/sampling_logp_difference/mean": 0.0009665651014074683, "step": 566, "step_time": 13.524047979001807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1618.0, "completions/max_terminated_length": 1618.0, "completions/mean_length": 593.25, "completions/mean_terminated_length": 593.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06048167240805924, "epoch": 0.01134, "frac_reward_zero_std": 0.0, "grad_norm": 0.04896418750286102, "kl": 0.6688567046076059, "learning_rate": 3.3549358828473086e-05, "loss": 0.0083, "num_tokens": 25080925.0, "reward": 0.7119671702384949, "reward_std": 0.5331504344940186, "rewards/rollout_reward_func/mean": 0.7119671702384949, "rewards/rollout_reward_func/std": 0.5846304297447205, "sampling/importance_sampling_ratio/max": 1.0893357992172241, "sampling/importance_sampling_ratio/mean": 0.9957637190818787, "sampling/importance_sampling_ratio/min": 0.8870815634727478, "sampling/sampling_logp_difference/max": 0.1217656135559082, "sampling/sampling_logp_difference/mean": 0.0027378834784030914, "step": 567, "step_time": 11.473845211999105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1588.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 660.6875, "completions/mean_terminated_length": 660.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09055491629987955, "epoch": 0.01136, "frac_reward_zero_std": 0.25, "grad_norm": 0.08861355483531952, "kl": 0.59069555811584, "learning_rate": 3.344838023518258e-05, "loss": 0.0067, "num_tokens": 25132935.0, "reward": 0.8699397444725037, "reward_std": 0.36786600947380066, "rewards/rollout_reward_func/mean": 0.8699397444725037, "rewards/rollout_reward_func/std": 0.4281369149684906, "sampling/importance_sampling_ratio/max": 1.1206282377243042, "sampling/importance_sampling_ratio/mean": 1.0051299333572388, "sampling/importance_sampling_ratio/min": 0.9709334373474121, "sampling/sampling_logp_difference/max": 0.11435222625732422, "sampling/sampling_logp_difference/mean": 0.002717262366786599, "step": 568, "step_time": 10.801056285000413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 527.4375, "completions/mean_terminated_length": 527.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05072574340738356, "epoch": 0.01138, "frac_reward_zero_std": 0.5, "grad_norm": 0.016537653282284737, "kl": 0.7659144960343838, "learning_rate": 3.334752415750147e-05, "loss": -0.0028, "num_tokens": 25179715.0, "reward": 0.8994724154472351, "reward_std": 0.21979185938835144, "rewards/rollout_reward_func/mean": 0.8994724154472351, "rewards/rollout_reward_func/std": 0.3178686797618866, "sampling/importance_sampling_ratio/max": 1.0128957033157349, "sampling/importance_sampling_ratio/mean": 0.9995061755180359, "sampling/importance_sampling_ratio/min": 0.9695643186569214, "sampling/sampling_logp_difference/max": 0.030239030718803406, "sampling/sampling_logp_difference/mean": 0.0007199152023531497, "step": 569, "step_time": 10.363674786000047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 571.09375, "completions/mean_terminated_length": 571.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05622175079770386, "epoch": 0.0114, "frac_reward_zero_std": 0.25, "grad_norm": 0.0941125676035881, "kl": 0.9325131103396416, "learning_rate": 3.3246792040354106e-05, "loss": -0.0021, "num_tokens": 25228268.0, "reward": 0.7160430550575256, "reward_std": 0.5497441291809082, "rewards/rollout_reward_func/mean": 0.7160430550575256, "rewards/rollout_reward_func/std": 0.6338129043579102, "sampling/importance_sampling_ratio/max": 1.2457878589630127, "sampling/importance_sampling_ratio/mean": 1.0077428817749023, "sampling/importance_sampling_ratio/min": 0.9866790175437927, "sampling/sampling_logp_difference/max": 0.22083711624145508, "sampling/sampling_logp_difference/mean": 0.0019957295153290033, "step": 570, "step_time": 10.764289407999058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 453.65625, "completions/mean_terminated_length": 453.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04364934214390814, "epoch": 0.01142, "frac_reward_zero_std": 0.25, "grad_norm": 0.08336438238620758, "kl": 0.5613250248134136, "learning_rate": 3.3146185326888924e-05, "loss": -0.0143, "num_tokens": 25272329.0, "reward": 0.7446466088294983, "reward_std": 0.5154964923858643, "rewards/rollout_reward_func/mean": 0.7446466088294983, "rewards/rollout_reward_func/std": 0.627234160900116, "sampling/importance_sampling_ratio/max": 1.010794997215271, "sampling/importance_sampling_ratio/mean": 0.9967249631881714, "sampling/importance_sampling_ratio/min": 0.8957927227020264, "sampling/sampling_logp_difference/max": 0.1114349365234375, "sampling/sampling_logp_difference/mean": 0.0012317325454205275, "step": 571, "step_time": 10.29248423900026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 464.09375, "completions/mean_terminated_length": 464.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07378102885559201, "epoch": 0.01144, "frac_reward_zero_std": 0.0, "grad_norm": 0.11446607112884521, "kl": 0.7183163240551949, "learning_rate": 3.3045705458457784e-05, "loss": -0.0001, "num_tokens": 25316876.0, "reward": 0.6212726831436157, "reward_std": 0.7092105150222778, "rewards/rollout_reward_func/mean": 0.6212726831436157, "rewards/rollout_reward_func/std": 0.7035043835639954, "sampling/importance_sampling_ratio/max": 1.0919948816299438, "sampling/importance_sampling_ratio/mean": 1.0092285871505737, "sampling/importance_sampling_ratio/min": 0.9966561794281006, "sampling/sampling_logp_difference/max": 0.08797097206115723, "sampling/sampling_logp_difference/mean": 0.0019447874510660768, "step": 572, "step_time": 10.795912095000858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 377.625, "completions/mean_terminated_length": 377.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04316802887478843, "epoch": 0.01146, "frac_reward_zero_std": 0.5, "grad_norm": 0.10517268627882004, "kl": 0.9532385244965553, "learning_rate": 3.294535387459522e-05, "loss": 0.0023, "num_tokens": 25356782.0, "reward": 0.9316569566726685, "reward_std": 0.1933034062385559, "rewards/rollout_reward_func/mean": 0.9316569566726685, "rewards/rollout_reward_func/std": 0.2689686119556427, "sampling/importance_sampling_ratio/max": 1.2623112201690674, "sampling/importance_sampling_ratio/mean": 1.0010607242584229, "sampling/importance_sampling_ratio/min": 0.8840695023536682, "sampling/sampling_logp_difference/max": 0.21509337425231934, "sampling/sampling_logp_difference/mean": 0.003741877619177103, "step": 573, "step_time": 9.742134243001601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 637.71875, "completions/mean_terminated_length": 637.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05799759924411774, "epoch": 0.01148, "frac_reward_zero_std": 0.5, "grad_norm": 0.07938188314437866, "kl": 0.6310953013598919, "learning_rate": 3.284513201299795e-05, "loss": 0.0008, "num_tokens": 25407563.0, "reward": 0.8139843940734863, "reward_std": 0.36622118949890137, "rewards/rollout_reward_func/mean": 0.8139843940734863, "rewards/rollout_reward_func/std": 0.5272975564002991, "sampling/importance_sampling_ratio/max": 1.010514497756958, "sampling/importance_sampling_ratio/mean": 0.9985403418540955, "sampling/importance_sampling_ratio/min": 0.9651868939399719, "sampling/sampling_logp_difference/max": 0.03740501403808594, "sampling/sampling_logp_difference/mean": 0.0008437420474365354, "step": 574, "step_time": 10.79786353199961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1476.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 457.34375, "completions/mean_terminated_length": 457.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05375830712728202, "epoch": 0.0115, "frac_reward_zero_std": 0.0, "grad_norm": 0.056164227426052094, "kl": 0.8112361617386341, "learning_rate": 3.274504130950419e-05, "loss": 0.0092, "num_tokens": 25452630.0, "reward": 0.7429511547088623, "reward_std": 0.5138827562332153, "rewards/rollout_reward_func/mean": 0.7429511547088623, "rewards/rollout_reward_func/std": 0.574743926525116, "sampling/importance_sampling_ratio/max": 1.0741488933563232, "sampling/importance_sampling_ratio/mean": 1.0001592636108398, "sampling/importance_sampling_ratio/min": 0.8848416209220886, "sampling/sampling_logp_difference/max": 0.12162256240844727, "sampling/sampling_logp_difference/mean": 0.0019081755308434367, "step": 575, "step_time": 10.679345669001123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1656.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 390.0, "completions/mean_terminated_length": 390.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03743913350626826, "epoch": 0.01152, "frac_reward_zero_std": 0.0, "grad_norm": 0.14498722553253174, "kl": 0.8838392496109009, "learning_rate": 3.264508319807308e-05, "loss": -0.0092, "num_tokens": 25495006.0, "reward": 0.7392046451568604, "reward_std": 0.5859452486038208, "rewards/rollout_reward_func/mean": 0.7392046451568604, "rewards/rollout_reward_func/std": 0.5801333785057068, "sampling/importance_sampling_ratio/max": 1.017979383468628, "sampling/importance_sampling_ratio/mean": 0.994530975818634, "sampling/importance_sampling_ratio/min": 0.8052058815956116, "sampling/sampling_logp_difference/max": 0.2166457176208496, "sampling/sampling_logp_difference/mean": 0.0022436100989580154, "step": 576, "step_time": 10.564995231999092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 522.0, "completions/mean_terminated_length": 522.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05239729629829526, "epoch": 0.01154, "frac_reward_zero_std": 0.25, "grad_norm": 0.07082108408212662, "kl": 0.8655131310224533, "learning_rate": 3.2545259110764225e-05, "loss": -0.0037, "num_tokens": 25542084.0, "reward": 0.6554412841796875, "reward_std": 0.634579062461853, "rewards/rollout_reward_func/mean": 0.6554412841796875, "rewards/rollout_reward_func/std": 0.7426300048828125, "sampling/importance_sampling_ratio/max": 1.0169554948806763, "sampling/importance_sampling_ratio/mean": 0.99489426612854, "sampling/importance_sampling_ratio/min": 0.8866797685623169, "sampling/sampling_logp_difference/max": 0.12006902694702148, "sampling/sampling_logp_difference/mean": 0.001580212265253067, "step": 577, "step_time": 11.083301267999559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1662.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 617.65625, "completions/mean_terminated_length": 617.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0511127095669508, "epoch": 0.01156, "frac_reward_zero_std": 0.0, "grad_norm": 0.08029000461101532, "kl": 0.7078675404191017, "learning_rate": 3.2445570477717066e-05, "loss": 0.0064, "num_tokens": 25592001.0, "reward": 0.5894489288330078, "reward_std": 0.7103238105773926, "rewards/rollout_reward_func/mean": 0.5894489288330078, "rewards/rollout_reward_func/std": 0.7090772390365601, "sampling/importance_sampling_ratio/max": 1.1335982084274292, "sampling/importance_sampling_ratio/mean": 1.0020458698272705, "sampling/importance_sampling_ratio/min": 0.9343123435974121, "sampling/sampling_logp_difference/max": 0.12321090698242188, "sampling/sampling_logp_difference/mean": 0.001700867898762226, "step": 578, "step_time": 10.944191212001897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 638.875, "completions/mean_terminated_length": 638.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.045509020681492984, "epoch": 0.01158, "frac_reward_zero_std": 0.25, "grad_norm": 0.03866994008421898, "kl": 0.6670006550848484, "learning_rate": 3.234601872713047e-05, "loss": 0.0073, "num_tokens": 25642882.0, "reward": 0.8723267912864685, "reward_std": 0.36111441254615784, "rewards/rollout_reward_func/mean": 0.8723267912864685, "rewards/rollout_reward_func/std": 0.4261328876018524, "sampling/importance_sampling_ratio/max": 1.01780366897583, "sampling/importance_sampling_ratio/mean": 0.995041012763977, "sampling/importance_sampling_ratio/min": 0.8999615907669067, "sampling/sampling_logp_difference/max": 0.11658096313476562, "sampling/sampling_logp_difference/mean": 0.001991729252040386, "step": 579, "step_time": 13.290279315999214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 352.71875, "completions/mean_terminated_length": 352.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.033996846177615225, "epoch": 0.0116, "frac_reward_zero_std": 0.5, "grad_norm": 0.02001606673002243, "kl": 0.8575951494276524, "learning_rate": 3.2246605285242225e-05, "loss": 0.0008, "num_tokens": 25682633.0, "reward": 0.8079788684844971, "reward_std": 0.37654584646224976, "rewards/rollout_reward_func/mean": 0.8079788684844971, "rewards/rollout_reward_func/std": 0.5423957109451294, "sampling/importance_sampling_ratio/max": 1.009078860282898, "sampling/importance_sampling_ratio/mean": 1.0000061988830566, "sampling/importance_sampling_ratio/min": 0.9877318739891052, "sampling/sampling_logp_difference/max": 0.014470607042312622, "sampling/sampling_logp_difference/mean": 0.00035498972283676267, "step": 580, "step_time": 10.245843507002064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2252.0, "completions/max_terminated_length": 2252.0, "completions/mean_length": 674.84375, "completions/mean_terminated_length": 674.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.055820784298703074, "epoch": 0.01162, "frac_reward_zero_std": 0.0, "grad_norm": 0.19710077345371246, "kl": 1.1436741799116135, "learning_rate": 3.2147331576308676e-05, "loss": 0.0008, "num_tokens": 25735649.0, "reward": 0.5529778003692627, "reward_std": 0.6775771975517273, "rewards/rollout_reward_func/mean": 0.5529778003692627, "rewards/rollout_reward_func/std": 0.716241717338562, "sampling/importance_sampling_ratio/max": 1.070412516593933, "sampling/importance_sampling_ratio/mean": 0.997238278388977, "sampling/importance_sampling_ratio/min": 0.8639726638793945, "sampling/sampling_logp_difference/max": 0.1471172571182251, "sampling/sampling_logp_difference/mean": 0.00173847540281713, "step": 581, "step_time": 12.530836697001178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 560.0, "completions/mean_terminated_length": 560.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.052942414418794215, "epoch": 0.01164, "frac_reward_zero_std": 0.0, "grad_norm": 0.10953519493341446, "kl": 0.6612076163291931, "learning_rate": 3.20481990225842e-05, "loss": 0.0213, "num_tokens": 25785146.0, "reward": 0.5371942520141602, "reward_std": 0.7296640276908875, "rewards/rollout_reward_func/mean": 0.5371942520141602, "rewards/rollout_reward_func/std": 0.7852379083633423, "sampling/importance_sampling_ratio/max": 1.1282936334609985, "sampling/importance_sampling_ratio/mean": 1.0027717351913452, "sampling/importance_sampling_ratio/min": 0.9448708891868591, "sampling/sampling_logp_difference/max": 0.1200723648071289, "sampling/sampling_logp_difference/mean": 0.0017096400260925293, "step": 582, "step_time": 11.338965951002137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 357.0625, "completions/mean_terminated_length": 357.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06394052249379456, "epoch": 0.01166, "frac_reward_zero_std": 0.25, "grad_norm": 0.1170019879937172, "kl": 0.9251650087535381, "learning_rate": 3.194920904430096e-05, "loss": -0.0061, "num_tokens": 25826286.0, "reward": 0.5901851654052734, "reward_std": 0.6181913018226624, "rewards/rollout_reward_func/mean": 0.5901851654052734, "rewards/rollout_reward_func/std": 0.7506319284439087, "sampling/importance_sampling_ratio/max": 1.0694855451583862, "sampling/importance_sampling_ratio/mean": 1.003063678741455, "sampling/importance_sampling_ratio/min": 0.9832593202590942, "sampling/sampling_logp_difference/max": 0.10276961326599121, "sampling/sampling_logp_difference/mean": 0.0020484630949795246, "step": 583, "step_time": 9.78620875300021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 469.71875, "completions/mean_terminated_length": 469.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04687017964897677, "epoch": 0.01168, "frac_reward_zero_std": 0.0, "grad_norm": 0.13207362592220306, "kl": 1.114575769752264, "learning_rate": 3.185036305964847e-05, "loss": 0.0014, "num_tokens": 25872040.0, "reward": 0.7142717242240906, "reward_std": 0.6216610670089722, "rewards/rollout_reward_func/mean": 0.7142717242240906, "rewards/rollout_reward_func/std": 0.6318053007125854, "sampling/importance_sampling_ratio/max": 1.7049942016601562, "sampling/importance_sampling_ratio/mean": 1.0154902935028076, "sampling/importance_sampling_ratio/min": 0.8728632926940918, "sampling/sampling_logp_difference/max": 0.5337404012680054, "sampling/sampling_logp_difference/mean": 0.005555583629757166, "step": 584, "step_time": 11.310513551999065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 529.125, "completions/mean_terminated_length": 529.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03337370470399037, "epoch": 0.0117, "frac_reward_zero_std": 0.25, "grad_norm": 0.0073763709515333176, "kl": 0.600144200026989, "learning_rate": 3.17516624847533e-05, "loss": -0.0071, "num_tokens": 25919937.0, "reward": 0.8361408710479736, "reward_std": 0.39850640296936035, "rewards/rollout_reward_func/mean": 0.8361408710479736, "rewards/rollout_reward_func/std": 0.46294698119163513, "sampling/importance_sampling_ratio/max": 1.1319106817245483, "sampling/importance_sampling_ratio/mean": 1.0040203332901, "sampling/importance_sampling_ratio/min": 0.9792770147323608, "sampling/sampling_logp_difference/max": 0.12439441680908203, "sampling/sampling_logp_difference/mean": 0.0013090863358229399, "step": 585, "step_time": 10.876237146999301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 619.25, "completions/mean_terminated_length": 619.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.059292487101629376, "epoch": 0.01172, "frac_reward_zero_std": 0.25, "grad_norm": 0.06619912385940552, "kl": 0.7410234734416008, "learning_rate": 3.1653108733658806e-05, "loss": -0.0019, "num_tokens": 25970242.0, "reward": 0.6906378269195557, "reward_std": 0.5944950580596924, "rewards/rollout_reward_func/mean": 0.6906378269195557, "rewards/rollout_reward_func/std": 0.6828278303146362, "sampling/importance_sampling_ratio/max": 1.0078870058059692, "sampling/importance_sampling_ratio/mean": 0.9947800040245056, "sampling/importance_sampling_ratio/min": 0.8948057293891907, "sampling/sampling_logp_difference/max": 0.10961297154426575, "sampling/sampling_logp_difference/mean": 0.0016230086330324411, "step": 586, "step_time": 14.543571870998676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 505.875, "completions/mean_terminated_length": 505.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04456609417684376, "epoch": 0.01174, "frac_reward_zero_std": 0.25, "grad_norm": 0.02321828156709671, "kl": 0.6940156186465174, "learning_rate": 3.155470321830486e-05, "loss": -0.0069, "num_tokens": 26016265.0, "reward": 0.8023244142532349, "reward_std": 0.4077289402484894, "rewards/rollout_reward_func/mean": 0.8023244142532349, "rewards/rollout_reward_func/std": 0.48603296279907227, "sampling/importance_sampling_ratio/max": 1.0639842748641968, "sampling/importance_sampling_ratio/mean": 1.0033493041992188, "sampling/importance_sampling_ratio/min": 0.9926769137382507, "sampling/sampling_logp_difference/max": 0.07414482533931732, "sampling/sampling_logp_difference/mean": 0.0015051767695695162, "step": 587, "step_time": 10.80153842299751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 522.84375, "completions/mean_terminated_length": 522.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05761521740350872, "epoch": 0.01176, "frac_reward_zero_std": 0.25, "grad_norm": 0.062394458800554276, "kl": 0.6526036553550512, "learning_rate": 3.145644734850761e-05, "loss": 0.0013, "num_tokens": 26063638.0, "reward": 0.8446667194366455, "reward_std": 0.4393489956855774, "rewards/rollout_reward_func/mean": 0.8446667194366455, "rewards/rollout_reward_func/std": 0.5104753375053406, "sampling/importance_sampling_ratio/max": 1.0065720081329346, "sampling/importance_sampling_ratio/mean": 0.9968211650848389, "sampling/importance_sampling_ratio/min": 0.9782474040985107, "sampling/sampling_logp_difference/max": 0.014211460947990417, "sampling/sampling_logp_difference/mean": 0.0009589214459992945, "step": 588, "step_time": 10.56969767100054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 466.46875, "completions/mean_terminated_length": 466.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.056188413174822927, "epoch": 0.01178, "frac_reward_zero_std": 0.25, "grad_norm": 0.06460254639387131, "kl": 0.6442716289311647, "learning_rate": 3.135834253193929e-05, "loss": -0.0014, "num_tokens": 26108800.0, "reward": 0.8448752164840698, "reward_std": 0.43875908851623535, "rewards/rollout_reward_func/mean": 0.8448752164840698, "rewards/rollout_reward_func/std": 0.5052769780158997, "sampling/importance_sampling_ratio/max": 1.4065810441970825, "sampling/importance_sampling_ratio/mean": 1.0066642761230469, "sampling/importance_sampling_ratio/min": 0.9595826268196106, "sampling/sampling_logp_difference/max": 0.3391381502151489, "sampling/sampling_logp_difference/mean": 0.00410043727606535, "step": 589, "step_time": 11.558418390998668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 591.09375, "completions/mean_terminated_length": 591.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.11577376164495945, "epoch": 0.0118, "frac_reward_zero_std": 0.25, "grad_norm": 0.06325645744800568, "kl": 0.5047505535185337, "learning_rate": 3.1260390174108086e-05, "loss": 0.0009, "num_tokens": 26158947.0, "reward": 0.8385635018348694, "reward_std": 0.39169222116470337, "rewards/rollout_reward_func/mean": 0.8385635018348694, "rewards/rollout_reward_func/std": 0.4560070335865021, "sampling/importance_sampling_ratio/max": 1.0285624265670776, "sampling/importance_sampling_ratio/mean": 0.983589231967926, "sampling/importance_sampling_ratio/min": 0.8434584140777588, "sampling/sampling_logp_difference/max": 0.17139816284179688, "sampling/sampling_logp_difference/mean": 0.005378573667258024, "step": 590, "step_time": 12.675432901000022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1617.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 403.78125, "completions/mean_terminated_length": 403.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07413900434039533, "epoch": 0.01182, "frac_reward_zero_std": 0.25, "grad_norm": 0.35618120431900024, "kl": 0.7016339115798473, "learning_rate": 3.116259167833792e-05, "loss": 0.0085, "num_tokens": 26200784.0, "reward": 0.8002158999443054, "reward_std": 0.4153405427932739, "rewards/rollout_reward_func/mean": 0.8002158999443054, "rewards/rollout_reward_func/std": 0.4915505051612854, "sampling/importance_sampling_ratio/max": 1.0203686952590942, "sampling/importance_sampling_ratio/mean": 0.9926114082336426, "sampling/importance_sampling_ratio/min": 0.8621559739112854, "sampling/sampling_logp_difference/max": 0.1952928900718689, "sampling/sampling_logp_difference/mean": 0.0032671019434928894, "step": 591, "step_time": 10.52327421299924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 422.03125, "completions/mean_terminated_length": 422.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07656179834157228, "epoch": 0.01184, "frac_reward_zero_std": 0.25, "grad_norm": 0.5632655620574951, "kl": 0.7104943431913853, "learning_rate": 3.106494844574844e-05, "loss": -0.0024, "num_tokens": 26243264.0, "reward": 0.8983461856842041, "reward_std": 0.2875204086303711, "rewards/rollout_reward_func/mean": 0.8983461856842041, "rewards/rollout_reward_func/std": 0.3211221396923065, "sampling/importance_sampling_ratio/max": 1.061615228652954, "sampling/importance_sampling_ratio/mean": 0.985357403755188, "sampling/importance_sampling_ratio/min": 0.7921319007873535, "sampling/sampling_logp_difference/max": 0.23382019996643066, "sampling/sampling_logp_difference/mean": 0.0055868737399578094, "step": 592, "step_time": 10.205570270000862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1654.0, "completions/max_terminated_length": 1654.0, "completions/mean_length": 429.21875, "completions/mean_terminated_length": 429.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06378566252533346, "epoch": 0.01186, "frac_reward_zero_std": 0.0, "grad_norm": 0.07540275156497955, "kl": 1.0865878462791443, "learning_rate": 3.0967461875234866e-05, "loss": 0.011, "num_tokens": 26286415.0, "reward": 0.6548122763633728, "reward_std": 0.6782457232475281, "rewards/rollout_reward_func/mean": 0.6548122763633728, "rewards/rollout_reward_func/std": 0.6944438815116882, "sampling/importance_sampling_ratio/max": 1.0281598567962646, "sampling/importance_sampling_ratio/mean": 0.9985912442207336, "sampling/importance_sampling_ratio/min": 0.9597791433334351, "sampling/sampling_logp_difference/max": 0.11409664154052734, "sampling/sampling_logp_difference/mean": 0.002414529910311103, "step": 593, "step_time": 10.013227765999545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 616.3125, "completions/mean_terminated_length": 616.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08366718795150518, "epoch": 0.01188, "frac_reward_zero_std": 0.0, "grad_norm": 0.0706484392285347, "kl": 0.8526046685874462, "learning_rate": 3.0870133363448005e-05, "loss": 0.0053, "num_tokens": 26337963.0, "reward": 0.7088956832885742, "reward_std": 0.5687005519866943, "rewards/rollout_reward_func/mean": 0.7088956832885742, "rewards/rollout_reward_func/std": 0.5935575366020203, "sampling/importance_sampling_ratio/max": 1.1428049802780151, "sampling/importance_sampling_ratio/mean": 1.0075340270996094, "sampling/importance_sampling_ratio/min": 0.9609537720680237, "sampling/sampling_logp_difference/max": 0.09016704559326172, "sampling/sampling_logp_difference/mean": 0.002260142471641302, "step": 594, "step_time": 10.785962120999102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 440.21875, "completions/mean_terminated_length": 440.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.048144208965823054, "epoch": 0.0119, "frac_reward_zero_std": 0.75, "grad_norm": 0.007406334392726421, "kl": 0.660571500658989, "learning_rate": 3.077296430477423e-05, "loss": 0.0036, "num_tokens": 26380953.0, "reward": 0.9664958119392395, "reward_std": 0.09476418048143387, "rewards/rollout_reward_func/mean": 0.9664958119392395, "rewards/rollout_reward_func/std": 0.18952834606170654, "sampling/importance_sampling_ratio/max": 1.1403172016143799, "sampling/importance_sampling_ratio/mean": 1.0065219402313232, "sampling/importance_sampling_ratio/min": 0.9941709637641907, "sampling/sampling_logp_difference/max": 0.14545774459838867, "sampling/sampling_logp_difference/mean": 0.0015885980101302266, "step": 595, "step_time": 9.568722184999388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2327.0, "completions/max_terminated_length": 2327.0, "completions/mean_length": 504.96875, "completions/mean_terminated_length": 504.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04734274453949183, "epoch": 0.01192, "frac_reward_zero_std": 0.5, "grad_norm": 0.2500431537628174, "kl": 0.8450574651360512, "learning_rate": 3.0675956091315446e-05, "loss": 0.0006, "num_tokens": 26426497.0, "reward": 0.8697185516357422, "reward_std": 0.3015957772731781, "rewards/rollout_reward_func/mean": 0.8697185516357422, "rewards/rollout_reward_func/std": 0.4313565492630005, "sampling/importance_sampling_ratio/max": 1.0015897750854492, "sampling/importance_sampling_ratio/mean": 0.9887312650680542, "sampling/importance_sampling_ratio/min": 0.8841390013694763, "sampling/sampling_logp_difference/max": 0.12330198287963867, "sampling/sampling_logp_difference/mean": 0.0028782528825104237, "step": 596, "step_time": 12.770920365000165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1745.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 598.78125, "completions/mean_terminated_length": 598.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06114278081804514, "epoch": 0.01194, "frac_reward_zero_std": 0.0, "grad_norm": 0.13369682431221008, "kl": 0.7028218007762916, "learning_rate": 3.057911011286924e-05, "loss": -0.0014, "num_tokens": 26476424.0, "reward": 0.6772143244743347, "reward_std": 0.5688470602035522, "rewards/rollout_reward_func/mean": 0.6772143244743347, "rewards/rollout_reward_func/std": 0.6000171303749084, "sampling/importance_sampling_ratio/max": 1.0098161697387695, "sampling/importance_sampling_ratio/mean": 0.9950616359710693, "sampling/importance_sampling_ratio/min": 0.8900092244148254, "sampling/sampling_logp_difference/max": 0.12317085266113281, "sampling/sampling_logp_difference/mean": 0.0014967541210353374, "step": 597, "step_time": 10.821340198001053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2358.0, "completions/max_terminated_length": 2358.0, "completions/mean_length": 523.46875, "completions/mean_terminated_length": 523.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.051744087191764265, "epoch": 0.01196, "frac_reward_zero_std": 0.0, "grad_norm": 0.2563270926475525, "kl": 1.0522995330393314, "learning_rate": 3.048242775690893e-05, "loss": -0.0062, "num_tokens": 26522190.0, "reward": 0.7768058776855469, "reward_std": 0.5144651532173157, "rewards/rollout_reward_func/mean": 0.7768058776855469, "rewards/rollout_reward_func/std": 0.550489068031311, "sampling/importance_sampling_ratio/max": 1.1040617227554321, "sampling/importance_sampling_ratio/mean": 1.001006841659546, "sampling/importance_sampling_ratio/min": 0.8666796088218689, "sampling/sampling_logp_difference/max": 0.13985997438430786, "sampling/sampling_logp_difference/mean": 0.0028128682170063257, "step": 598, "step_time": 12.310543302999577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1717.0, "completions/max_terminated_length": 1717.0, "completions/mean_length": 523.375, "completions/mean_terminated_length": 523.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05367401347029954, "epoch": 0.01198, "frac_reward_zero_std": 0.25, "grad_norm": 0.21900199353694916, "kl": 0.9659171961247921, "learning_rate": 3.0385910408563622e-05, "loss": 0.0215, "num_tokens": 26567651.0, "reward": 0.6881104111671448, "reward_std": 0.5018270611763, "rewards/rollout_reward_func/mean": 0.6881104111671448, "rewards/rollout_reward_func/std": 0.6380835175514221, "sampling/importance_sampling_ratio/max": 1.0346837043762207, "sampling/importance_sampling_ratio/mean": 0.9909421801567078, "sampling/importance_sampling_ratio/min": 0.7819785475730896, "sampling/sampling_logp_difference/max": 0.23006296157836914, "sampling/sampling_logp_difference/mean": 0.004670243710279465, "step": 599, "step_time": 11.50710947099924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2233.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 385.1875, "completions/mean_terminated_length": 385.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.061695286945905536, "epoch": 0.012, "frac_reward_zero_std": 0.25, "grad_norm": 0.2297777533531189, "kl": 0.9877367317676544, "learning_rate": 3.0289559450598485e-05, "loss": 0.0158, "num_tokens": 26607662.0, "reward": 0.8727279901504517, "reward_std": 0.359979510307312, "rewards/rollout_reward_func/mean": 0.8727279901504517, "rewards/rollout_reward_func/std": 0.4256633222103119, "sampling/importance_sampling_ratio/max": 1.7895560264587402, "sampling/importance_sampling_ratio/mean": 1.0264379978179932, "sampling/importance_sampling_ratio/min": 0.9187449216842651, "sampling/sampling_logp_difference/max": 0.39458906650543213, "sampling/sampling_logp_difference/mean": 0.005934265907853842, "step": 600, "step_time": 12.093460416001108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 558.625, "completions/mean_terminated_length": 558.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08278254757169634, "epoch": 0.01202, "frac_reward_zero_std": 0.0, "grad_norm": 0.11278097331523895, "kl": 0.9130514711141586, "learning_rate": 3.0193376263394845e-05, "loss": 0.0165, "num_tokens": 26656058.0, "reward": 0.5931788682937622, "reward_std": 0.7166495323181152, "rewards/rollout_reward_func/mean": 0.5931788682937622, "rewards/rollout_reward_func/std": 0.7487666010856628, "sampling/importance_sampling_ratio/max": 1.1278643608093262, "sampling/importance_sampling_ratio/mean": 1.0120519399642944, "sampling/importance_sampling_ratio/min": 0.9852837920188904, "sampling/sampling_logp_difference/max": 0.12270259857177734, "sampling/sampling_logp_difference/mean": 0.003029154147952795, "step": 601, "step_time": 11.617470346001937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 666.28125, "completions/mean_terminated_length": 666.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.066440170397982, "epoch": 0.01204, "frac_reward_zero_std": 0.75, "grad_norm": 0.05043506994843483, "kl": 0.750783184543252, "learning_rate": 3.009736222493047e-05, "loss": 0.0059, "num_tokens": 26708307.0, "reward": 0.9397852420806885, "reward_std": 0.17031307518482208, "rewards/rollout_reward_func/mean": 0.9397852420806885, "rewards/rollout_reward_func/std": 0.34062615036964417, "sampling/importance_sampling_ratio/max": 1.0836553573608398, "sampling/importance_sampling_ratio/mean": 0.9995386600494385, "sampling/importance_sampling_ratio/min": 0.9108222126960754, "sampling/sampling_logp_difference/max": 0.11202383041381836, "sampling/sampling_logp_difference/mean": 0.0026331529952585697, "step": 602, "step_time": 10.758701333000317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1654.0, "completions/max_terminated_length": 1654.0, "completions/mean_length": 536.96875, "completions/mean_terminated_length": 536.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06740357889793813, "epoch": 0.01206, "frac_reward_zero_std": 0.5, "grad_norm": 0.1087670624256134, "kl": 0.7486318089067936, "learning_rate": 3.0001518710759776e-05, "loss": 0.0031, "num_tokens": 26756670.0, "reward": 0.8175386190414429, "reward_std": 0.39649003744125366, "rewards/rollout_reward_func/mean": 0.8175386190414429, "rewards/rollout_reward_func/std": 0.5764200091362, "sampling/importance_sampling_ratio/max": 1.0155253410339355, "sampling/importance_sampling_ratio/mean": 0.994673490524292, "sampling/importance_sampling_ratio/min": 0.8876819014549255, "sampling/sampling_logp_difference/max": 0.11971926689147949, "sampling/sampling_logp_difference/mean": 0.0016150325536727905, "step": 603, "step_time": 11.087028741001632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 498.0625, "completions/mean_terminated_length": 498.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07241023192182183, "epoch": 0.01208, "frac_reward_zero_std": 0.25, "grad_norm": 0.02824527584016323, "kl": 0.91349340043962, "learning_rate": 2.990584709399416e-05, "loss": -0.0041, "num_tokens": 26802838.0, "reward": 0.7348819971084595, "reward_std": 0.4516708552837372, "rewards/rollout_reward_func/mean": 0.7348819971084595, "rewards/rollout_reward_func/std": 0.529996931552887, "sampling/importance_sampling_ratio/max": 1.1207741498947144, "sampling/importance_sampling_ratio/mean": 1.0026111602783203, "sampling/importance_sampling_ratio/min": 0.9761419892311096, "sampling/sampling_logp_difference/max": 0.10491883754730225, "sampling/sampling_logp_difference/mean": 0.0014543338911607862, "step": 604, "step_time": 10.84238236300098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 374.5, "completions/mean_terminated_length": 374.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06626686989329755, "epoch": 0.0121, "frac_reward_zero_std": 0.25, "grad_norm": 0.09870287030935287, "kl": 0.7834581211209297, "learning_rate": 2.981034874528231e-05, "loss": 0.0136, "num_tokens": 26842421.0, "reward": 0.7788841724395752, "reward_std": 0.451036274433136, "rewards/rollout_reward_func/mean": 0.7788841724395752, "rewards/rollout_reward_func/std": 0.5489624738693237, "sampling/importance_sampling_ratio/max": 1.0335969924926758, "sampling/importance_sampling_ratio/mean": 0.9997051954269409, "sampling/importance_sampling_ratio/min": 0.9813442826271057, "sampling/sampling_logp_difference/max": 0.03488743305206299, "sampling/sampling_logp_difference/mean": 0.0009926026687026024, "step": 605, "step_time": 10.404154158002711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 625.75, "completions/mean_terminated_length": 625.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0754517768509686, "epoch": 0.01212, "frac_reward_zero_std": 0.25, "grad_norm": 0.07623522728681564, "kl": 0.8940856121480465, "learning_rate": 2.9715025032790574e-05, "loss": 0.0144, "num_tokens": 26892533.0, "reward": 0.7512150406837463, "reward_std": 0.5016535520553589, "rewards/rollout_reward_func/mean": 0.7512150406837463, "rewards/rollout_reward_func/std": 0.6164379119873047, "sampling/importance_sampling_ratio/max": 1.0189692974090576, "sampling/importance_sampling_ratio/mean": 0.9988021850585938, "sampling/importance_sampling_ratio/min": 0.9822773337364197, "sampling/sampling_logp_difference/max": 0.04088544845581055, "sampling/sampling_logp_difference/mean": 0.0012091536773368716, "step": 606, "step_time": 13.261140623000756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 452.59375, "completions/mean_terminated_length": 452.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.059628771734423935, "epoch": 0.01214, "frac_reward_zero_std": 0.0, "grad_norm": 0.40378424525260925, "kl": 0.6550290025770664, "learning_rate": 2.9619877322183375e-05, "loss": 0.0204, "num_tokens": 26936846.0, "reward": 0.7835191488265991, "reward_std": 0.612300455570221, "rewards/rollout_reward_func/mean": 0.7835191488265991, "rewards/rollout_reward_func/std": 0.5983232259750366, "sampling/importance_sampling_ratio/max": 1.3953419923782349, "sampling/importance_sampling_ratio/mean": 1.013242483139038, "sampling/importance_sampling_ratio/min": 0.9884703755378723, "sampling/sampling_logp_difference/max": 0.12128067016601562, "sampling/sampling_logp_difference/mean": 0.0025778773706406355, "step": 607, "step_time": 10.433692138999504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 461.65625, "completions/mean_terminated_length": 461.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04669260745868087, "epoch": 0.01216, "frac_reward_zero_std": 0.25, "grad_norm": 0.08924217522144318, "kl": 0.8746889606118202, "learning_rate": 2.9524906976603576e-05, "loss": 0.002, "num_tokens": 26981624.0, "reward": 0.807991623878479, "reward_std": 0.4598064422607422, "rewards/rollout_reward_func/mean": 0.807991623878479, "rewards/rollout_reward_func/std": 0.5423721075057983, "sampling/importance_sampling_ratio/max": 1.0062052011489868, "sampling/importance_sampling_ratio/mean": 0.9991469383239746, "sampling/importance_sampling_ratio/min": 0.9636269211769104, "sampling/sampling_logp_difference/max": 0.03868541121482849, "sampling/sampling_logp_difference/mean": 0.0006314663914963603, "step": 608, "step_time": 12.372403517001658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 481.90625, "completions/mean_terminated_length": 481.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.035459189442917705, "epoch": 0.01218, "frac_reward_zero_std": 0.5, "grad_norm": 0.020282059907913208, "kl": 0.6169353537261486, "learning_rate": 2.943011535665307e-05, "loss": 0.0029, "num_tokens": 27027416.0, "reward": 0.8723063468933105, "reward_std": 0.2957204580307007, "rewards/rollout_reward_func/mean": 0.8723063468933105, "rewards/rollout_reward_func/std": 0.42290443181991577, "sampling/importance_sampling_ratio/max": 1.008619785308838, "sampling/importance_sampling_ratio/mean": 0.9970881938934326, "sampling/importance_sampling_ratio/min": 0.8986040353775024, "sampling/sampling_logp_difference/max": 0.10717248916625977, "sampling/sampling_logp_difference/mean": 0.0011673658154904842, "step": 609, "step_time": 9.685118575999695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1660.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 465.78125, "completions/mean_terminated_length": 465.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.031786226958502084, "epoch": 0.0122, "frac_reward_zero_std": 0.5, "grad_norm": 0.018776072189211845, "kl": 0.6168023832142353, "learning_rate": 2.933550382037318e-05, "loss": 0.008, "num_tokens": 27070891.0, "reward": 0.9003835916519165, "reward_std": 0.21796126663684845, "rewards/rollout_reward_func/mean": 0.9003835916519165, "rewards/rollout_reward_func/std": 0.3148328959941864, "sampling/importance_sampling_ratio/max": 1.0080361366271973, "sampling/importance_sampling_ratio/mean": 0.9990434646606445, "sampling/importance_sampling_ratio/min": 0.9584548473358154, "sampling/sampling_logp_difference/max": 0.040645062923431396, "sampling/sampling_logp_difference/mean": 0.00048572244122624397, "step": 610, "step_time": 10.929288095999254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 574.40625, "completions/mean_terminated_length": 574.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.051240191562101245, "epoch": 0.01222, "frac_reward_zero_std": 0.0, "grad_norm": 0.04928293451666832, "kl": 0.623182887211442, "learning_rate": 2.924107372322524e-05, "loss": -0.0134, "num_tokens": 27120680.0, "reward": 0.8099408149719238, "reward_std": 0.5375685095787048, "rewards/rollout_reward_func/mean": 0.8099408149719238, "rewards/rollout_reward_func/std": 0.534086287021637, "sampling/importance_sampling_ratio/max": 1.0119365453720093, "sampling/importance_sampling_ratio/mean": 0.9975952506065369, "sampling/importance_sampling_ratio/min": 0.9248530268669128, "sampling/sampling_logp_difference/max": 0.07804250717163086, "sampling/sampling_logp_difference/mean": 0.0009832167997956276, "step": 611, "step_time": 10.857469567000408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 505.09375, "completions/mean_terminated_length": 505.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04457396629732102, "epoch": 0.01224, "frac_reward_zero_std": 0.5, "grad_norm": 0.007084437645971775, "kl": 0.4812557171098888, "learning_rate": 2.914682641807119e-05, "loss": 0.0013, "num_tokens": 27166915.0, "reward": 0.8992725610733032, "reward_std": 0.21942973136901855, "rewards/rollout_reward_func/mean": 0.8992725610733032, "rewards/rollout_reward_func/std": 0.31818658113479614, "sampling/importance_sampling_ratio/max": 1.009006381034851, "sampling/importance_sampling_ratio/mean": 0.9877417087554932, "sampling/importance_sampling_ratio/min": 0.7831897139549255, "sampling/sampling_logp_difference/max": 0.22056865692138672, "sampling/sampling_logp_difference/mean": 0.003345365636050701, "step": 612, "step_time": 12.322827066000173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1510.0, "completions/max_terminated_length": 1510.0, "completions/mean_length": 465.78125, "completions/mean_terminated_length": 465.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05472026066854596, "epoch": 0.01226, "frac_reward_zero_std": 0.25, "grad_norm": 0.13020752370357513, "kl": 0.8070669025182724, "learning_rate": 2.9052763255154194e-05, "loss": 0.0018, "num_tokens": 27211012.0, "reward": 0.741054892539978, "reward_std": 0.4602442979812622, "rewards/rollout_reward_func/mean": 0.741054892539978, "rewards/rollout_reward_func/std": 0.5767377018928528, "sampling/importance_sampling_ratio/max": 1.0486301183700562, "sampling/importance_sampling_ratio/mean": 0.9956543445587158, "sampling/importance_sampling_ratio/min": 0.8866116404533386, "sampling/sampling_logp_difference/max": 0.12172174453735352, "sampling/sampling_logp_difference/mean": 0.0020657924469560385, "step": 613, "step_time": 10.235998444999495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 398.75, "completions/mean_terminated_length": 398.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.041476933285593987, "epoch": 0.01228, "frac_reward_zero_std": 0.25, "grad_norm": 0.1797885298728943, "kl": 0.8204414416104555, "learning_rate": 2.8958885582079266e-05, "loss": 0.0006, "num_tokens": 27252629.0, "reward": 0.8171682357788086, "reward_std": 0.5171263813972473, "rewards/rollout_reward_func/mean": 0.8171682357788086, "rewards/rollout_reward_func/std": 0.5775699019432068, "sampling/importance_sampling_ratio/max": 1.0365818738937378, "sampling/importance_sampling_ratio/mean": 0.999225378036499, "sampling/importance_sampling_ratio/min": 0.9119252562522888, "sampling/sampling_logp_difference/max": 0.09219193458557129, "sampling/sampling_logp_difference/mean": 0.001203684019856155, "step": 614, "step_time": 10.414441258999432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 548.6875, "completions/mean_terminated_length": 548.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0438781880075112, "epoch": 0.0123, "frac_reward_zero_std": 0.0, "grad_norm": 0.1293879747390747, "kl": 0.7516644280403852, "learning_rate": 2.886519474379397e-05, "loss": -0.0056, "num_tokens": 27300514.0, "reward": 0.703372597694397, "reward_std": 0.5367273092269897, "rewards/rollout_reward_func/mean": 0.703372597694397, "rewards/rollout_reward_func/std": 0.5411840081214905, "sampling/importance_sampling_ratio/max": 1.0496041774749756, "sampling/importance_sampling_ratio/mean": 1.0033326148986816, "sampling/importance_sampling_ratio/min": 0.9921358227729797, "sampling/sampling_logp_difference/max": 0.04916614294052124, "sampling/sampling_logp_difference/mean": 0.0009798677638173103, "step": 615, "step_time": 10.569220734000737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1726.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 594.96875, "completions/mean_terminated_length": 594.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.061516106594353914, "epoch": 0.01232, "frac_reward_zero_std": 0.0, "grad_norm": 0.1412225365638733, "kl": 0.923438835889101, "learning_rate": 2.8771692082569218e-05, "loss": 0.002, "num_tokens": 27350737.0, "reward": 0.5527783036231995, "reward_std": 0.6231276988983154, "rewards/rollout_reward_func/mean": 0.5527783036231995, "rewards/rollout_reward_func/std": 0.671733558177948, "sampling/importance_sampling_ratio/max": 1.186055064201355, "sampling/importance_sampling_ratio/mean": 1.0077943801879883, "sampling/importance_sampling_ratio/min": 0.9696540832519531, "sampling/sampling_logp_difference/max": 0.11167407035827637, "sampling/sampling_logp_difference/mean": 0.0019850425887852907, "step": 616, "step_time": 10.66845726200063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 377.125, "completions/mean_terminated_length": 377.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.025376947261975147, "epoch": 0.01234, "frac_reward_zero_std": 0.25, "grad_norm": 0.12109450250864029, "kl": 1.2652764543890953, "learning_rate": 2.867837893797991e-05, "loss": 0.0082, "num_tokens": 27391085.0, "reward": 0.8446434736251831, "reward_std": 0.43941473960876465, "rewards/rollout_reward_func/mean": 0.8446434736251831, "rewards/rollout_reward_func/std": 0.5060496926307678, "sampling/importance_sampling_ratio/max": 1.1122522354125977, "sampling/importance_sampling_ratio/mean": 1.003293514251709, "sampling/importance_sampling_ratio/min": 0.9826003313064575, "sampling/sampling_logp_difference/max": 0.10526204109191895, "sampling/sampling_logp_difference/mean": 0.0012167883105576038, "step": 617, "step_time": 9.8850955659982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1665.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 451.34375, "completions/mean_terminated_length": 451.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03619020106270909, "epoch": 0.01236, "frac_reward_zero_std": 0.25, "grad_norm": 0.02650338225066662, "kl": 0.6783541096374393, "learning_rate": 2.858525664688588e-05, "loss": -0.0064, "num_tokens": 27436278.0, "reward": 0.6806925535202026, "reward_std": 0.5397535562515259, "rewards/rollout_reward_func/mean": 0.6806925535202026, "rewards/rollout_reward_func/std": 0.6464407444000244, "sampling/importance_sampling_ratio/max": 1.0296610593795776, "sampling/importance_sampling_ratio/mean": 1.0018805265426636, "sampling/importance_sampling_ratio/min": 0.9962325096130371, "sampling/sampling_logp_difference/max": 0.025491416454315186, "sampling/sampling_logp_difference/mean": 0.0005185013869777322, "step": 618, "step_time": 10.772055552000893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2556.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 606.46875, "completions/mean_terminated_length": 606.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03885387454647571, "epoch": 0.01238, "frac_reward_zero_std": 0.0, "grad_norm": 0.06654320657253265, "kl": 0.9954479709267616, "learning_rate": 2.8492326543412665e-05, "loss": 0.0274, "num_tokens": 27484770.0, "reward": 0.6823762655258179, "reward_std": 0.6394726037979126, "rewards/rollout_reward_func/mean": 0.6823762655258179, "rewards/rollout_reward_func/std": 0.6466310620307922, "sampling/importance_sampling_ratio/max": 1.0366802215576172, "sampling/importance_sampling_ratio/mean": 1.003069519996643, "sampling/importance_sampling_ratio/min": 0.9991898536682129, "sampling/sampling_logp_difference/max": 0.03288671374320984, "sampling/sampling_logp_difference/mean": 0.0006873043021187186, "step": 619, "step_time": 13.965180914001394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 466.0, "completions/mean_terminated_length": 466.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04905520856846124, "epoch": 0.0124, "frac_reward_zero_std": 0.5, "grad_norm": 0.03416735678911209, "kl": 0.7597006559371948, "learning_rate": 2.839958995893236e-05, "loss": 0.0022, "num_tokens": 27529330.0, "reward": 0.8782060146331787, "reward_std": 0.34448540210723877, "rewards/rollout_reward_func/mean": 0.8782060146331787, "rewards/rollout_reward_func/std": 0.47928592562675476, "sampling/importance_sampling_ratio/max": 1.0388423204421997, "sampling/importance_sampling_ratio/mean": 1.0026359558105469, "sampling/importance_sampling_ratio/min": 0.984321653842926, "sampling/sampling_logp_difference/max": 0.03645177185535431, "sampling/sampling_logp_difference/mean": 0.0009820209816098213, "step": 620, "step_time": 10.392652439001722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 347.28125, "completions/mean_terminated_length": 347.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06711166694003623, "epoch": 0.01242, "frac_reward_zero_std": 0.25, "grad_norm": 0.07668234407901764, "kl": 1.1050574332475662, "learning_rate": 2.830704822204467e-05, "loss": 0.0093, "num_tokens": 27568926.0, "reward": 0.8699976205825806, "reward_std": 0.36770230531692505, "rewards/rollout_reward_func/mean": 0.8699976205825806, "rewards/rollout_reward_func/std": 0.43011215329170227, "sampling/importance_sampling_ratio/max": 1.1245098114013672, "sampling/importance_sampling_ratio/mean": 1.006866455078125, "sampling/importance_sampling_ratio/min": 0.9771947264671326, "sampling/sampling_logp_difference/max": 0.11157536506652832, "sampling/sampling_logp_difference/mean": 0.0026898083742707968, "step": 621, "step_time": 9.37775773100202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 330.46875, "completions/mean_terminated_length": 330.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0444917103741318, "epoch": 0.01244, "frac_reward_zero_std": 0.0, "grad_norm": 0.08504091203212738, "kl": 0.9251918569207191, "learning_rate": 2.8214702658557737e-05, "loss": -0.0006, "num_tokens": 27608284.0, "reward": 0.606484055519104, "reward_std": 0.6278603076934814, "rewards/rollout_reward_func/mean": 0.606484055519104, "rewards/rollout_reward_func/std": 0.6281183958053589, "sampling/importance_sampling_ratio/max": 1.0034658908843994, "sampling/importance_sampling_ratio/mean": 0.9965944290161133, "sampling/importance_sampling_ratio/min": 0.9119126200675964, "sampling/sampling_logp_difference/max": 0.09230303764343262, "sampling/sampling_logp_difference/mean": 0.0011012912727892399, "step": 622, "step_time": 10.643396458999632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1741.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 421.125, "completions/mean_terminated_length": 421.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03194684360641986, "epoch": 0.01246, "frac_reward_zero_std": 0.25, "grad_norm": 0.06961619108915329, "kl": 1.1775484830141068, "learning_rate": 2.8122554591469234e-05, "loss": 0.0138, "num_tokens": 27650123.0, "reward": 0.8302947282791138, "reward_std": 0.3319464921951294, "rewards/rollout_reward_func/mean": 0.8302947282791138, "rewards/rollout_reward_func/std": 0.40072914958000183, "sampling/importance_sampling_ratio/max": 1.1963671445846558, "sampling/importance_sampling_ratio/mean": 1.0021510124206543, "sampling/importance_sampling_ratio/min": 0.7560631036758423, "sampling/sampling_logp_difference/max": 0.2784450054168701, "sampling/sampling_logp_difference/mean": 0.0043127997778356075, "step": 623, "step_time": 10.697274417001609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 491.65625, "completions/mean_terminated_length": 491.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.058639467461034656, "epoch": 0.01248, "frac_reward_zero_std": 0.0, "grad_norm": 0.08887308090925217, "kl": 1.2365010008215904, "learning_rate": 2.803060534094738e-05, "loss": 0.0247, "num_tokens": 27694436.0, "reward": 0.7395994663238525, "reward_std": 0.4973642826080322, "rewards/rollout_reward_func/mean": 0.7395994663238525, "rewards/rollout_reward_func/std": 0.5230579972267151, "sampling/importance_sampling_ratio/max": 1.0975381135940552, "sampling/importance_sampling_ratio/mean": 0.9998656511306763, "sampling/importance_sampling_ratio/min": 0.9171544313430786, "sampling/sampling_logp_difference/max": 0.09538006782531738, "sampling/sampling_logp_difference/mean": 0.0027242167852818966, "step": 624, "step_time": 11.293647193000652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 395.15625, "completions/mean_terminated_length": 395.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05850500403903425, "epoch": 0.0125, "frac_reward_zero_std": 0.0, "grad_norm": 0.10366673767566681, "kl": 0.8381507508456707, "learning_rate": 2.7938856224312008e-05, "loss": -0.0059, "num_tokens": 27737264.0, "reward": 0.7325389385223389, "reward_std": 0.5141750574111938, "rewards/rollout_reward_func/mean": 0.7325389385223389, "rewards/rollout_reward_func/std": 0.5352970957756042, "sampling/importance_sampling_ratio/max": 1.1287235021591187, "sampling/importance_sampling_ratio/mean": 1.005989909172058, "sampling/importance_sampling_ratio/min": 0.9860377311706543, "sampling/sampling_logp_difference/max": 0.12101054191589355, "sampling/sampling_logp_difference/mean": 0.0017026172718033195, "step": 625, "step_time": 10.1781570309995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 450.5625, "completions/mean_terminated_length": 450.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06992056689341553, "epoch": 0.01252, "frac_reward_zero_std": 0.0, "grad_norm": 0.25616735219955444, "kl": 1.3046680018305779, "learning_rate": 2.7847308556015742e-05, "loss": 0.0031, "num_tokens": 27781861.0, "reward": 0.8018882274627686, "reward_std": 0.4936990737915039, "rewards/rollout_reward_func/mean": 0.8018882274627686, "rewards/rollout_reward_func/std": 0.48781996965408325, "sampling/importance_sampling_ratio/max": 1.117982268333435, "sampling/importance_sampling_ratio/mean": 1.0041942596435547, "sampling/importance_sampling_ratio/min": 0.9164928197860718, "sampling/sampling_logp_difference/max": 0.11492323875427246, "sampling/sampling_logp_difference/mean": 0.002527228556573391, "step": 626, "step_time": 10.486856021999301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 499.46875, "completions/mean_terminated_length": 499.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0785334138199687, "epoch": 0.01254, "frac_reward_zero_std": 0.25, "grad_norm": 0.05206829681992531, "kl": 0.8869923539459705, "learning_rate": 2.7755963647625114e-05, "loss": -0.008, "num_tokens": 27828517.0, "reward": 0.7995869517326355, "reward_std": 0.41332948207855225, "rewards/rollout_reward_func/mean": 0.7995869517326355, "rewards/rollout_reward_func/std": 0.4944692850112915, "sampling/importance_sampling_ratio/max": 1.0461732149124146, "sampling/importance_sampling_ratio/mean": 0.995756983757019, "sampling/importance_sampling_ratio/min": 0.9278209805488586, "sampling/sampling_logp_difference/max": 0.07572335004806519, "sampling/sampling_logp_difference/mean": 0.0022912186104804277, "step": 627, "step_time": 11.043718895999518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 392.8125, "completions/mean_terminated_length": 392.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04383030137978494, "epoch": 0.01256, "frac_reward_zero_std": 0.25, "grad_norm": 0.05566071346402168, "kl": 0.7479860223829746, "learning_rate": 2.7664822807801834e-05, "loss": 0.0059, "num_tokens": 27871454.0, "reward": 0.7692146897315979, "reward_std": 0.4123854637145996, "rewards/rollout_reward_func/mean": 0.7692146897315979, "rewards/rollout_reward_func/std": 0.5111234784126282, "sampling/importance_sampling_ratio/max": 1.1300392150878906, "sampling/importance_sampling_ratio/mean": 1.006131649017334, "sampling/importance_sampling_ratio/min": 0.9561623334884644, "sampling/sampling_logp_difference/max": 0.12218093872070312, "sampling/sampling_logp_difference/mean": 0.0022744350135326385, "step": 628, "step_time": 7.910078938999504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 433.8125, "completions/mean_terminated_length": 433.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03554550767876208, "epoch": 0.01258, "frac_reward_zero_std": 0.0, "grad_norm": 0.010442448779940605, "kl": 0.7385404091328382, "learning_rate": 2.7573887342283935e-05, "loss": -0.0116, "num_tokens": 27916616.0, "reward": 0.6931272745132446, "reward_std": 0.4969215989112854, "rewards/rollout_reward_func/mean": 0.6931272745132446, "rewards/rollout_reward_func/std": 0.4985305368900299, "sampling/importance_sampling_ratio/max": 1.001433253288269, "sampling/importance_sampling_ratio/mean": 0.9995293021202087, "sampling/importance_sampling_ratio/min": 0.9955849647521973, "sampling/sampling_logp_difference/max": 0.00430632010102272, "sampling/sampling_logp_difference/mean": 0.00019837915897369385, "step": 629, "step_time": 9.899498264000613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 474.75, "completions/mean_terminated_length": 474.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06999269453808665, "epoch": 0.0126, "frac_reward_zero_std": 0.0, "grad_norm": 0.12837985157966614, "kl": 0.6514508090913296, "learning_rate": 2.748315855386721e-05, "loss": -0.0026, "num_tokens": 27961781.0, "reward": 0.7331922054290771, "reward_std": 0.48074790835380554, "rewards/rollout_reward_func/mean": 0.7331922054290771, "rewards/rollout_reward_func/std": 0.5342533588409424, "sampling/importance_sampling_ratio/max": 1.1283587217330933, "sampling/importance_sampling_ratio/mean": 0.9953776597976685, "sampling/importance_sampling_ratio/min": 0.8913128972053528, "sampling/sampling_logp_difference/max": 0.12077903747558594, "sampling/sampling_logp_difference/mean": 0.0032193115912377834, "step": 630, "step_time": 9.793302544999278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 446.875, "completions/mean_terminated_length": 446.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06507471285294741, "epoch": 0.01262, "frac_reward_zero_std": 0.25, "grad_norm": 0.046959225088357925, "kl": 0.7844951711595058, "learning_rate": 2.7392637742386435e-05, "loss": 0.0058, "num_tokens": 28007201.0, "reward": 0.6364911794662476, "reward_std": 0.4767383933067322, "rewards/rollout_reward_func/mean": 0.6364911794662476, "rewards/rollout_reward_func/std": 0.6232842206954956, "sampling/importance_sampling_ratio/max": 1.0390974283218384, "sampling/importance_sampling_ratio/mean": 0.9984173774719238, "sampling/importance_sampling_ratio/min": 0.9477919340133667, "sampling/sampling_logp_difference/max": 0.06164541840553284, "sampling/sampling_logp_difference/mean": 0.0012215797323733568, "step": 631, "step_time": 8.203762075000668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 441.375, "completions/mean_terminated_length": 441.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08982194951386191, "epoch": 0.01264, "frac_reward_zero_std": 0.0, "grad_norm": 0.0946984738111496, "kl": 0.9603361189365387, "learning_rate": 2.730232620469678e-05, "loss": 0.0063, "num_tokens": 28052416.0, "reward": 0.5982388257980347, "reward_std": 0.5650575160980225, "rewards/rollout_reward_func/mean": 0.5982388257980347, "rewards/rollout_reward_func/std": 0.5853118300437927, "sampling/importance_sampling_ratio/max": 1.1365091800689697, "sampling/importance_sampling_ratio/mean": 0.9976488351821899, "sampling/importance_sampling_ratio/min": 0.8853879570960999, "sampling/sampling_logp_difference/max": 0.1280202865600586, "sampling/sampling_logp_difference/mean": 0.0035322820767760277, "step": 632, "step_time": 8.452009263999571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 475.75, "completions/mean_terminated_length": 475.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06087527680210769, "epoch": 0.01266, "frac_reward_zero_std": 0.25, "grad_norm": 0.13474026322364807, "kl": 0.7206938210874796, "learning_rate": 2.7212225234655247e-05, "loss": 0.0022, "num_tokens": 28098708.0, "reward": 0.7735174298286438, "reward_std": 0.42044687271118164, "rewards/rollout_reward_func/mean": 0.7735174298286438, "rewards/rollout_reward_func/std": 0.5631957054138184, "sampling/importance_sampling_ratio/max": 1.1310758590698242, "sampling/importance_sampling_ratio/mean": 1.0047167539596558, "sampling/importance_sampling_ratio/min": 0.9052484631538391, "sampling/sampling_logp_difference/max": 0.12314939498901367, "sampling/sampling_logp_difference/mean": 0.002644194522872567, "step": 633, "step_time": 10.328721970000515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 364.46875, "completions/mean_terminated_length": 364.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06566745298914611, "epoch": 0.01268, "frac_reward_zero_std": 0.0, "grad_norm": 0.0797688215970993, "kl": 0.9355837255716324, "learning_rate": 2.712233612310212e-05, "loss": 0.0011, "num_tokens": 28140152.0, "reward": 0.6923213601112366, "reward_std": 0.5211162567138672, "rewards/rollout_reward_func/mean": 0.6923213601112366, "rewards/rollout_reward_func/std": 0.49982795119285583, "sampling/importance_sampling_ratio/max": 1.043535590171814, "sampling/importance_sampling_ratio/mean": 1.000885248184204, "sampling/importance_sampling_ratio/min": 0.952032208442688, "sampling/sampling_logp_difference/max": 0.049907565116882324, "sampling/sampling_logp_difference/mean": 0.0012576656881719828, "step": 634, "step_time": 8.271524099998715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1692.0, "completions/max_terminated_length": 1692.0, "completions/mean_length": 408.5625, "completions/mean_terminated_length": 408.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.051824096124619246, "epoch": 0.0127, "frac_reward_zero_std": 0.5, "grad_norm": 0.03685050830245018, "kl": 0.7162586748600006, "learning_rate": 2.7032660157842455e-05, "loss": 0.0028, "num_tokens": 28181959.0, "reward": 0.8301906585693359, "reward_std": 0.26668494939804077, "rewards/rollout_reward_func/mean": 0.8301906585693359, "rewards/rollout_reward_func/std": 0.4009816348552704, "sampling/importance_sampling_ratio/max": 1.0314842462539673, "sampling/importance_sampling_ratio/mean": 0.9987680912017822, "sampling/importance_sampling_ratio/min": 0.9052191376686096, "sampling/sampling_logp_difference/max": 0.051055848598480225, "sampling/sampling_logp_difference/mean": 0.001325755612924695, "step": 635, "step_time": 10.28136313900086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 410.84375, "completions/mean_terminated_length": 410.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06362232938408852, "epoch": 0.01272, "frac_reward_zero_std": 0.0, "grad_norm": 0.05929414927959442, "kl": 0.774362432770431, "learning_rate": 2.694319862362766e-05, "loss": -0.0004, "num_tokens": 28225166.0, "reward": 0.7679228782653809, "reward_std": 0.5103204846382141, "rewards/rollout_reward_func/mean": 0.7679228782653809, "rewards/rollout_reward_func/std": 0.5132865309715271, "sampling/importance_sampling_ratio/max": 1.0123564004898071, "sampling/importance_sampling_ratio/mean": 0.998022735118866, "sampling/importance_sampling_ratio/min": 0.9655349254608154, "sampling/sampling_logp_difference/max": 0.04221096634864807, "sampling/sampling_logp_difference/mean": 0.0009120677132159472, "step": 636, "step_time": 8.411356399999931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 376.03125, "completions/mean_terminated_length": 376.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04429642917239107, "epoch": 0.01274, "frac_reward_zero_std": 0.25, "grad_norm": 0.07516945153474808, "kl": 0.8543477691709995, "learning_rate": 2.6853952802137104e-05, "loss": 0.0041, "num_tokens": 28266211.0, "reward": 0.7700567245483398, "reward_std": 0.42329537868499756, "rewards/rollout_reward_func/mean": 0.7700567245483398, "rewards/rollout_reward_func/std": 0.509690523147583, "sampling/importance_sampling_ratio/max": 1.0142996311187744, "sampling/importance_sampling_ratio/mean": 0.994059681892395, "sampling/importance_sampling_ratio/min": 0.8857946991920471, "sampling/sampling_logp_difference/max": 0.1213383674621582, "sampling/sampling_logp_difference/mean": 0.0021385052241384983, "step": 637, "step_time": 10.046726241000215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1623.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 455.65625, "completions/mean_terminated_length": 455.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1498449631035328, "epoch": 0.01276, "frac_reward_zero_std": 0.0, "grad_norm": 0.3623965084552765, "kl": 0.907619783654809, "learning_rate": 2.6764923971959656e-05, "loss": 0.0152, "num_tokens": 28311286.0, "reward": 0.8027923107147217, "reward_std": 0.47284168004989624, "rewards/rollout_reward_func/mean": 0.8027923107147217, "rewards/rollout_reward_func/std": 0.4843953847885132, "sampling/importance_sampling_ratio/max": 1.1985836029052734, "sampling/importance_sampling_ratio/mean": 0.9739516973495483, "sampling/importance_sampling_ratio/min": 3.435355569892523e-13, "sampling/sampling_logp_difference/max": 18.97264289855957, "sampling/sampling_logp_difference/mean": 0.20261909067630768, "step": 638, "step_time": 10.47539335700094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1515.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 424.71875, "completions/mean_terminated_length": 424.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07478434609947726, "epoch": 0.01278, "frac_reward_zero_std": 0.25, "grad_norm": 0.08803223818540573, "kl": 0.9164513200521469, "learning_rate": 2.6676113408575516e-05, "loss": 0.002, "num_tokens": 28354631.0, "reward": 0.5139099359512329, "reward_std": 0.550000786781311, "rewards/rollout_reward_func/mean": 0.5139099359512329, "rewards/rollout_reward_func/std": 0.679961621761322, "sampling/importance_sampling_ratio/max": 1.0532299280166626, "sampling/importance_sampling_ratio/mean": 0.9938859939575195, "sampling/importance_sampling_ratio/min": 0.8128990530967712, "sampling/sampling_logp_difference/max": 0.16894984245300293, "sampling/sampling_logp_difference/mean": 0.003124275477603078, "step": 639, "step_time": 9.629978211001799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 475.03125, "completions/mean_terminated_length": 475.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07851313054561615, "epoch": 0.0128, "frac_reward_zero_std": 0.5, "grad_norm": 0.12783390283584595, "kl": 0.9172619357705116, "learning_rate": 2.6587522384337808e-05, "loss": 0.0038, "num_tokens": 28400319.0, "reward": 0.8007521033287048, "reward_std": 0.3307695984840393, "rewards/rollout_reward_func/mean": 0.8007521033287048, "rewards/rollout_reward_func/std": 0.4924558103084564, "sampling/importance_sampling_ratio/max": 1.099919080734253, "sampling/importance_sampling_ratio/mean": 1.0082416534423828, "sampling/importance_sampling_ratio/min": 0.9676296710968018, "sampling/sampling_logp_difference/max": 0.0876426100730896, "sampling/sampling_logp_difference/mean": 0.00245889974758029, "step": 640, "step_time": 10.292284189001293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 520.65625, "completions/mean_terminated_length": 520.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06849506916478276, "epoch": 0.01282, "frac_reward_zero_std": 0.0, "grad_norm": 0.2960314154624939, "kl": 0.7139092851430178, "learning_rate": 2.6499152168454434e-05, "loss": 0.0046, "num_tokens": 28447612.0, "reward": 0.7987711429595947, "reward_std": 0.42587244510650635, "rewards/rollout_reward_func/mean": 0.7987711429595947, "rewards/rollout_reward_func/std": 0.42591342329978943, "sampling/importance_sampling_ratio/max": 1.2714558839797974, "sampling/importance_sampling_ratio/mean": 1.0083186626434326, "sampling/importance_sampling_ratio/min": 0.9740885496139526, "sampling/sampling_logp_difference/max": 0.23853111267089844, "sampling/sampling_logp_difference/mean": 0.002654700307175517, "step": 641, "step_time": 10.97817447199941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 515.6875, "completions/mean_terminated_length": 515.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08826666418462992, "epoch": 0.01284, "frac_reward_zero_std": 0.25, "grad_norm": 0.08872437477111816, "kl": 0.7702996581792831, "learning_rate": 2.6411004026969857e-05, "loss": -0.0005, "num_tokens": 28495213.0, "reward": 0.7002009749412537, "reward_std": 0.442436158657074, "rewards/rollout_reward_func/mean": 0.7002009749412537, "rewards/rollout_reward_func/std": 0.5481052398681641, "sampling/importance_sampling_ratio/max": 1.033329963684082, "sampling/importance_sampling_ratio/mean": 0.9980000257492065, "sampling/importance_sampling_ratio/min": 0.9258949160575867, "sampling/sampling_logp_difference/max": 0.07079899311065674, "sampling/sampling_logp_difference/mean": 0.0013098159106448293, "step": 642, "step_time": 10.724307025999224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 406.625, "completions/mean_terminated_length": 406.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06842542951926589, "epoch": 0.01286, "frac_reward_zero_std": 0.0, "grad_norm": 0.4116336405277252, "kl": 0.8558744341135025, "learning_rate": 2.6323079222746966e-05, "loss": 0.0202, "num_tokens": 28538134.0, "reward": 0.7966037392616272, "reward_std": 0.4407995939254761, "rewards/rollout_reward_func/mean": 0.7966037392616272, "rewards/rollout_reward_func/std": 0.4303697645664215, "sampling/importance_sampling_ratio/max": 1.0605672597885132, "sampling/importance_sampling_ratio/mean": 1.001350998878479, "sampling/importance_sampling_ratio/min": 0.9638196229934692, "sampling/sampling_logp_difference/max": 0.059208840131759644, "sampling/sampling_logp_difference/mean": 0.001392939011566341, "step": 643, "step_time": 10.687389386998802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1702.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 435.875, "completions/mean_terminated_length": 435.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06658484367653728, "epoch": 0.01288, "frac_reward_zero_std": 0.25, "grad_norm": 0.2731458842754364, "kl": 0.8081406801939011, "learning_rate": 2.623537901544899e-05, "loss": -0.0017, "num_tokens": 28582382.0, "reward": 0.8703924417495728, "reward_std": 0.3665856122970581, "rewards/rollout_reward_func/mean": 0.8703924417495728, "rewards/rollout_reward_func/std": 0.428507536649704, "sampling/importance_sampling_ratio/max": 1.0260095596313477, "sampling/importance_sampling_ratio/mean": 0.9890013933181763, "sampling/importance_sampling_ratio/min": 0.678094208240509, "sampling/sampling_logp_difference/max": 0.3564572334289551, "sampling/sampling_logp_difference/mean": 0.003575283335521817, "step": 644, "step_time": 10.229400733000148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1632.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 450.8125, "completions/mean_terminated_length": 450.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07283074920997024, "epoch": 0.0129, "frac_reward_zero_std": 0.25, "grad_norm": 0.0950453132390976, "kl": 0.9760079011321068, "learning_rate": 2.614790466152144e-05, "loss": -0.0035, "num_tokens": 28627537.0, "reward": 0.7937154769897461, "reward_std": 0.3679937720298767, "rewards/rollout_reward_func/mean": 0.7937154769897461, "rewards/rollout_reward_func/std": 0.43637552857398987, "sampling/importance_sampling_ratio/max": 1.039711833000183, "sampling/importance_sampling_ratio/mean": 0.9969959259033203, "sampling/importance_sampling_ratio/min": 0.8887773156166077, "sampling/sampling_logp_difference/max": 0.1179206371307373, "sampling/sampling_logp_difference/mean": 0.0018212692812085152, "step": 645, "step_time": 10.387673349999204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.052292805165052414, "epoch": 0.01292, "frac_reward_zero_std": 0.25, "grad_norm": 0.02741086669266224, "kl": 0.7439283411949873, "learning_rate": 2.606065741417416e-05, "loss": 0.0025, "num_tokens": 28663890.0, "reward": 0.864518404006958, "reward_std": 0.3153049349784851, "rewards/rollout_reward_func/mean": 0.864518404006958, "rewards/rollout_reward_func/std": 0.3643741309642792, "sampling/importance_sampling_ratio/max": 1.0190579891204834, "sampling/importance_sampling_ratio/mean": 0.9978686571121216, "sampling/importance_sampling_ratio/min": 0.9324625134468079, "sampling/sampling_logp_difference/max": 0.06610625982284546, "sampling/sampling_logp_difference/mean": 0.0011908004526048899, "step": 646, "step_time": 8.109118280000075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 440.8125, "completions/mean_terminated_length": 440.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07588532287627459, "epoch": 0.01294, "frac_reward_zero_std": 0.0, "grad_norm": 19.45127296447754, "kl": 27.70494154188782, "learning_rate": 2.5973638523363253e-05, "loss": 0.1132, "num_tokens": 28709062.0, "reward": 0.6731040477752686, "reward_std": 0.5993597507476807, "rewards/rollout_reward_func/mean": 0.6731040477752686, "rewards/rollout_reward_func/std": 0.608700156211853, "sampling/importance_sampling_ratio/max": 1.0498292446136475, "sampling/importance_sampling_ratio/mean": 0.9794780015945435, "sampling/importance_sampling_ratio/min": 0.47239840030670166, "sampling/sampling_logp_difference/max": 0.7499046325683594, "sampling/sampling_logp_difference/mean": 0.00724033173173666, "step": 647, "step_time": 8.729298475998803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 360.9375, "completions/mean_terminated_length": 360.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06890657055191696, "epoch": 0.01296, "frac_reward_zero_std": 0.0, "grad_norm": 0.026464825496077538, "kl": 0.9124945625662804, "learning_rate": 2.5886849235773306e-05, "loss": -0.0056, "num_tokens": 28749530.0, "reward": 0.7269687652587891, "reward_std": 0.464608758687973, "rewards/rollout_reward_func/mean": 0.7269687652587891, "rewards/rollout_reward_func/std": 0.48057079315185547, "sampling/importance_sampling_ratio/max": 1.0270195007324219, "sampling/importance_sampling_ratio/mean": 0.9996274709701538, "sampling/importance_sampling_ratio/min": 0.9847028255462646, "sampling/sampling_logp_difference/max": 0.027301877737045288, "sampling/sampling_logp_difference/mean": 0.0007828408852219582, "step": 648, "step_time": 8.02575031700053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 439.34375, "completions/mean_terminated_length": 439.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.062033484457060695, "epoch": 0.01298, "frac_reward_zero_std": 0.0, "grad_norm": 0.019174441695213318, "kl": 0.8591455928981304, "learning_rate": 2.5800290794799485e-05, "loss": 0.0055, "num_tokens": 28794556.0, "reward": 0.6595035791397095, "reward_std": 0.5188778638839722, "rewards/rollout_reward_func/mean": 0.6595035791397095, "rewards/rollout_reward_func/std": 0.5131913423538208, "sampling/importance_sampling_ratio/max": 1.0090724229812622, "sampling/importance_sampling_ratio/mean": 0.9972182512283325, "sampling/importance_sampling_ratio/min": 0.8837646245956421, "sampling/sampling_logp_difference/max": 0.12358283996582031, "sampling/sampling_logp_difference/mean": 0.0014331020647659898, "step": 649, "step_time": 8.05446451700027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 397.59375, "completions/mean_terminated_length": 397.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07746951493027154, "epoch": 0.013, "frac_reward_zero_std": 0.0, "grad_norm": 0.12488535791635513, "kl": 0.938814502209425, "learning_rate": 2.571396444052964e-05, "loss": -0.0014, "num_tokens": 28837392.0, "reward": 0.5032503008842468, "reward_std": 0.6531080007553101, "rewards/rollout_reward_func/mean": 0.5032503008842468, "rewards/rollout_reward_func/std": 0.64540034532547, "sampling/importance_sampling_ratio/max": 1.0093426704406738, "sampling/importance_sampling_ratio/mean": 0.9949424266815186, "sampling/importance_sampling_ratio/min": 0.8882217407226562, "sampling/sampling_logp_difference/max": 0.11968088150024414, "sampling/sampling_logp_difference/mean": 0.0019249909091740847, "step": 650, "step_time": 8.453402312001344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 433.875, "completions/mean_terminated_length": 433.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07886394765228033, "epoch": 0.01302, "frac_reward_zero_std": 0.0, "grad_norm": 0.08599448949098587, "kl": 0.7991550881415606, "learning_rate": 2.562787140972668e-05, "loss": -0.0032, "num_tokens": 28881915.0, "reward": 0.6596947908401489, "reward_std": 0.5340724587440491, "rewards/rollout_reward_func/mean": 0.6596947908401489, "rewards/rollout_reward_func/std": 0.5129356980323792, "sampling/importance_sampling_ratio/max": 1.0256848335266113, "sampling/importance_sampling_ratio/mean": 0.99940025806427, "sampling/importance_sampling_ratio/min": 0.9222131967544556, "sampling/sampling_logp_difference/max": 0.08307260274887085, "sampling/sampling_logp_difference/mean": 0.0016106455586850643, "step": 651, "step_time": 7.978659832999256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 354.6875, "completions/mean_terminated_length": 354.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06829377682879567, "epoch": 0.01304, "frac_reward_zero_std": 0.25, "grad_norm": 0.03266530483961105, "kl": 0.9095760956406593, "learning_rate": 2.5542012935810755e-05, "loss": 0.0092, "num_tokens": 28922672.0, "reward": 0.7956382036209106, "reward_std": 0.378406286239624, "rewards/rollout_reward_func/mean": 0.7956382036209106, "rewards/rollout_reward_func/std": 0.43231287598609924, "sampling/importance_sampling_ratio/max": 1.0053081512451172, "sampling/importance_sampling_ratio/mean": 0.9981598258018494, "sampling/importance_sampling_ratio/min": 0.9588103294372559, "sampling/sampling_logp_difference/max": 0.04472827911376953, "sampling/sampling_logp_difference/mean": 0.0008876851061359048, "step": 652, "step_time": 8.577082910998797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0816741255694069, "epoch": 0.01306, "frac_reward_zero_std": 0.0, "grad_norm": 0.08378472179174423, "kl": 1.0181107223033905, "learning_rate": 2.5456390248841602e-05, "loss": 0.0037, "num_tokens": 28961680.0, "reward": 0.7055177688598633, "reward_std": 0.5131123065948486, "rewards/rollout_reward_func/mean": 0.7055177688598633, "rewards/rollout_reward_func/std": 0.5986396670341492, "sampling/importance_sampling_ratio/max": 1.1234025955200195, "sampling/importance_sampling_ratio/mean": 0.998947024345398, "sampling/importance_sampling_ratio/min": 0.9085195064544678, "sampling/sampling_logp_difference/max": 0.11787605285644531, "sampling/sampling_logp_difference/mean": 0.0025765839964151382, "step": 653, "step_time": 7.916239449999921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 436.6875, "completions/mean_terminated_length": 436.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0949590322561562, "epoch": 0.01308, "frac_reward_zero_std": 0.0, "grad_norm": 0.0901331827044487, "kl": 1.2448096573352814, "learning_rate": 2.5371004575500953e-05, "loss": 0.007, "num_tokens": 29005940.0, "reward": 0.6205834150314331, "reward_std": 0.5300719738006592, "rewards/rollout_reward_func/mean": 0.6205834150314331, "rewards/rollout_reward_func/std": 0.5329249501228333, "sampling/importance_sampling_ratio/max": 1.1282172203063965, "sampling/importance_sampling_ratio/mean": 1.0057334899902344, "sampling/importance_sampling_ratio/min": 0.9223440289497375, "sampling/sampling_logp_difference/max": 0.12253332138061523, "sampling/sampling_logp_difference/mean": 0.0032627941109240055, "step": 654, "step_time": 8.110668957000598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 327.75, "completions/mean_terminated_length": 327.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09545507421717048, "epoch": 0.0131, "frac_reward_zero_std": 0.0, "grad_norm": 0.05848080664873123, "kl": 0.9446860924363136, "learning_rate": 2.528585713907493e-05, "loss": -0.0047, "num_tokens": 29045767.0, "reward": 0.5379408001899719, "reward_std": 0.6177741885185242, "rewards/rollout_reward_func/mean": 0.5379408001899719, "rewards/rollout_reward_func/std": 0.6909676790237427, "sampling/importance_sampling_ratio/max": 1.0067414045333862, "sampling/importance_sampling_ratio/mean": 0.984581708908081, "sampling/importance_sampling_ratio/min": 0.883951723575592, "sampling/sampling_logp_difference/max": 0.12338018417358398, "sampling/sampling_logp_difference/mean": 0.004623163491487503, "step": 655, "step_time": 8.37892123200163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 325.3125, "completions/mean_terminated_length": 325.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0793451126664877, "epoch": 0.01312, "frac_reward_zero_std": 0.25, "grad_norm": 0.043778859078884125, "kl": 0.8406942375004292, "learning_rate": 2.5200949159436526e-05, "loss": -0.0021, "num_tokens": 29086142.0, "reward": 0.8969874382019043, "reward_std": 0.29136359691619873, "rewards/rollout_reward_func/mean": 0.8969874382019043, "rewards/rollout_reward_func/std": 0.32545679807662964, "sampling/importance_sampling_ratio/max": 1.124870777130127, "sampling/importance_sampling_ratio/mean": 1.0031349658966064, "sampling/importance_sampling_ratio/min": 0.8951611518859863, "sampling/sampling_logp_difference/max": 0.1176450252532959, "sampling/sampling_logp_difference/mean": 0.0032161290291696787, "step": 656, "step_time": 7.890618647001247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 421.0, "completions/mean_terminated_length": 421.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09769261721521616, "epoch": 0.01314, "frac_reward_zero_std": 0.0, "grad_norm": 0.0424811914563179, "kl": 0.7143070586025715, "learning_rate": 2.511628185302814e-05, "loss": 0.0017, "num_tokens": 29129960.0, "reward": 0.5968103408813477, "reward_std": 0.558967649936676, "rewards/rollout_reward_func/mean": 0.5968103408813477, "rewards/rollout_reward_func/std": 0.5869821310043335, "sampling/importance_sampling_ratio/max": 1.2000782489776611, "sampling/importance_sampling_ratio/mean": 1.0129847526550293, "sampling/importance_sampling_ratio/min": 0.9858406782150269, "sampling/sampling_logp_difference/max": 0.18812823295593262, "sampling/sampling_logp_difference/mean": 0.0034864675253629684, "step": 657, "step_time": 8.545255320999786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 393.875, "completions/mean_terminated_length": 393.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0645979526307201, "epoch": 0.01316, "frac_reward_zero_std": 0.0, "grad_norm": 0.04740138724446297, "kl": 0.9291149452328682, "learning_rate": 2.5031856432844173e-05, "loss": 0.0123, "num_tokens": 29171298.0, "reward": 0.7965935468673706, "reward_std": 0.44235894083976746, "rewards/rollout_reward_func/mean": 0.7965935468673706, "rewards/rollout_reward_func/std": 0.430256724357605, "sampling/importance_sampling_ratio/max": 1.0175343751907349, "sampling/importance_sampling_ratio/mean": 0.9983072280883789, "sampling/importance_sampling_ratio/min": 0.962340772151947, "sampling/sampling_logp_difference/max": 0.03674811124801636, "sampling/sampling_logp_difference/mean": 0.0012641153298318386, "step": 658, "step_time": 8.40553940800055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 426.125, "completions/mean_terminated_length": 426.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0726242249365896, "epoch": 0.01318, "frac_reward_zero_std": 0.25, "grad_norm": 0.06794720143079758, "kl": 0.7874613665044308, "learning_rate": 2.4947674108413543e-05, "loss": 0.0134, "num_tokens": 29213981.0, "reward": 0.7310449481010437, "reward_std": 0.4030070900917053, "rewards/rollout_reward_func/mean": 0.7310449481010437, "rewards/rollout_reward_func/std": 0.4733141362667084, "sampling/importance_sampling_ratio/max": 1.1179149150848389, "sampling/importance_sampling_ratio/mean": 1.0019330978393555, "sampling/importance_sampling_ratio/min": 0.8978279829025269, "sampling/sampling_logp_difference/max": 0.1114499568939209, "sampling/sampling_logp_difference/mean": 0.002354663098230958, "step": 659, "step_time": 8.329691857999023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 362.09375, "completions/mean_terminated_length": 362.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07462042267434299, "epoch": 0.0132, "frac_reward_zero_std": 0.25, "grad_norm": 0.0873396024107933, "kl": 0.7805492486804724, "learning_rate": 2.486373608578252e-05, "loss": -0.0021, "num_tokens": 29254711.0, "reward": 0.8642237186431885, "reward_std": 0.3181001842021942, "rewards/rollout_reward_func/mean": 0.8642237186431885, "rewards/rollout_reward_func/std": 0.36501947045326233, "sampling/importance_sampling_ratio/max": 1.1179605722427368, "sampling/importance_sampling_ratio/mean": 1.0077900886535645, "sampling/importance_sampling_ratio/min": 0.988913893699646, "sampling/sampling_logp_difference/max": 0.11148786544799805, "sampling/sampling_logp_difference/mean": 0.002276177052408457, "step": 660, "step_time": 8.304776190000666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 342.96875, "completions/mean_terminated_length": 342.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07974500069394708, "epoch": 0.01322, "frac_reward_zero_std": 0.0, "grad_norm": 0.21560226380825043, "kl": 1.0096834301948547, "learning_rate": 2.47800435674973e-05, "loss": 0.0065, "num_tokens": 29294344.0, "reward": 0.7588627934455872, "reward_std": 0.46445149183273315, "rewards/rollout_reward_func/mean": 0.7588627934455872, "rewards/rollout_reward_func/std": 0.4631454050540924, "sampling/importance_sampling_ratio/max": 1.095084309577942, "sampling/importance_sampling_ratio/mean": 1.0078526735305786, "sampling/importance_sampling_ratio/min": 0.989112138748169, "sampling/sampling_logp_difference/max": 0.09085500240325928, "sampling/sampling_logp_difference/mean": 0.0022719723638147116, "step": 661, "step_time": 8.356641114000013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 519.78125, "completions/mean_terminated_length": 519.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07563267648220062, "epoch": 0.01324, "frac_reward_zero_std": 0.25, "grad_norm": 0.1103903129696846, "kl": 0.8397690951824188, "learning_rate": 2.469659775258686e-05, "loss": 0.0015, "num_tokens": 29341813.0, "reward": 0.7024654746055603, "reward_std": 0.45784157514572144, "rewards/rollout_reward_func/mean": 0.7024654746055603, "rewards/rollout_reward_func/std": 0.5446113348007202, "sampling/importance_sampling_ratio/max": 1.0103634595870972, "sampling/importance_sampling_ratio/mean": 1.000652551651001, "sampling/importance_sampling_ratio/min": 0.9892528057098389, "sampling/sampling_logp_difference/max": 0.010804511606693268, "sampling/sampling_logp_difference/mean": 0.0008668569498695433, "step": 662, "step_time": 8.557077728999502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 416.53125, "completions/mean_terminated_length": 416.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07357823615893722, "epoch": 0.01326, "frac_reward_zero_std": 0.0, "grad_norm": 3.525002956390381, "kl": 9.850266525521874, "learning_rate": 2.461339983654576e-05, "loss": 0.0361, "num_tokens": 29385068.0, "reward": 0.673120379447937, "reward_std": 0.5866246223449707, "rewards/rollout_reward_func/mean": 0.673120379447937, "rewards/rollout_reward_func/std": 0.6084914803504944, "sampling/importance_sampling_ratio/max": 1.1235570907592773, "sampling/importance_sampling_ratio/mean": 0.9981697797775269, "sampling/importance_sampling_ratio/min": 0.8845086097717285, "sampling/sampling_logp_difference/max": 0.12490463256835938, "sampling/sampling_logp_difference/mean": 0.0029994186479598284, "step": 663, "step_time": 8.05661775200133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 424.46875, "completions/mean_terminated_length": 424.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0736969041172415, "epoch": 0.01328, "frac_reward_zero_std": 0.0, "grad_norm": 0.0748039111495018, "kl": 1.0248960331082344, "learning_rate": 2.4530451011317e-05, "loss": 0.0011, "num_tokens": 29429085.0, "reward": 0.5440652370452881, "reward_std": 0.6742147207260132, "rewards/rollout_reward_func/mean": 0.5440652370452881, "rewards/rollout_reward_func/std": 0.6851218938827515, "sampling/importance_sampling_ratio/max": 1.1106202602386475, "sampling/importance_sampling_ratio/mean": 1.0019071102142334, "sampling/importance_sampling_ratio/min": 0.8849433064460754, "sampling/sampling_logp_difference/max": 0.12221431732177734, "sampling/sampling_logp_difference/mean": 0.002959667704999447, "step": 664, "step_time": 8.00660648200028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 356.4375, "completions/mean_terminated_length": 356.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.056734132347628474, "epoch": 0.0133, "frac_reward_zero_std": 0.0, "grad_norm": 0.10842128098011017, "kl": 0.8221209369366989, "learning_rate": 2.4447752465274976e-05, "loss": 0.0048, "num_tokens": 29469228.0, "reward": 0.6656303405761719, "reward_std": 0.5555636882781982, "rewards/rollout_reward_func/mean": 0.6656303405761719, "rewards/rollout_reward_func/std": 0.5643545389175415, "sampling/importance_sampling_ratio/max": 1.0091041326522827, "sampling/importance_sampling_ratio/mean": 0.9986834526062012, "sampling/importance_sampling_ratio/min": 0.9569199681282043, "sampling/sampling_logp_difference/max": 0.0464174747467041, "sampling/sampling_logp_difference/mean": 0.0008687339723110199, "step": 665, "step_time": 8.424576842001443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 351.6875, "completions/mean_terminated_length": 351.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04272610764019191, "epoch": 0.01332, "frac_reward_zero_std": 0.25, "grad_norm": 0.02427627518773079, "kl": 0.6104665594175458, "learning_rate": 2.43653053832084e-05, "loss": 0.0041, "num_tokens": 29509264.0, "reward": 0.8042845726013184, "reward_std": 0.40552008152008057, "rewards/rollout_reward_func/mean": 0.8042845726013184, "rewards/rollout_reward_func/std": 0.48406144976615906, "sampling/importance_sampling_ratio/max": 1.0053608417510986, "sampling/importance_sampling_ratio/mean": 0.9956597685813904, "sampling/importance_sampling_ratio/min": 0.888080358505249, "sampling/sampling_logp_difference/max": 0.11867880821228027, "sampling/sampling_logp_difference/mean": 0.0013420232571661472, "step": 666, "step_time": 8.336151353997593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 295.71875, "completions/mean_terminated_length": 295.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0656913339626044, "epoch": 0.01334, "frac_reward_zero_std": 0.25, "grad_norm": 0.1476443111896515, "kl": 0.8882767632603645, "learning_rate": 2.4283110946303422e-05, "loss": 0.0014, "num_tokens": 29547901.0, "reward": 0.7042559385299683, "reward_std": 0.4995666742324829, "rewards/rollout_reward_func/mean": 0.7042559385299683, "rewards/rollout_reward_func/std": 0.6002128720283508, "sampling/importance_sampling_ratio/max": 1.0303921699523926, "sampling/importance_sampling_ratio/mean": 0.9940125942230225, "sampling/importance_sampling_ratio/min": 0.8991129994392395, "sampling/sampling_logp_difference/max": 0.10638093948364258, "sampling/sampling_logp_difference/mean": 0.002601485699415207, "step": 667, "step_time": 7.868037743001878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 417.4375, "completions/mean_terminated_length": 417.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05871624290011823, "epoch": 0.01336, "frac_reward_zero_std": 0.0, "grad_norm": 0.08282963931560516, "kl": 0.6970071839168668, "learning_rate": 2.4201170332126565e-05, "loss": -0.0025, "num_tokens": 29591326.0, "reward": 0.7067669630050659, "reward_std": 0.5965672135353088, "rewards/rollout_reward_func/mean": 0.7067669630050659, "rewards/rollout_reward_func/std": 0.5955089330673218, "sampling/importance_sampling_ratio/max": 1.0086814165115356, "sampling/importance_sampling_ratio/mean": 0.9972478747367859, "sampling/importance_sampling_ratio/min": 0.8854069709777832, "sampling/sampling_logp_difference/max": 0.12175273895263672, "sampling/sampling_logp_difference/mean": 0.0013655077200382948, "step": 668, "step_time": 8.031117283998356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 404.59375, "completions/mean_terminated_length": 404.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05729120853357017, "epoch": 0.01338, "frac_reward_zero_std": 0.0, "grad_norm": 0.0480305440723896, "kl": 0.9366060048341751, "learning_rate": 2.4119484714608007e-05, "loss": 0.0057, "num_tokens": 29634159.0, "reward": 0.5042831897735596, "reward_std": 0.6678156852722168, "rewards/rollout_reward_func/mean": 0.5042831897735596, "rewards/rollout_reward_func/std": 0.6435107588768005, "sampling/importance_sampling_ratio/max": 1.0891233682632446, "sampling/importance_sampling_ratio/mean": 0.9962306022644043, "sampling/importance_sampling_ratio/min": 0.8893609642982483, "sampling/sampling_logp_difference/max": 0.11642515659332275, "sampling/sampling_logp_difference/mean": 0.002634196076542139, "step": 669, "step_time": 8.123649011002271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 346.40625, "completions/mean_terminated_length": 346.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05465353757608682, "epoch": 0.0134, "frac_reward_zero_std": 0.0, "grad_norm": 0.17814426124095917, "kl": 2.139100691303611, "learning_rate": 2.4038055264024672e-05, "loss": 0.0134, "num_tokens": 29673672.0, "reward": 0.7681518197059631, "reward_std": 0.5069836378097534, "rewards/rollout_reward_func/mean": 0.7681518197059631, "rewards/rollout_reward_func/std": 0.5127749443054199, "sampling/importance_sampling_ratio/max": 1.0263818502426147, "sampling/importance_sampling_ratio/mean": 0.9938065409660339, "sampling/importance_sampling_ratio/min": 0.8834177255630493, "sampling/sampling_logp_difference/max": 0.12395143508911133, "sampling/sampling_logp_difference/mean": 0.002841281471773982, "step": 670, "step_time": 8.42523207100021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 429.5625, "completions/mean_terminated_length": 429.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04303835041355342, "epoch": 0.01342, "frac_reward_zero_std": 0.25, "grad_norm": 0.028626177459955215, "kl": 0.8372820243239403, "learning_rate": 2.3956883146983443e-05, "loss": 0.01, "num_tokens": 29717455.0, "reward": 0.7299397587776184, "reward_std": 0.3792162537574768, "rewards/rollout_reward_func/mean": 0.7299397587776184, "rewards/rollout_reward_func/std": 0.4753146469593048, "sampling/importance_sampling_ratio/max": 1.1170380115509033, "sampling/importance_sampling_ratio/mean": 1.0052497386932373, "sampling/importance_sampling_ratio/min": 0.9966974854469299, "sampling/sampling_logp_difference/max": 0.11099529266357422, "sampling/sampling_logp_difference/mean": 0.0013552905293181539, "step": 671, "step_time": 8.372242266000285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 495.96875, "completions/mean_terminated_length": 495.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.059701784048229456, "epoch": 0.01344, "frac_reward_zero_std": 0.0, "grad_norm": 0.03574810177087784, "kl": 0.6575106419622898, "learning_rate": 2.3875969526404544e-05, "loss": 0.0023, "num_tokens": 29763873.0, "reward": 0.5646281242370605, "reward_std": 0.5976256132125854, "rewards/rollout_reward_func/mean": 0.5646281242370605, "rewards/rollout_reward_func/std": 0.5924075841903687, "sampling/importance_sampling_ratio/max": 1.1319419145584106, "sampling/importance_sampling_ratio/mean": 1.005547046661377, "sampling/importance_sampling_ratio/min": 0.8865154385566711, "sampling/sampling_logp_difference/max": 0.12389278411865234, "sampling/sampling_logp_difference/mean": 0.003029602812603116, "step": 672, "step_time": 8.146616280999297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 278.90625, "completions/mean_terminated_length": 278.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07124121021479368, "epoch": 0.01346, "frac_reward_zero_std": 0.0, "grad_norm": 0.11921080946922302, "kl": 0.864329943433404, "learning_rate": 2.3795315561504805e-05, "loss": 0.0018, "num_tokens": 29801773.0, "reward": 0.5834081172943115, "reward_std": 0.7324868440628052, "rewards/rollout_reward_func/mean": 0.5834081172943115, "rewards/rollout_reward_func/std": 0.7174580693244934, "sampling/importance_sampling_ratio/max": 1.0163623094558716, "sampling/importance_sampling_ratio/mean": 0.9799000024795532, "sampling/importance_sampling_ratio/min": 0.43039804697036743, "sampling/sampling_logp_difference/max": 0.8277056217193604, "sampling/sampling_logp_difference/mean": 0.00788140669465065, "step": 673, "step_time": 7.847182243000134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 402.75, "completions/mean_terminated_length": 402.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.051707192324101925, "epoch": 0.01348, "frac_reward_zero_std": 0.0, "grad_norm": 0.012645255774259567, "kl": 0.6189302831771784, "learning_rate": 2.3714922407781067e-05, "loss": 0.0064, "num_tokens": 29844411.0, "reward": 0.6326873302459717, "reward_std": 0.5682734251022339, "rewards/rollout_reward_func/mean": 0.6326873302459717, "rewards/rollout_reward_func/std": 0.5750466585159302, "sampling/importance_sampling_ratio/max": 1.0304232835769653, "sampling/importance_sampling_ratio/mean": 1.0013824701309204, "sampling/importance_sampling_ratio/min": 0.9874340891838074, "sampling/sampling_logp_difference/max": 0.03039202094078064, "sampling/sampling_logp_difference/mean": 0.0008791614091023803, "step": 674, "step_time": 8.458345060001193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 374.09375, "completions/mean_terminated_length": 374.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07336458330973983, "epoch": 0.0135, "frac_reward_zero_std": 0.25, "grad_norm": 0.028727678582072258, "kl": 0.6424506083130836, "learning_rate": 2.3634791216993637e-05, "loss": -0.0006, "num_tokens": 29885533.0, "reward": 0.7399654388427734, "reward_std": 0.46629756689071655, "rewards/rollout_reward_func/mean": 0.7399654388427734, "rewards/rollout_reward_func/std": 0.5806531310081482, "sampling/importance_sampling_ratio/max": 1.011343002319336, "sampling/importance_sampling_ratio/mean": 0.9978235960006714, "sampling/importance_sampling_ratio/min": 0.9733312726020813, "sampling/sampling_logp_difference/max": 0.027144938707351685, "sampling/sampling_logp_difference/mean": 0.0010558885987848043, "step": 675, "step_time": 7.876105035999899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 357.34375, "completions/mean_terminated_length": 357.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05390730081126094, "epoch": 0.01352, "frac_reward_zero_std": 0.25, "grad_norm": 0.0585114024579525, "kl": 0.8363133296370506, "learning_rate": 2.3554923137149776e-05, "loss": 0.0062, "num_tokens": 29925528.0, "reward": 0.7974623441696167, "reward_std": 0.36028802394866943, "rewards/rollout_reward_func/mean": 0.7974623441696167, "rewards/rollout_reward_func/std": 0.4283779263496399, "sampling/importance_sampling_ratio/max": 1.005143404006958, "sampling/importance_sampling_ratio/mean": 0.9966247081756592, "sampling/importance_sampling_ratio/min": 0.9047825932502747, "sampling/sampling_logp_difference/max": 0.11796164512634277, "sampling/sampling_logp_difference/mean": 0.0014773695729672909, "step": 676, "step_time": 8.378795438000452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 509.4375, "completions/mean_terminated_length": 509.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06354089919477701, "epoch": 0.01354, "frac_reward_zero_std": 0.0, "grad_norm": 0.039094459265470505, "kl": 0.6512551307678223, "learning_rate": 2.347531931248729e-05, "loss": 0.0064, "num_tokens": 29972252.0, "reward": 0.6687696576118469, "reward_std": 0.526627779006958, "rewards/rollout_reward_func/mean": 0.6687696576118469, "rewards/rollout_reward_func/std": 0.5599772930145264, "sampling/importance_sampling_ratio/max": 1.0167508125305176, "sampling/importance_sampling_ratio/mean": 0.9999052882194519, "sampling/importance_sampling_ratio/min": 0.9811890721321106, "sampling/sampling_logp_difference/max": 0.01900315284729004, "sampling/sampling_logp_difference/mean": 0.0008822468807920814, "step": 677, "step_time": 8.1158019279992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 395.65625, "completions/mean_terminated_length": 395.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1048563220538199, "epoch": 0.01356, "frac_reward_zero_std": 0.5, "grad_norm": 0.3359838128089905, "kl": 0.6871050707995892, "learning_rate": 2.3395980883458064e-05, "loss": 0.0036, "num_tokens": 30014471.0, "reward": 0.7787500023841858, "reward_std": 0.41997843980789185, "rewards/rollout_reward_func/mean": 0.7787500023841858, "rewards/rollout_reward_func/std": 0.6117122769355774, "sampling/importance_sampling_ratio/max": 1.1604933738708496, "sampling/importance_sampling_ratio/mean": 1.005637288093567, "sampling/importance_sampling_ratio/min": 0.8821377158164978, "sampling/sampling_logp_difference/max": 0.12915849685668945, "sampling/sampling_logp_difference/mean": 0.0032082798425108194, "step": 678, "step_time": 8.032962509000754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 407.46875, "completions/mean_terminated_length": 407.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08877968415617943, "epoch": 0.01358, "frac_reward_zero_std": 0.0, "grad_norm": 0.12690292298793793, "kl": 0.8865833453601226, "learning_rate": 2.331690898671183e-05, "loss": -0.0046, "num_tokens": 30057538.0, "reward": 0.7115446925163269, "reward_std": 0.6665570139884949, "rewards/rollout_reward_func/mean": 0.7115446925163269, "rewards/rollout_reward_func/std": 0.6429842114448547, "sampling/importance_sampling_ratio/max": 1.0066593885421753, "sampling/importance_sampling_ratio/mean": 0.9980838298797607, "sampling/importance_sampling_ratio/min": 0.961174726486206, "sampling/sampling_logp_difference/max": 0.040839552879333496, "sampling/sampling_logp_difference/mean": 0.0010344458278268576, "step": 679, "step_time": 8.386374299999261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 379.84375, "completions/mean_terminated_length": 379.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08379982737824321, "epoch": 0.0136, "frac_reward_zero_std": 0.25, "grad_norm": 0.05815599858760834, "kl": 0.46609470061957836, "learning_rate": 2.3238104755079733e-05, "loss": -0.0017, "num_tokens": 30099352.0, "reward": 0.6847018003463745, "reward_std": 0.455410361289978, "rewards/rollout_reward_func/mean": 0.6847018003463745, "rewards/rollout_reward_func/std": 0.6929658651351929, "sampling/importance_sampling_ratio/max": 1.008309245109558, "sampling/importance_sampling_ratio/mean": 0.9934828281402588, "sampling/importance_sampling_ratio/min": 0.885874330997467, "sampling/sampling_logp_difference/max": 0.12119150161743164, "sampling/sampling_logp_difference/mean": 0.0021824107971042395, "step": 680, "step_time": 7.98951503400076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 359.84375, "completions/mean_terminated_length": 359.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09441196266561747, "epoch": 0.01362, "frac_reward_zero_std": 0.0, "grad_norm": 0.07100699841976166, "kl": 0.8777330480515957, "learning_rate": 2.315956931755827e-05, "loss": -0.0004, "num_tokens": 30140482.0, "reward": 0.6375373601913452, "reward_std": 0.632695198059082, "rewards/rollout_reward_func/mean": 0.6375373601913452, "rewards/rollout_reward_func/std": 0.6234241127967834, "sampling/importance_sampling_ratio/max": 1.1016957759857178, "sampling/importance_sampling_ratio/mean": 1.002915859222412, "sampling/importance_sampling_ratio/min": 0.9063758850097656, "sampling/sampling_logp_difference/max": 0.09903836250305176, "sampling/sampling_logp_difference/mean": 0.002757760463282466, "step": 681, "step_time": 8.575178449000305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 377.09375, "completions/mean_terminated_length": 377.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.062229108065366745, "epoch": 0.01364, "frac_reward_zero_std": 0.0, "grad_norm": 0.07907839119434357, "kl": 0.6958044432103634, "learning_rate": 2.3081303799292988e-05, "loss": 0.0061, "num_tokens": 30181483.0, "reward": 0.6368851661682129, "reward_std": 0.5918692350387573, "rewards/rollout_reward_func/mean": 0.6368851661682129, "rewards/rollout_reward_func/std": 0.6238979697227478, "sampling/importance_sampling_ratio/max": 1.0198216438293457, "sampling/importance_sampling_ratio/mean": 0.9925670623779297, "sampling/importance_sampling_ratio/min": 0.8817384839057922, "sampling/sampling_logp_difference/max": 0.125779390335083, "sampling/sampling_logp_difference/mean": 0.0029301217291504145, "step": 682, "step_time": 7.917816656000468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 382.71875, "completions/mean_terminated_length": 382.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06804282427765429, "epoch": 0.01366, "frac_reward_zero_std": 0.0, "grad_norm": 0.13786225020885468, "kl": 0.5820475779473782, "learning_rate": 2.300330932156241e-05, "loss": 0.001, "num_tokens": 30223710.0, "reward": 0.7608281373977661, "reward_std": 0.46087372303009033, "rewards/rollout_reward_func/mean": 0.7608281373977661, "rewards/rollout_reward_func/std": 0.45929694175720215, "sampling/importance_sampling_ratio/max": 1.0356743335723877, "sampling/importance_sampling_ratio/mean": 0.9910843372344971, "sampling/importance_sampling_ratio/min": 0.774233877658844, "sampling/sampling_logp_difference/max": 0.24979403614997864, "sampling/sampling_logp_difference/mean": 0.003497482743114233, "step": 683, "step_time": 7.956316081998011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 391.15625, "completions/mean_terminated_length": 391.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04840454086661339, "epoch": 0.01368, "frac_reward_zero_std": 0.25, "grad_norm": 0.02925175242125988, "kl": 0.8872845694422722, "learning_rate": 2.2925587001761992e-05, "loss": 0.0157, "num_tokens": 30264908.0, "reward": 0.8305912017822266, "reward_std": 0.34753695130348206, "rewards/rollout_reward_func/mean": 0.8305912017822266, "rewards/rollout_reward_func/std": 0.40000173449516296, "sampling/importance_sampling_ratio/max": 1.0221441984176636, "sampling/importance_sampling_ratio/mean": 1.000906229019165, "sampling/importance_sampling_ratio/min": 0.9953344464302063, "sampling/sampling_logp_difference/max": 0.022508278489112854, "sampling/sampling_logp_difference/mean": 0.000410175824072212, "step": 684, "step_time": 8.272838195998702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 457.25, "completions/mean_terminated_length": 457.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.058087336365133524, "epoch": 0.0137, "frac_reward_zero_std": 0.25, "grad_norm": 0.02965245395898819, "kl": 0.7452571298927069, "learning_rate": 2.2848137953388085e-05, "loss": 0.0025, "num_tokens": 30309745.0, "reward": 0.737175464630127, "reward_std": 0.42521703243255615, "rewards/rollout_reward_func/mean": 0.737175464630127, "rewards/rollout_reward_func/std": 0.5262540578842163, "sampling/importance_sampling_ratio/max": 1.0377788543701172, "sampling/importance_sampling_ratio/mean": 1.0007884502410889, "sampling/importance_sampling_ratio/min": 0.9899727702140808, "sampling/sampling_logp_difference/max": 0.03679656982421875, "sampling/sampling_logp_difference/mean": 0.000875682570040226, "step": 685, "step_time": 8.787796721997438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 475.9375, "completions/mean_terminated_length": 475.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06819386873394251, "epoch": 0.01372, "frac_reward_zero_std": 0.0, "grad_norm": 0.01466942485421896, "kl": 0.6661853473633528, "learning_rate": 2.2770963286021977e-05, "loss": 0.0011, "num_tokens": 30355164.0, "reward": 0.701575756072998, "reward_std": 0.5494295954704285, "rewards/rollout_reward_func/mean": 0.701575756072998, "rewards/rollout_reward_func/std": 0.5457667112350464, "sampling/importance_sampling_ratio/max": 1.0043699741363525, "sampling/importance_sampling_ratio/mean": 0.9977512359619141, "sampling/importance_sampling_ratio/min": 0.9735260009765625, "sampling/sampling_logp_difference/max": 0.02707725763320923, "sampling/sampling_logp_difference/mean": 0.0007291364599950612, "step": 686, "step_time": 8.255175970996788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 466.5, "completions/mean_terminated_length": 466.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07347183348610997, "epoch": 0.01374, "frac_reward_zero_std": 0.0, "grad_norm": 0.6188807487487793, "kl": 0.6040122047998011, "learning_rate": 2.269406410531401e-05, "loss": -0.0023, "num_tokens": 30400798.0, "reward": 0.7286669015884399, "reward_std": 0.46348556876182556, "rewards/rollout_reward_func/mean": 0.7286669015884399, "rewards/rollout_reward_func/std": 0.47752100229263306, "sampling/importance_sampling_ratio/max": 1.5806912183761597, "sampling/importance_sampling_ratio/mean": 1.0190534591674805, "sampling/importance_sampling_ratio/min": 0.9934998750686646, "sampling/sampling_logp_difference/max": 0.4576452970504761, "sampling/sampling_logp_difference/mean": 0.0033416086807847023, "step": 687, "step_time": 8.657953429998088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 459.375, "completions/mean_terminated_length": 459.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08451933227479458, "epoch": 0.01376, "frac_reward_zero_std": 0.25, "grad_norm": 0.18035602569580078, "kl": 0.6739151375368237, "learning_rate": 2.261744151296778e-05, "loss": 0.0009, "num_tokens": 30446705.0, "reward": 0.70745450258255, "reward_std": 0.5067421197891235, "rewards/rollout_reward_func/mean": 0.70745450258255, "rewards/rollout_reward_func/std": 0.59405517578125, "sampling/importance_sampling_ratio/max": 1.0272185802459717, "sampling/importance_sampling_ratio/mean": 0.9985186457633972, "sampling/importance_sampling_ratio/min": 0.8883277773857117, "sampling/sampling_logp_difference/max": 0.11843538284301758, "sampling/sampling_logp_difference/mean": 0.00305764377117157, "step": 688, "step_time": 8.613237218001814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 450.4375, "completions/mean_terminated_length": 450.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08957357238978148, "epoch": 0.01378, "frac_reward_zero_std": 0.0, "grad_norm": 0.24022544920444489, "kl": 0.7174880318343639, "learning_rate": 2.254109660672425e-05, "loss": 0.0006, "num_tokens": 30491710.0, "reward": 0.5411630868911743, "reward_std": 0.6817464232444763, "rewards/rollout_reward_func/mean": 0.5411630868911743, "rewards/rollout_reward_func/std": 0.6871647238731384, "sampling/importance_sampling_ratio/max": 1.37526273727417, "sampling/importance_sampling_ratio/mean": 1.0100045204162598, "sampling/importance_sampling_ratio/min": 0.8871798515319824, "sampling/sampling_logp_difference/max": 0.21622014045715332, "sampling/sampling_logp_difference/mean": 0.004949123132973909, "step": 689, "step_time": 9.04665987000044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 349.75, "completions/mean_terminated_length": 349.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06835918687283993, "epoch": 0.0138, "frac_reward_zero_std": 0.0, "grad_norm": 0.06370001286268234, "kl": 0.9362319782376289, "learning_rate": 2.246503048034614e-05, "loss": 0.0052, "num_tokens": 30531456.0, "reward": 0.8021026849746704, "reward_std": 0.4927077293395996, "rewards/rollout_reward_func/mean": 0.8021026849746704, "rewards/rollout_reward_func/std": 0.4879705607891083, "sampling/importance_sampling_ratio/max": 1.0272376537322998, "sampling/importance_sampling_ratio/mean": 1.0020332336425781, "sampling/importance_sampling_ratio/min": 0.987278938293457, "sampling/sampling_logp_difference/max": 0.027927398681640625, "sampling/sampling_logp_difference/mean": 0.0009900422301143408, "step": 690, "step_time": 8.682703936001417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 506.71875, "completions/mean_terminated_length": 506.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08281061705201864, "epoch": 0.01382, "frac_reward_zero_std": 0.0, "grad_norm": 0.03272971510887146, "kl": 0.6099750291541568, "learning_rate": 2.2389244223602183e-05, "loss": 0.003, "num_tokens": 30578562.0, "reward": 0.5405476689338684, "reward_std": 0.6178385019302368, "rewards/rollout_reward_func/mean": 0.5405476689338684, "rewards/rollout_reward_func/std": 0.6385840773582458, "sampling/importance_sampling_ratio/max": 1.0088013410568237, "sampling/importance_sampling_ratio/mean": 0.9967511892318726, "sampling/importance_sampling_ratio/min": 0.9044874906539917, "sampling/sampling_logp_difference/max": 0.10049760341644287, "sampling/sampling_logp_difference/mean": 0.001327336998656392, "step": 691, "step_time": 8.717159298996194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 520.78125, "completions/mean_terminated_length": 520.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0740141342394054, "epoch": 0.01384, "frac_reward_zero_std": 0.0, "grad_norm": 0.05236506089568138, "kl": 0.5955845303833485, "learning_rate": 2.2313738922251513e-05, "loss": 0.0011, "num_tokens": 30626567.0, "reward": 0.5729681253433228, "reward_std": 0.6308143734931946, "rewards/rollout_reward_func/mean": 0.5729681253433228, "rewards/rollout_reward_func/std": 0.6347636580467224, "sampling/importance_sampling_ratio/max": 1.0097332000732422, "sampling/importance_sampling_ratio/mean": 0.9985445141792297, "sampling/importance_sampling_ratio/min": 0.9135587215423584, "sampling/sampling_logp_difference/max": 0.08823132514953613, "sampling/sampling_logp_difference/mean": 0.0014047815930098295, "step": 692, "step_time": 8.74872496399803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 339.5625, "completions/mean_terminated_length": 339.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09089602576568723, "epoch": 0.01386, "frac_reward_zero_std": 0.25, "grad_norm": 0.04851621016860008, "kl": 0.7705509588122368, "learning_rate": 2.223851565802816e-05, "loss": -0.0027, "num_tokens": 30667099.0, "reward": 0.5865253806114197, "reward_std": 0.554939329624176, "rewards/rollout_reward_func/mean": 0.5865253806114197, "rewards/rollout_reward_func/std": 0.76070636510849, "sampling/importance_sampling_ratio/max": 1.0911945104599, "sampling/importance_sampling_ratio/mean": 1.0002350807189941, "sampling/importance_sampling_ratio/min": 0.8895898461341858, "sampling/sampling_logp_difference/max": 0.11700558662414551, "sampling/sampling_logp_difference/mean": 0.0025558611378073692, "step": 693, "step_time": 8.327362446005282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 425.25, "completions/mean_terminated_length": 425.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09215330053120852, "epoch": 0.01388, "frac_reward_zero_std": 0.0, "grad_norm": 0.049205370247364044, "kl": 0.6721356362104416, "learning_rate": 2.2163575508625508e-05, "loss": -0.0085, "num_tokens": 30712210.0, "reward": 0.5169788599014282, "reward_std": 0.7110248804092407, "rewards/rollout_reward_func/mean": 0.5169788599014282, "rewards/rollout_reward_func/std": 0.7262645363807678, "sampling/importance_sampling_ratio/max": 1.1293210983276367, "sampling/importance_sampling_ratio/mean": 1.0037877559661865, "sampling/importance_sampling_ratio/min": 0.9737599492073059, "sampling/sampling_logp_difference/max": 0.12278938293457031, "sampling/sampling_logp_difference/mean": 0.0018340940587222576, "step": 694, "step_time": 8.867670374000227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 533.625, "completions/mean_terminated_length": 533.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08848231611773372, "epoch": 0.0139, "frac_reward_zero_std": 0.0, "grad_norm": 0.0532105527818203, "kl": 0.7978042010217905, "learning_rate": 2.2088919547680875e-05, "loss": 0.0015, "num_tokens": 30760726.0, "reward": 0.7622097134590149, "reward_std": 0.47305548191070557, "rewards/rollout_reward_func/mean": 0.7622097134590149, "rewards/rollout_reward_func/std": 0.4566234052181244, "sampling/importance_sampling_ratio/max": 1.0976258516311646, "sampling/importance_sampling_ratio/mean": 0.9957470893859863, "sampling/importance_sampling_ratio/min": 0.8223937153816223, "sampling/sampling_logp_difference/max": 0.19336450099945068, "sampling/sampling_logp_difference/mean": 0.0030975686386227608, "step": 695, "step_time": 9.303740321000078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 413.4375, "completions/mean_terminated_length": 413.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08315974473953247, "epoch": 0.01392, "frac_reward_zero_std": 0.25, "grad_norm": 0.10111155360937119, "kl": 0.8204970955848694, "learning_rate": 2.2014548844760106e-05, "loss": 0.0073, "num_tokens": 30803405.0, "reward": 0.8300632238388062, "reward_std": 0.3476952910423279, "rewards/rollout_reward_func/mean": 0.8300632238388062, "rewards/rollout_reward_func/std": 0.40134555101394653, "sampling/importance_sampling_ratio/max": 1.0182660818099976, "sampling/importance_sampling_ratio/mean": 0.997381865978241, "sampling/importance_sampling_ratio/min": 0.9608501195907593, "sampling/sampling_logp_difference/max": 0.04217112064361572, "sampling/sampling_logp_difference/mean": 0.0011372624430805445, "step": 696, "step_time": 8.317642472999069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 406.65625, "completions/mean_terminated_length": 406.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06757025863043964, "epoch": 0.01394, "frac_reward_zero_std": 0.25, "grad_norm": 0.05333234742283821, "kl": 0.8333413302898407, "learning_rate": 2.194046446534231e-05, "loss": 0.0106, "num_tokens": 30845802.0, "reward": 0.7710962295532227, "reward_std": 0.4117854833602905, "rewards/rollout_reward_func/mean": 0.7710962295532227, "rewards/rollout_reward_func/std": 0.5078986883163452, "sampling/importance_sampling_ratio/max": 1.012298345565796, "sampling/importance_sampling_ratio/mean": 1.000737190246582, "sampling/importance_sampling_ratio/min": 0.9876172542572021, "sampling/sampling_logp_difference/max": 0.012476682662963867, "sampling/sampling_logp_difference/mean": 0.000733252614736557, "step": 697, "step_time": 8.532395994998296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 425.5625, "completions/mean_terminated_length": 425.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09312394447624683, "epoch": 0.01396, "frac_reward_zero_std": 0.25, "grad_norm": 0.12501260638237, "kl": 1.4119931743480265, "learning_rate": 2.186666747080451e-05, "loss": 0.0028, "num_tokens": 30890245.0, "reward": 0.6771824359893799, "reward_std": 0.5184401273727417, "rewards/rollout_reward_func/mean": 0.6771824359893799, "rewards/rollout_reward_func/std": 0.6561891436576843, "sampling/importance_sampling_ratio/max": 1.1273730993270874, "sampling/importance_sampling_ratio/mean": 0.9992468357086182, "sampling/importance_sampling_ratio/min": 0.8864046931266785, "sampling/sampling_logp_difference/max": 0.1238851547241211, "sampling/sampling_logp_difference/mean": 0.0026977183297276497, "step": 698, "step_time": 8.458531014997789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 401.84375, "completions/mean_terminated_length": 401.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06269402941688895, "epoch": 0.01398, "frac_reward_zero_std": 0.0, "grad_norm": 0.009596828371286392, "kl": 0.6824464164674282, "learning_rate": 2.179315891840649e-05, "loss": 0.0062, "num_tokens": 30932215.0, "reward": 0.7609487175941467, "reward_std": 0.45996642112731934, "rewards/rollout_reward_func/mean": 0.7609487175941467, "rewards/rollout_reward_func/std": 0.45906421542167664, "sampling/importance_sampling_ratio/max": 1.0128535032272339, "sampling/importance_sampling_ratio/mean": 1.0003390312194824, "sampling/importance_sampling_ratio/min": 0.9837994575500488, "sampling/sampling_logp_difference/max": 0.015410363674163818, "sampling/sampling_logp_difference/mean": 0.0007692252402193844, "step": 699, "step_time": 8.809515760996874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 485.1875, "completions/mean_terminated_length": 485.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09206678159534931, "epoch": 0.014, "frac_reward_zero_std": 0.0, "grad_norm": 0.060553982853889465, "kl": 0.680925115942955, "learning_rate": 2.1719939861275683e-05, "loss": 0.0076, "num_tokens": 30978666.0, "reward": 0.6688923835754395, "reward_std": 0.5760442614555359, "rewards/rollout_reward_func/mean": 0.6688923835754395, "rewards/rollout_reward_func/std": 0.560004472732544, "sampling/importance_sampling_ratio/max": 1.112038016319275, "sampling/importance_sampling_ratio/mean": 0.9999260902404785, "sampling/importance_sampling_ratio/min": 0.8845908045768738, "sampling/sampling_logp_difference/max": 0.11939024925231934, "sampling/sampling_logp_difference/mean": 0.0022244395222514868, "step": 700, "step_time": 9.160705574997337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 408.53125, "completions/mean_terminated_length": 408.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0955426525324583, "epoch": 0.01402, "frac_reward_zero_std": 0.0, "grad_norm": 0.0684574544429779, "kl": 0.8002737239003181, "learning_rate": 2.1647011348391974e-05, "loss": -0.009, "num_tokens": 31023179.0, "reward": 0.6107186675071716, "reward_std": 0.6595156192779541, "rewards/rollout_reward_func/mean": 0.6107186675071716, "rewards/rollout_reward_func/std": 0.6732198596000671, "sampling/importance_sampling_ratio/max": 1.262300729751587, "sampling/importance_sampling_ratio/mean": 1.0029263496398926, "sampling/importance_sampling_ratio/min": 0.8855013847351074, "sampling/sampling_logp_difference/max": 0.23291778564453125, "sampling/sampling_logp_difference/mean": 0.003343454794958234, "step": 701, "step_time": 8.101104455998211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 418.9375, "completions/mean_terminated_length": 418.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08873784774914384, "epoch": 0.01404, "frac_reward_zero_std": 0.0, "grad_norm": 0.0799802616238594, "kl": 0.7679410055279732, "learning_rate": 2.1574374424572795e-05, "loss": 0.0037, "num_tokens": 31067493.0, "reward": 0.5122722387313843, "reward_std": 0.7114952206611633, "rewards/rollout_reward_func/mean": 0.5122722387313843, "rewards/rollout_reward_func/std": 0.6833759546279907, "sampling/importance_sampling_ratio/max": 1.1175285577774048, "sampling/importance_sampling_ratio/mean": 1.001922369003296, "sampling/importance_sampling_ratio/min": 0.9706324338912964, "sampling/sampling_logp_difference/max": 0.111236572265625, "sampling/sampling_logp_difference/mean": 0.0018288218416273594, "step": 702, "step_time": 8.546637308001664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 378.625, "completions/mean_terminated_length": 378.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06624843180179596, "epoch": 0.01406, "frac_reward_zero_std": 0.0, "grad_norm": 0.02068677358329296, "kl": 1.062006313353777, "learning_rate": 2.1502030130458064e-05, "loss": 0.0095, "num_tokens": 31108563.0, "reward": 0.6693350076675415, "reward_std": 0.5503820180892944, "rewards/rollout_reward_func/mean": 0.6693350076675415, "rewards/rollout_reward_func/std": 0.55806565284729, "sampling/importance_sampling_ratio/max": 1.0121431350708008, "sampling/importance_sampling_ratio/mean": 0.9993270635604858, "sampling/importance_sampling_ratio/min": 0.9851006269454956, "sampling/sampling_logp_difference/max": 0.01778927445411682, "sampling/sampling_logp_difference/mean": 0.0006022069719620049, "step": 703, "step_time": 8.395167313998172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 382.46875, "completions/mean_terminated_length": 382.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07016020361334085, "epoch": 0.01408, "frac_reward_zero_std": 0.25, "grad_norm": 0.10751023888587952, "kl": 0.731179803609848, "learning_rate": 2.1429979502495326e-05, "loss": -0.0008, "num_tokens": 31150080.0, "reward": 0.7351610660552979, "reward_std": 0.44383394718170166, "rewards/rollout_reward_func/mean": 0.7351610660552979, "rewards/rollout_reward_func/std": 0.5296246409416199, "sampling/importance_sampling_ratio/max": 1.0120829343795776, "sampling/importance_sampling_ratio/mean": 0.9955103993415833, "sampling/importance_sampling_ratio/min": 0.8929188251495361, "sampling/sampling_logp_difference/max": 0.11323881149291992, "sampling/sampling_logp_difference/mean": 0.0016853299457579851, "step": 704, "step_time": 9.394566085999031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 396.34375, "completions/mean_terminated_length": 396.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07232835050672293, "epoch": 0.0141, "frac_reward_zero_std": 0.25, "grad_norm": 0.006922692526131868, "kl": 0.5819394380087033, "learning_rate": 2.1358223572924887e-05, "loss": 0.0036, "num_tokens": 31192432.0, "reward": 0.8391402959823608, "reward_std": 0.3742592930793762, "rewards/rollout_reward_func/mean": 0.8391402959823608, "rewards/rollout_reward_func/std": 0.45509329438209534, "sampling/importance_sampling_ratio/max": 1.0790959596633911, "sampling/importance_sampling_ratio/mean": 1.0023009777069092, "sampling/importance_sampling_ratio/min": 0.9872984290122986, "sampling/sampling_logp_difference/max": 0.07754254341125488, "sampling/sampling_logp_difference/mean": 0.0011400587391108274, "step": 705, "step_time": 8.09558449999895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 379.40625, "completions/mean_terminated_length": 379.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0778978755697608, "epoch": 0.01412, "frac_reward_zero_std": 0.0, "grad_norm": 0.056680113077163696, "kl": 0.8257292583584785, "learning_rate": 2.1286763369765023e-05, "loss": 0.0003, "num_tokens": 31234336.0, "reward": 0.7103322744369507, "reward_std": 0.5382257699966431, "rewards/rollout_reward_func/mean": 0.7103322744369507, "rewards/rollout_reward_func/std": 0.5876783132553101, "sampling/importance_sampling_ratio/max": 1.2253143787384033, "sampling/importance_sampling_ratio/mean": 1.0127495527267456, "sampling/importance_sampling_ratio/min": 0.9748939275741577, "sampling/sampling_logp_difference/max": 0.2021808624267578, "sampling/sampling_logp_difference/mean": 0.0034166821278631687, "step": 706, "step_time": 8.356184238999049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 460.8125, "completions/mean_terminated_length": 460.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07594718411564827, "epoch": 0.01414, "frac_reward_zero_std": 0.0, "grad_norm": 0.014300907962024212, "kl": 0.6278905365616083, "learning_rate": 2.121559991679726e-05, "loss": -0.0061, "num_tokens": 31280260.0, "reward": 0.5443823337554932, "reward_std": 0.6931739449501038, "rewards/rollout_reward_func/mean": 0.5443823337554932, "rewards/rollout_reward_func/std": 0.6828132271766663, "sampling/importance_sampling_ratio/max": 1.0133527517318726, "sampling/importance_sampling_ratio/mean": 0.9988926649093628, "sampling/importance_sampling_ratio/min": 0.9614740610122681, "sampling/sampling_logp_difference/max": 0.03876465559005737, "sampling/sampling_logp_difference/mean": 0.0008880077512003481, "step": 707, "step_time": 8.48344844500025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 412.375, "completions/mean_terminated_length": 412.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06097755115479231, "epoch": 0.01416, "frac_reward_zero_std": 0.0, "grad_norm": 0.03172498568892479, "kl": 0.7081140801310539, "learning_rate": 2.114473423355169e-05, "loss": 0.0095, "num_tokens": 31323295.0, "reward": 0.7978514432907104, "reward_std": 0.44021350145339966, "rewards/rollout_reward_func/mean": 0.7978514432907104, "rewards/rollout_reward_func/std": 0.42756250500679016, "sampling/importance_sampling_ratio/max": 1.1326044797897339, "sampling/importance_sampling_ratio/mean": 1.0025041103363037, "sampling/importance_sampling_ratio/min": 0.989678144454956, "sampling/sampling_logp_difference/max": 0.12358617782592773, "sampling/sampling_logp_difference/mean": 0.001430958742275834, "step": 708, "step_time": 8.453162375999455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 448.5625, "completions/mean_terminated_length": 448.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07995165791362524, "epoch": 0.01418, "frac_reward_zero_std": 0.0, "grad_norm": 0.06949806958436966, "kl": 0.7905565351247787, "learning_rate": 2.107416733529243e-05, "loss": -0.0073, "num_tokens": 31368604.0, "reward": 0.734040379524231, "reward_std": 0.5388706922531128, "rewards/rollout_reward_func/mean": 0.734040379524231, "rewards/rollout_reward_func/std": 0.532521665096283, "sampling/importance_sampling_ratio/max": 1.010536789894104, "sampling/importance_sampling_ratio/mean": 0.9989596605300903, "sampling/importance_sampling_ratio/min": 0.9833791255950928, "sampling/sampling_logp_difference/max": 0.016787096858024597, "sampling/sampling_logp_difference/mean": 0.0008341345237568021, "step": 709, "step_time": 9.41681830699963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 383.40625, "completions/mean_terminated_length": 383.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05069296481087804, "epoch": 0.0142, "frac_reward_zero_std": 0.25, "grad_norm": 0.03298093006014824, "kl": 0.8904747143387794, "learning_rate": 2.1003900233002952e-05, "loss": 0.0038, "num_tokens": 31410151.0, "reward": 0.7298809289932251, "reward_std": 0.3806609809398651, "rewards/rollout_reward_func/mean": 0.7298809289932251, "rewards/rollout_reward_func/std": 0.4754309058189392, "sampling/importance_sampling_ratio/max": 1.008479118347168, "sampling/importance_sampling_ratio/mean": 0.9958246946334839, "sampling/importance_sampling_ratio/min": 0.8912842869758606, "sampling/sampling_logp_difference/max": 0.11509895324707031, "sampling/sampling_logp_difference/mean": 0.0012615188024938107, "step": 710, "step_time": 8.361099334000755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 489.6875, "completions/mean_terminated_length": 489.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06468295864760876, "epoch": 0.01422, "frac_reward_zero_std": 0.0, "grad_norm": 0.048092909157276154, "kl": 0.792800635099411, "learning_rate": 2.0933933933371724e-05, "loss": 0.0033, "num_tokens": 31456701.0, "reward": 0.6598241329193115, "reward_std": 0.5342763662338257, "rewards/rollout_reward_func/mean": 0.6598241329193115, "rewards/rollout_reward_func/std": 0.5127437114715576, "sampling/importance_sampling_ratio/max": 1.0197795629501343, "sampling/importance_sampling_ratio/mean": 1.0018861293792725, "sampling/importance_sampling_ratio/min": 0.993044912815094, "sampling/sampling_logp_difference/max": 0.019568845629692078, "sampling/sampling_logp_difference/mean": 0.0006724970880895853, "step": 711, "step_time": 8.615654095001446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 467.15625, "completions/mean_terminated_length": 467.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0605140186380595, "epoch": 0.01424, "frac_reward_zero_std": 0.0, "grad_norm": 0.026301465928554535, "kl": 0.5935965813696384, "learning_rate": 2.086426943877771e-05, "loss": 0.0026, "num_tokens": 31501777.0, "reward": 0.6946340799331665, "reward_std": 0.47835609316825867, "rewards/rollout_reward_func/mean": 0.6946340799331665, "rewards/rollout_reward_func/std": 0.4960525333881378, "sampling/importance_sampling_ratio/max": 1.0135188102722168, "sampling/importance_sampling_ratio/mean": 0.9998078346252441, "sampling/importance_sampling_ratio/min": 0.9808630347251892, "sampling/sampling_logp_difference/max": 0.018451184034347534, "sampling/sampling_logp_difference/mean": 0.0008182705496437848, "step": 712, "step_time": 7.915937268999187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 379.09375, "completions/mean_terminated_length": 379.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05358068970963359, "epoch": 0.01426, "frac_reward_zero_std": 0.0, "grad_norm": 0.03850352764129639, "kl": 0.6709879115223885, "learning_rate": 2.0794907747276056e-05, "loss": -0.0024, "num_tokens": 31543801.0, "reward": 0.7326384782791138, "reward_std": 0.5373206734657288, "rewards/rollout_reward_func/mean": 0.7326384782791138, "rewards/rollout_reward_func/std": 0.5349520444869995, "sampling/importance_sampling_ratio/max": 1.125827670097351, "sampling/importance_sampling_ratio/mean": 1.0044331550598145, "sampling/importance_sampling_ratio/min": 0.890924870967865, "sampling/sampling_logp_difference/max": 0.11849761009216309, "sampling/sampling_logp_difference/mean": 0.0029391299467533827, "step": 713, "step_time": 7.781498124999416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 347.375, "completions/mean_terminated_length": 347.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.062013591174036264, "epoch": 0.01428, "frac_reward_zero_std": 0.0, "grad_norm": 0.04697725549340248, "kl": 1.0858843550086021, "learning_rate": 2.072584985258375e-05, "loss": 0.0059, "num_tokens": 31584119.0, "reward": 0.5371172428131104, "reward_std": 0.6469892859458923, "rewards/rollout_reward_func/mean": 0.5371172428131104, "rewards/rollout_reward_func/std": 0.641907274723053, "sampling/importance_sampling_ratio/max": 1.0203386545181274, "sampling/importance_sampling_ratio/mean": 1.0003912448883057, "sampling/importance_sampling_ratio/min": 0.9719997048377991, "sampling/sampling_logp_difference/max": 0.027707040309906006, "sampling/sampling_logp_difference/mean": 0.000811964797321707, "step": 714, "step_time": 8.888431727998977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 370.8125, "completions/mean_terminated_length": 370.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05418798141181469, "epoch": 0.0143, "frac_reward_zero_std": 0.25, "grad_norm": 0.00788819044828415, "kl": 0.7944666258990765, "learning_rate": 2.0657096744065408e-05, "loss": 0.0173, "num_tokens": 31624880.0, "reward": 0.7994429469108582, "reward_std": 0.35624802112579346, "rewards/rollout_reward_func/mean": 0.7994429469108582, "rewards/rollout_reward_func/std": 0.4243275821208954, "sampling/importance_sampling_ratio/max": 1.0108373165130615, "sampling/importance_sampling_ratio/mean": 0.9979993104934692, "sampling/importance_sampling_ratio/min": 0.9581788778305054, "sampling/sampling_logp_difference/max": 0.042711660265922546, "sampling/sampling_logp_difference/mean": 0.0008449458982795477, "step": 715, "step_time": 7.789165005999166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 334.34375, "completions/mean_terminated_length": 334.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06332942983135581, "epoch": 0.01432, "frac_reward_zero_std": 0.0, "grad_norm": 0.06657620519399643, "kl": 1.181150970980525, "learning_rate": 2.0588649406719114e-05, "loss": -0.0023, "num_tokens": 31664388.0, "reward": 0.6444187164306641, "reward_std": 0.6803231239318848, "rewards/rollout_reward_func/mean": 0.6444187164306641, "rewards/rollout_reward_func/std": 0.6638475656509399, "sampling/importance_sampling_ratio/max": 1.008560299873352, "sampling/importance_sampling_ratio/mean": 0.9979396462440491, "sampling/importance_sampling_ratio/min": 0.8904808163642883, "sampling/sampling_logp_difference/max": 0.12220168113708496, "sampling/sampling_logp_difference/mean": 0.001663900213316083, "step": 716, "step_time": 7.844223044998216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 382.1875, "completions/mean_terminated_length": 382.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.054643726674839854, "epoch": 0.01434, "frac_reward_zero_std": 0.0, "grad_norm": 0.18250413239002228, "kl": 0.9968258962035179, "learning_rate": 2.0520508821162264e-05, "loss": 0.0149, "num_tokens": 31704839.0, "reward": 0.7312221527099609, "reward_std": 0.4833773076534271, "rewards/rollout_reward_func/mean": 0.7312221527099609, "rewards/rollout_reward_func/std": 0.4730028808116913, "sampling/importance_sampling_ratio/max": 1.133963704109192, "sampling/importance_sampling_ratio/mean": 1.003756046295166, "sampling/importance_sampling_ratio/min": 0.989071249961853, "sampling/sampling_logp_difference/max": 0.1235361099243164, "sampling/sampling_logp_difference/mean": 0.0012569925747811794, "step": 717, "step_time": 8.03366662499866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 421.9375, "completions/mean_terminated_length": 421.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06673922017216682, "epoch": 0.01436, "frac_reward_zero_std": 0.5, "grad_norm": 0.02008342370390892, "kl": 1.00664484500885, "learning_rate": 2.045267596361759e-05, "loss": -0.0024, "num_tokens": 31748723.0, "reward": 0.708114743232727, "reward_std": 0.36111658811569214, "rewards/rollout_reward_func/mean": 0.708114743232727, "rewards/rollout_reward_func/std": 0.5934668779373169, "sampling/importance_sampling_ratio/max": 1.1293929815292358, "sampling/importance_sampling_ratio/mean": 1.0038220882415771, "sampling/importance_sampling_ratio/min": 0.9838283658027649, "sampling/sampling_logp_difference/max": 0.12169003486633301, "sampling/sampling_logp_difference/mean": 0.0015412094071507454, "step": 718, "step_time": 8.044850313999632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 462.375, "completions/mean_terminated_length": 462.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07609696872532368, "epoch": 0.01438, "frac_reward_zero_std": 0.25, "grad_norm": 0.25840598344802856, "kl": 2.3795052990317345, "learning_rate": 2.0385151805899068e-05, "loss": 0.0059, "num_tokens": 31794081.0, "reward": 0.7074573040008545, "reward_std": 0.46945035457611084, "rewards/rollout_reward_func/mean": 0.7074573040008545, "rewards/rollout_reward_func/std": 0.5958166122436523, "sampling/importance_sampling_ratio/max": 1.0137945413589478, "sampling/importance_sampling_ratio/mean": 0.9951396584510803, "sampling/importance_sampling_ratio/min": 0.851798415184021, "sampling/sampling_logp_difference/max": 0.12438726425170898, "sampling/sampling_logp_difference/mean": 0.002040750812739134, "step": 719, "step_time": 9.463550737998958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 374.65625, "completions/mean_terminated_length": 374.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06712525896728039, "epoch": 0.0144, "frac_reward_zero_std": 0.0, "grad_norm": 0.04105393961071968, "kl": 0.7952616959810257, "learning_rate": 2.03179373153981e-05, "loss": 0.0012, "num_tokens": 31835731.0, "reward": 0.6065475940704346, "reward_std": 0.6102534532546997, "rewards/rollout_reward_func/mean": 0.6065475940704346, "rewards/rollout_reward_func/std": 0.6261217594146729, "sampling/importance_sampling_ratio/max": 1.0180634260177612, "sampling/importance_sampling_ratio/mean": 0.9981600046157837, "sampling/importance_sampling_ratio/min": 0.891973614692688, "sampling/sampling_logp_difference/max": 0.1143350601196289, "sampling/sampling_logp_difference/mean": 0.0015944496262818575, "step": 720, "step_time": 8.393786743001328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 420.84375, "completions/mean_terminated_length": 420.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06775534572079778, "epoch": 0.01442, "frac_reward_zero_std": 0.0, "grad_norm": 0.04887064918875694, "kl": 0.6215868908911943, "learning_rate": 2.025103345506961e-05, "loss": 0.011, "num_tokens": 31878791.0, "reward": 0.7649084329605103, "reward_std": 0.45332273840904236, "rewards/rollout_reward_func/mean": 0.7649084329605103, "rewards/rollout_reward_func/std": 0.4514136016368866, "sampling/importance_sampling_ratio/max": 1.011767029762268, "sampling/importance_sampling_ratio/mean": 0.99580979347229, "sampling/importance_sampling_ratio/min": 0.8899397253990173, "sampling/sampling_logp_difference/max": 0.11660408973693848, "sampling/sampling_logp_difference/mean": 0.0015742974355816841, "step": 721, "step_time": 8.550338835997536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 360.71875, "completions/mean_terminated_length": 360.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07301952550187707, "epoch": 0.01444, "frac_reward_zero_std": 0.0, "grad_norm": 0.039390772581100464, "kl": 0.730816064402461, "learning_rate": 2.0184441183418213e-05, "loss": -0.0005, "num_tokens": 31919345.0, "reward": 0.7330060005187988, "reward_std": 0.5231388807296753, "rewards/rollout_reward_func/mean": 0.7330060005187988, "rewards/rollout_reward_func/std": 0.5345431566238403, "sampling/importance_sampling_ratio/max": 1.0117323398590088, "sampling/importance_sampling_ratio/mean": 1.0009199380874634, "sampling/importance_sampling_ratio/min": 0.9894827008247375, "sampling/sampling_logp_difference/max": 0.011655889451503754, "sampling/sampling_logp_difference/mean": 0.0007363302865996957, "step": 722, "step_time": 8.376868002002084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06336927181109786, "epoch": 0.01446, "frac_reward_zero_std": 0.0, "grad_norm": 0.007101715076714754, "kl": 0.9589081555604935, "learning_rate": 2.011816145448457e-05, "loss": 0.0078, "num_tokens": 31958878.0, "reward": 0.734939694404602, "reward_std": 0.5191140174865723, "rewards/rollout_reward_func/mean": 0.734939694404602, "rewards/rollout_reward_func/std": 0.5312469601631165, "sampling/importance_sampling_ratio/max": 1.1208473443984985, "sampling/importance_sampling_ratio/mean": 1.0012487173080444, "sampling/importance_sampling_ratio/min": 0.8897659182548523, "sampling/sampling_logp_difference/max": 0.11679863929748535, "sampling/sampling_logp_difference/mean": 0.0022639725357294083, "step": 723, "step_time": 8.800011560999337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 505.53125, "completions/mean_terminated_length": 505.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09469800349324942, "epoch": 0.01448, "frac_reward_zero_std": 0.0, "grad_norm": 0.12783776223659515, "kl": 1.2069505713880062, "learning_rate": 2.0052195217831626e-05, "loss": 0.0053, "num_tokens": 32006447.0, "reward": 0.5728650093078613, "reward_std": 0.6024256944656372, "rewards/rollout_reward_func/mean": 0.5728650093078613, "rewards/rollout_reward_func/std": 0.6347174048423767, "sampling/importance_sampling_ratio/max": 1.1374822854995728, "sampling/importance_sampling_ratio/mean": 1.0049268007278442, "sampling/importance_sampling_ratio/min": 0.9882723689079285, "sampling/sampling_logp_difference/max": 0.12282037734985352, "sampling/sampling_logp_difference/mean": 0.0016660807887092233, "step": 724, "step_time": 8.957271193999986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 511.0, "completions/mean_terminated_length": 511.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08325353916734457, "epoch": 0.0145, "frac_reward_zero_std": 0.25, "grad_norm": 0.05185597389936447, "kl": 0.987035259604454, "learning_rate": 1.9986543418531065e-05, "loss": 0.0061, "num_tokens": 32053417.0, "reward": 0.7290469408035278, "reward_std": 0.40536898374557495, "rewards/rollout_reward_func/mean": 0.7290469408035278, "rewards/rollout_reward_func/std": 0.4769047200679779, "sampling/importance_sampling_ratio/max": 1.0957214832305908, "sampling/importance_sampling_ratio/mean": 0.9997626543045044, "sampling/importance_sampling_ratio/min": 0.8775330781936646, "sampling/sampling_logp_difference/max": 0.1327838897705078, "sampling/sampling_logp_difference/mean": 0.0021620960906147957, "step": 725, "step_time": 8.773772676002409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 371.3125, "completions/mean_terminated_length": 371.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08054055226966739, "epoch": 0.01452, "frac_reward_zero_std": 0.0, "grad_norm": 0.014463015832006931, "kl": 0.5507130976766348, "learning_rate": 1.992120699714976e-05, "loss": 0.0063, "num_tokens": 32095582.0, "reward": 0.7639446258544922, "reward_std": 0.4711016118526459, "rewards/rollout_reward_func/mean": 0.7639446258544922, "rewards/rollout_reward_func/std": 0.45336782932281494, "sampling/importance_sampling_ratio/max": 1.0396254062652588, "sampling/importance_sampling_ratio/mean": 1.0008525848388672, "sampling/importance_sampling_ratio/min": 0.9860002398490906, "sampling/sampling_logp_difference/max": 0.038878023624420166, "sampling/sampling_logp_difference/mean": 0.0011717670131474733, "step": 726, "step_time": 8.464811323001413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 311.15625, "completions/mean_terminated_length": 311.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06641333922743797, "epoch": 0.01454, "frac_reward_zero_std": 0.25, "grad_norm": 0.019747065380215645, "kl": 0.9095248617231846, "learning_rate": 1.9856186889736272e-05, "loss": 0.0154, "num_tokens": 32132386.0, "reward": 0.7032225131988525, "reward_std": 0.4292125105857849, "rewards/rollout_reward_func/mean": 0.7032225131988525, "rewards/rollout_reward_func/std": 0.5434975028038025, "sampling/importance_sampling_ratio/max": 1.0996222496032715, "sampling/importance_sampling_ratio/mean": 1.00205659866333, "sampling/importance_sampling_ratio/min": 0.9811133742332458, "sampling/sampling_logp_difference/max": 0.09433197975158691, "sampling/sampling_logp_difference/mean": 0.001631037681363523, "step": 727, "step_time": 8.153791817998354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 522.25, "completions/mean_terminated_length": 522.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0970228286460042, "epoch": 0.01456, "frac_reward_zero_std": 0.0, "grad_norm": 0.030987229198217392, "kl": 0.8447102531790733, "learning_rate": 1.9791484027807483e-05, "loss": 0.0028, "num_tokens": 32180646.0, "reward": 0.6334642767906189, "reward_std": 0.5684990882873535, "rewards/rollout_reward_func/mean": 0.6334642767906189, "rewards/rollout_reward_func/std": 0.5740100145339966, "sampling/importance_sampling_ratio/max": 1.0638834238052368, "sampling/importance_sampling_ratio/mean": 0.9999274611473083, "sampling/importance_sampling_ratio/min": 0.8856787085533142, "sampling/sampling_logp_difference/max": 0.12140297889709473, "sampling/sampling_logp_difference/mean": 0.0024334071204066277, "step": 728, "step_time": 9.881650267001532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 448.71875, "completions/mean_terminated_length": 448.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10357665084302425, "epoch": 0.01458, "frac_reward_zero_std": 0.0, "grad_norm": 0.0450408011674881, "kl": 0.5246602213010192, "learning_rate": 1.9727099338335202e-05, "loss": -0.0085, "num_tokens": 32226275.0, "reward": 0.7113949060440063, "reward_std": 0.6097284555435181, "rewards/rollout_reward_func/mean": 0.7113949060440063, "rewards/rollout_reward_func/std": 0.6414049863815308, "sampling/importance_sampling_ratio/max": 1.026848554611206, "sampling/importance_sampling_ratio/mean": 0.9931608438491821, "sampling/importance_sampling_ratio/min": 0.8905016183853149, "sampling/sampling_logp_difference/max": 0.11598563194274902, "sampling/sampling_logp_difference/mean": 0.002458244329318404, "step": 729, "step_time": 8.542894971998976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 521.6875, "completions/mean_terminated_length": 521.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09001689869910479, "epoch": 0.0146, "frac_reward_zero_std": 0.0, "grad_norm": 0.030530409887433052, "kl": 0.7581933550536633, "learning_rate": 1.9663033743732944e-05, "loss": 0.0059, "num_tokens": 32273615.0, "reward": 0.6353811025619507, "reward_std": 0.5816757678985596, "rewards/rollout_reward_func/mean": 0.6353811025619507, "rewards/rollout_reward_func/std": 0.571584165096283, "sampling/importance_sampling_ratio/max": 1.0140897035598755, "sampling/importance_sampling_ratio/mean": 0.9946821928024292, "sampling/importance_sampling_ratio/min": 0.8542726635932922, "sampling/sampling_logp_difference/max": 0.15542370080947876, "sampling/sampling_logp_difference/mean": 0.002136496128514409, "step": 730, "step_time": 8.779368408000664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 327.65625, "completions/mean_terminated_length": 327.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0814457181841135, "epoch": 0.01462, "frac_reward_zero_std": 0.25, "grad_norm": 0.022589795291423798, "kl": 0.6501888735219836, "learning_rate": 1.9599288161842634e-05, "loss": 0.0004, "num_tokens": 32312635.0, "reward": 0.8367836475372314, "reward_std": 0.39571237564086914, "rewards/rollout_reward_func/mean": 0.8367836475372314, "rewards/rollout_reward_func/std": 0.45967191457748413, "sampling/importance_sampling_ratio/max": 1.026703119277954, "sampling/importance_sampling_ratio/mean": 0.9975690841674805, "sampling/importance_sampling_ratio/min": 0.8910241723060608, "sampling/sampling_logp_difference/max": 0.11539745330810547, "sampling/sampling_logp_difference/mean": 0.0018996675498783588, "step": 731, "step_time": 7.984839429000203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 537.53125, "completions/mean_terminated_length": 537.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11065898835659027, "epoch": 0.01464, "frac_reward_zero_std": 0.0, "grad_norm": 0.07068422436714172, "kl": 0.7615730185061693, "learning_rate": 1.9535863505921547e-05, "loss": -0.0044, "num_tokens": 32361203.0, "reward": 0.5087643265724182, "reward_std": 0.6488678455352783, "rewards/rollout_reward_func/mean": 0.5087643265724182, "rewards/rollout_reward_func/std": 0.6891509890556335, "sampling/importance_sampling_ratio/max": 1.1181309223175049, "sampling/importance_sampling_ratio/mean": 1.001587152481079, "sampling/importance_sampling_ratio/min": 0.9861111044883728, "sampling/sampling_logp_difference/max": 0.1116476058959961, "sampling/sampling_logp_difference/mean": 0.0019215377978980541, "step": 732, "step_time": 8.285048323996307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 476.75, "completions/mean_terminated_length": 476.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0907349162735045, "epoch": 0.01466, "frac_reward_zero_std": 0.0, "grad_norm": 0.07658729702234268, "kl": 0.83467648178339, "learning_rate": 1.947276068462916e-05, "loss": 0.0052, "num_tokens": 32407017.0, "reward": 0.7289876937866211, "reward_std": 0.48577845096588135, "rewards/rollout_reward_func/mean": 0.7289876937866211, "rewards/rollout_reward_func/std": 0.4770256280899048, "sampling/importance_sampling_ratio/max": 1.1250025033950806, "sampling/importance_sampling_ratio/mean": 1.0072472095489502, "sampling/importance_sampling_ratio/min": 0.9832702875137329, "sampling/sampling_logp_difference/max": 0.11776971817016602, "sampling/sampling_logp_difference/mean": 0.0023927686270326376, "step": 733, "step_time": 9.347801126996274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 436.3125, "completions/mean_terminated_length": 436.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09813729394227266, "epoch": 0.01468, "frac_reward_zero_std": 0.0, "grad_norm": 0.12756980955600739, "kl": 1.0505084320902824, "learning_rate": 1.9409980602014157e-05, "loss": 0.0086, "num_tokens": 32450313.0, "reward": 0.5408326983451843, "reward_std": 0.6556152105331421, "rewards/rollout_reward_func/mean": 0.5408326983451843, "rewards/rollout_reward_func/std": 0.6398099660873413, "sampling/importance_sampling_ratio/max": 1.1481716632843018, "sampling/importance_sampling_ratio/mean": 1.0001904964447021, "sampling/importance_sampling_ratio/min": 0.8955555558204651, "sampling/sampling_logp_difference/max": 0.11780905723571777, "sampling/sampling_logp_difference/mean": 0.0032973638735711575, "step": 734, "step_time": 8.562173935000828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 361.625, "completions/mean_terminated_length": 361.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08021703222766519, "epoch": 0.0147, "frac_reward_zero_std": 0.0, "grad_norm": 0.04077957943081856, "kl": 0.7893468663096428, "learning_rate": 1.9347524157501473e-05, "loss": 0.004, "num_tokens": 32490193.0, "reward": 0.7414623498916626, "reward_std": 0.564730703830719, "rewards/rollout_reward_func/mean": 0.7414623498916626, "rewards/rollout_reward_func/std": 0.5779544711112976, "sampling/importance_sampling_ratio/max": 1.1191946268081665, "sampling/importance_sampling_ratio/mean": 1.0074942111968994, "sampling/importance_sampling_ratio/min": 0.9862596988677979, "sampling/sampling_logp_difference/max": 0.11256098747253418, "sampling/sampling_logp_difference/mean": 0.0025833293329924345, "step": 735, "step_time": 8.30277832899992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 380.0, "completions/mean_terminated_length": 380.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09602109435945749, "epoch": 0.01472, "frac_reward_zero_std": 0.0, "grad_norm": 0.0855555608868599, "kl": 0.7164056226611137, "learning_rate": 1.928539224587942e-05, "loss": 0.0053, "num_tokens": 32531332.0, "reward": 0.5738741159439087, "reward_std": 0.6178956627845764, "rewards/rollout_reward_func/mean": 0.5738741159439087, "rewards/rollout_reward_func/std": 0.6333353519439697, "sampling/importance_sampling_ratio/max": 1.10468327999115, "sampling/importance_sampling_ratio/mean": 1.002160668373108, "sampling/importance_sampling_ratio/min": 0.8977099061012268, "sampling/sampling_logp_difference/max": 0.11108756065368652, "sampling/sampling_logp_difference/mean": 0.002734915353357792, "step": 736, "step_time": 8.41706357500152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 441.0, "completions/mean_terminated_length": 441.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08888082299381495, "epoch": 0.01474, "frac_reward_zero_std": 0.0, "grad_norm": 0.025110917165875435, "kl": 0.6310742422938347, "learning_rate": 1.9223585757286844e-05, "loss": -0.0049, "num_tokens": 32575990.0, "reward": 0.7401877045631409, "reward_std": 0.5595182776451111, "rewards/rollout_reward_func/mean": 0.7401877045631409, "rewards/rollout_reward_func/std": 0.5801507830619812, "sampling/importance_sampling_ratio/max": 1.0137715339660645, "sampling/importance_sampling_ratio/mean": 0.9957199096679688, "sampling/importance_sampling_ratio/min": 0.8934869170188904, "sampling/sampling_logp_difference/max": 0.11261677742004395, "sampling/sampling_logp_difference/mean": 0.001615843502804637, "step": 737, "step_time": 8.624930464999125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 298.34375, "completions/mean_terminated_length": 298.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09241548459976912, "epoch": 0.01476, "frac_reward_zero_std": 0.0, "grad_norm": 0.04056842252612114, "kl": 0.8998844921588898, "learning_rate": 1.9162105577200388e-05, "loss": 0.0102, "num_tokens": 32613784.0, "reward": 0.7053919434547424, "reward_std": 0.6013154983520508, "rewards/rollout_reward_func/mean": 0.7053919434547424, "rewards/rollout_reward_func/std": 0.5987849235534668, "sampling/importance_sampling_ratio/max": 1.123124361038208, "sampling/importance_sampling_ratio/mean": 1.0049917697906494, "sampling/importance_sampling_ratio/min": 0.9859681725502014, "sampling/sampling_logp_difference/max": 0.1161336898803711, "sampling/sampling_logp_difference/mean": 0.0017476406646892428, "step": 738, "step_time": 9.077357359996313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 387.28125, "completions/mean_terminated_length": 387.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09499595314264297, "epoch": 0.01478, "frac_reward_zero_std": 0.25, "grad_norm": 0.030170388519763947, "kl": 0.7503101173788309, "learning_rate": 1.9100952586421842e-05, "loss": -0.0003, "num_tokens": 32655474.0, "reward": 0.7397196292877197, "reward_std": 0.48197993636131287, "rewards/rollout_reward_func/mean": 0.7397196292877197, "rewards/rollout_reward_func/std": 0.5810235738754272, "sampling/importance_sampling_ratio/max": 1.0215189456939697, "sampling/importance_sampling_ratio/mean": 0.9951015710830688, "sampling/importance_sampling_ratio/min": 0.7985783815383911, "sampling/sampling_logp_difference/max": 0.21641135215759277, "sampling/sampling_logp_difference/mean": 0.0023513040505349636, "step": 739, "step_time": 8.226426647999688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 376.5, "completions/mean_terminated_length": 376.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09544621594250202, "epoch": 0.0148, "frac_reward_zero_std": 0.25, "grad_norm": 0.011210296303033829, "kl": 0.651635404676199, "learning_rate": 1.9040127661065408e-05, "loss": 0.0029, "num_tokens": 32696452.0, "reward": 0.7692926526069641, "reward_std": 0.41278785467147827, "rewards/rollout_reward_func/mean": 0.7692926526069641, "rewards/rollout_reward_func/std": 0.5103688836097717, "sampling/importance_sampling_ratio/max": 1.0186443328857422, "sampling/importance_sampling_ratio/mean": 0.9985110759735107, "sampling/importance_sampling_ratio/min": 0.9585080146789551, "sampling/sampling_logp_difference/max": 0.043509334325790405, "sampling/sampling_logp_difference/mean": 0.0013448480749502778, "step": 740, "step_time": 8.416539072000887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 399.59375, "completions/mean_terminated_length": 399.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09974350640550256, "epoch": 0.01482, "frac_reward_zero_std": 0.0, "grad_norm": 0.06070663034915924, "kl": 0.8644493967294693, "learning_rate": 1.897963167254532e-05, "loss": 0.0014, "num_tokens": 32738374.0, "reward": 0.7752887010574341, "reward_std": 0.5143888592720032, "rewards/rollout_reward_func/mean": 0.7752887010574341, "rewards/rollout_reward_func/std": 0.5600798726081848, "sampling/importance_sampling_ratio/max": 1.062543272972107, "sampling/importance_sampling_ratio/mean": 0.9995467662811279, "sampling/importance_sampling_ratio/min": 0.9037717580795288, "sampling/sampling_logp_difference/max": 0.09593713283538818, "sampling/sampling_logp_difference/mean": 0.0020475685596466064, "step": 741, "step_time": 8.331998584999383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 465.46875, "completions/mean_terminated_length": 465.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0828267321921885, "epoch": 0.01484, "frac_reward_zero_std": 0.25, "grad_norm": 0.05192435160279274, "kl": 0.8559789843857288, "learning_rate": 1.8919465487563198e-05, "loss": 0.0057, "num_tokens": 32783919.0, "reward": 0.6666743755340576, "reward_std": 0.43210893869400024, "rewards/rollout_reward_func/mean": 0.6666743755340576, "rewards/rollout_reward_func/std": 0.5632357001304626, "sampling/importance_sampling_ratio/max": 1.1208049058914185, "sampling/importance_sampling_ratio/mean": 1.0029311180114746, "sampling/importance_sampling_ratio/min": 0.8946593403816223, "sampling/sampling_logp_difference/max": 0.11405062675476074, "sampling/sampling_logp_difference/mean": 0.00249822111800313, "step": 742, "step_time": 8.281943782996677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 530.125, "completions/mean_terminated_length": 530.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.12024276796728373, "epoch": 0.01486, "frac_reward_zero_std": 0.0, "grad_norm": 0.07829420268535614, "kl": 0.6542666003806517, "learning_rate": 1.8859629968095718e-05, "loss": -0.0018, "num_tokens": 32832552.0, "reward": 0.707250714302063, "reward_std": 0.5701620578765869, "rewards/rollout_reward_func/mean": 0.707250714302063, "rewards/rollout_reward_func/std": 0.5946168899536133, "sampling/importance_sampling_ratio/max": 1.1275715827941895, "sampling/importance_sampling_ratio/mean": 1.0056637525558472, "sampling/importance_sampling_ratio/min": 0.9794052839279175, "sampling/sampling_logp_difference/max": 0.12006998062133789, "sampling/sampling_logp_difference/mean": 0.002105989959090948, "step": 743, "step_time": 9.090335199001856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 438.3125, "completions/mean_terminated_length": 438.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10752060869708657, "epoch": 0.01488, "frac_reward_zero_std": 0.0, "grad_norm": 0.08588694781064987, "kl": 0.7805885300040245, "learning_rate": 1.8800125971382258e-05, "loss": 0.0012, "num_tokens": 32877399.0, "reward": 0.6747429370880127, "reward_std": 0.6072078943252563, "rewards/rollout_reward_func/mean": 0.6747429370880127, "rewards/rollout_reward_func/std": 0.6051228642463684, "sampling/importance_sampling_ratio/max": 1.0089995861053467, "sampling/importance_sampling_ratio/mean": 0.9918121695518494, "sampling/importance_sampling_ratio/min": 0.8834794759750366, "sampling/sampling_logp_difference/max": 0.12389707565307617, "sampling/sampling_logp_difference/mean": 0.002446225378662348, "step": 744, "step_time": 7.991949689003377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 360.0, "completions/mean_terminated_length": 360.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09623634535819292, "epoch": 0.0149, "frac_reward_zero_std": 0.0, "grad_norm": 0.07036251574754715, "kl": 0.7665203362703323, "learning_rate": 1.8740954349912597e-05, "loss": 0.0048, "num_tokens": 32918730.0, "reward": 0.6458105444908142, "reward_std": 0.6139308214187622, "rewards/rollout_reward_func/mean": 0.6458105444908142, "rewards/rollout_reward_func/std": 0.6613067388534546, "sampling/importance_sampling_ratio/max": 1.0099236965179443, "sampling/importance_sampling_ratio/mean": 0.9932495355606079, "sampling/importance_sampling_ratio/min": 0.8866111636161804, "sampling/sampling_logp_difference/max": 0.12038064002990723, "sampling/sampling_logp_difference/mean": 0.0024995487183332443, "step": 745, "step_time": 8.124896743000136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 240.46875, "completions/mean_terminated_length": 240.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09343619318678975, "epoch": 0.01492, "frac_reward_zero_std": 0.25, "grad_norm": 0.08990854024887085, "kl": 1.444103367626667, "learning_rate": 1.8682115951414694e-05, "loss": 0.0005, "num_tokens": 32954001.0, "reward": 0.7175295352935791, "reward_std": 0.594281792640686, "rewards/rollout_reward_func/mean": 0.7175295352935791, "rewards/rollout_reward_func/std": 0.682815432548523, "sampling/importance_sampling_ratio/max": 1.020188331604004, "sampling/importance_sampling_ratio/mean": 0.9904094934463501, "sampling/importance_sampling_ratio/min": 0.883202850818634, "sampling/sampling_logp_difference/max": 0.12334632873535156, "sampling/sampling_logp_difference/mean": 0.003693469101563096, "step": 746, "step_time": 7.300660633000007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 288.125, "completions/mean_terminated_length": 288.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11494610272347927, "epoch": 0.01494, "frac_reward_zero_std": 0.0, "grad_norm": 0.1437385082244873, "kl": 0.9576815292239189, "learning_rate": 1.862361161884258e-05, "loss": -0.0055, "num_tokens": 32992690.0, "reward": 0.4929078221321106, "reward_std": 0.8246766328811646, "rewards/rollout_reward_func/mean": 0.4929078221321106, "rewards/rollout_reward_func/std": 0.805338978767395, "sampling/importance_sampling_ratio/max": 1.1250616312026978, "sampling/importance_sampling_ratio/mean": 1.0063464641571045, "sampling/importance_sampling_ratio/min": 0.9580234289169312, "sampling/sampling_logp_difference/max": 0.11787843704223633, "sampling/sampling_logp_difference/mean": 0.003232467919588089, "step": 747, "step_time": 8.475367981998716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 525.09375, "completions/mean_terminated_length": 525.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09767964296042919, "epoch": 0.01496, "frac_reward_zero_std": 0.0, "grad_norm": 0.03892698138952255, "kl": 0.68136877566576, "learning_rate": 1.8565442190364233e-05, "loss": 0.0092, "num_tokens": 33040041.0, "reward": 0.7311248779296875, "reward_std": 0.42470604181289673, "rewards/rollout_reward_func/mean": 0.7311248779296875, "rewards/rollout_reward_func/std": 0.4731757938861847, "sampling/importance_sampling_ratio/max": 1.1133439540863037, "sampling/importance_sampling_ratio/mean": 1.002157211303711, "sampling/importance_sampling_ratio/min": 0.9772976636886597, "sampling/sampling_logp_difference/max": 0.10727739334106445, "sampling/sampling_logp_difference/mean": 0.001790445763617754, "step": 748, "step_time": 8.264898270999765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 315.6875, "completions/mean_terminated_length": 315.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07725523924455047, "epoch": 0.01498, "frac_reward_zero_std": 0.0, "grad_norm": 0.017491115257143974, "kl": 1.0336815044283867, "learning_rate": 1.8507608499349616e-05, "loss": 0.0007, "num_tokens": 33078873.0, "reward": 0.7328000068664551, "reward_std": 0.5140470266342163, "rewards/rollout_reward_func/mean": 0.7328000068664551, "rewards/rollout_reward_func/std": 0.5345086455345154, "sampling/importance_sampling_ratio/max": 1.1244546175003052, "sampling/importance_sampling_ratio/mean": 0.9997423887252808, "sampling/importance_sampling_ratio/min": 0.8949574828147888, "sampling/sampling_logp_difference/max": 0.11728477478027344, "sampling/sampling_logp_difference/mean": 0.0027207592502236366, "step": 749, "step_time": 7.840617944999394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 328.65625, "completions/mean_terminated_length": 328.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10419935220852494, "epoch": 0.015, "frac_reward_zero_std": 0.0, "grad_norm": 0.1150776743888855, "kl": 0.8035344332456589, "learning_rate": 1.8450111374358705e-05, "loss": -0.0073, "num_tokens": 33119135.0, "reward": 0.6163073778152466, "reward_std": 0.7314113974571228, "rewards/rollout_reward_func/mean": 0.6163073778152466, "rewards/rollout_reward_func/std": 0.7146843075752258, "sampling/importance_sampling_ratio/max": 1.0141024589538574, "sampling/importance_sampling_ratio/mean": 0.9945303201675415, "sampling/importance_sampling_ratio/min": 0.8939104080200195, "sampling/sampling_logp_difference/max": 0.11215734481811523, "sampling/sampling_logp_difference/mean": 0.002812393009662628, "step": 750, "step_time": 8.04099983699598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 500.9375, "completions/mean_terminated_length": 500.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08399426005780697, "epoch": 0.01502, "frac_reward_zero_std": 0.25, "grad_norm": 0.0150993000715971, "kl": 0.7952219247817993, "learning_rate": 1.839295163912964e-05, "loss": 0.0081, "num_tokens": 33165578.0, "reward": 0.7295859456062317, "reward_std": 0.3800237476825714, "rewards/rollout_reward_func/mean": 0.7295859456062317, "rewards/rollout_reward_func/std": 0.47587135434150696, "sampling/importance_sampling_ratio/max": 1.0138777494430542, "sampling/importance_sampling_ratio/mean": 1.0000288486480713, "sampling/importance_sampling_ratio/min": 0.9754276871681213, "sampling/sampling_logp_difference/max": 0.024601995944976807, "sampling/sampling_logp_difference/mean": 0.0010474550072103739, "step": 751, "step_time": 8.238704925999627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 436.25, "completions/mean_terminated_length": 436.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09118667989969254, "epoch": 0.01504, "frac_reward_zero_std": 0.0, "grad_norm": 0.034711241722106934, "kl": 0.7379360646009445, "learning_rate": 1.83361301125669e-05, "loss": 0.0021, "num_tokens": 33208980.0, "reward": 0.735044002532959, "reward_std": 0.5191532373428345, "rewards/rollout_reward_func/mean": 0.735044002532959, "rewards/rollout_reward_func/std": 0.5312781929969788, "sampling/importance_sampling_ratio/max": 1.0099191665649414, "sampling/importance_sampling_ratio/mean": 0.9992231130599976, "sampling/importance_sampling_ratio/min": 0.9832984209060669, "sampling/sampling_logp_difference/max": 0.017640545964241028, "sampling/sampling_logp_difference/mean": 0.0007389874663203955, "step": 752, "step_time": 9.111880312997528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 386.375, "completions/mean_terminated_length": 386.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07705993298441172, "epoch": 0.01506, "frac_reward_zero_std": 0.25, "grad_norm": 0.027272578328847885, "kl": 0.6265348293236457, "learning_rate": 1.827964760872959e-05, "loss": 0.0023, "num_tokens": 33251244.0, "reward": 0.8977458477020264, "reward_std": 0.289218544960022, "rewards/rollout_reward_func/mean": 0.8977458477020264, "rewards/rollout_reward_func/std": 0.323042631149292, "sampling/importance_sampling_ratio/max": 1.1310611963272095, "sampling/importance_sampling_ratio/mean": 0.9930012226104736, "sampling/importance_sampling_ratio/min": 0.8130782246589661, "sampling/sampling_logp_difference/max": 0.20693349838256836, "sampling/sampling_logp_difference/mean": 0.003950279206037521, "step": 753, "step_time": 7.991594638999231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 388.90625, "completions/mean_terminated_length": 388.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07147897500544786, "epoch": 0.01508, "frac_reward_zero_std": 0.25, "grad_norm": 0.02589552104473114, "kl": 0.9660728946328163, "learning_rate": 1.8223504936819792e-05, "loss": 0.008, "num_tokens": 33293119.0, "reward": 0.7350316643714905, "reward_std": 0.4517788887023926, "rewards/rollout_reward_func/mean": 0.7350316643714905, "rewards/rollout_reward_func/std": 0.5294991731643677, "sampling/importance_sampling_ratio/max": 1.010696530342102, "sampling/importance_sampling_ratio/mean": 0.9947588443756104, "sampling/importance_sampling_ratio/min": 0.888946533203125, "sampling/sampling_logp_difference/max": 0.11772418022155762, "sampling/sampling_logp_difference/mean": 0.0017489364836364985, "step": 754, "step_time": 7.824269685001127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 427.375, "completions/mean_terminated_length": 427.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10135653056204319, "epoch": 0.0151, "frac_reward_zero_std": 0.0, "grad_norm": 0.07843118906021118, "kl": 1.024379387497902, "learning_rate": 1.816770290117092e-05, "loss": -0.0024, "num_tokens": 33337778.0, "reward": 0.6715118885040283, "reward_std": 0.6034761667251587, "rewards/rollout_reward_func/mean": 0.6715118885040283, "rewards/rollout_reward_func/std": 0.610720694065094, "sampling/importance_sampling_ratio/max": 1.0514419078826904, "sampling/importance_sampling_ratio/mean": 0.9970581531524658, "sampling/importance_sampling_ratio/min": 0.890038251876831, "sampling/sampling_logp_difference/max": 0.11650991439819336, "sampling/sampling_logp_difference/mean": 0.002055360237136483, "step": 755, "step_time": 8.088731603005726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 330.78125, "completions/mean_terminated_length": 330.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07746587228029966, "epoch": 0.01512, "frac_reward_zero_std": 0.0, "grad_norm": 0.0507831797003746, "kl": 0.7887879498302937, "learning_rate": 1.8112242301236254e-05, "loss": 0.0116, "num_tokens": 33376775.0, "reward": 0.5776767730712891, "reward_std": 0.575559139251709, "rewards/rollout_reward_func/mean": 0.5776767730712891, "rewards/rollout_reward_func/std": 0.6793192625045776, "sampling/importance_sampling_ratio/max": 1.0123039484024048, "sampling/importance_sampling_ratio/mean": 0.9988201856613159, "sampling/importance_sampling_ratio/min": 0.987777829170227, "sampling/sampling_logp_difference/max": 0.012307040393352509, "sampling/sampling_logp_difference/mean": 0.0008683096384629607, "step": 756, "step_time": 7.860303558998567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 457.15625, "completions/mean_terminated_length": 457.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08264220505952835, "epoch": 0.01514, "frac_reward_zero_std": 0.0, "grad_norm": 0.07934325188398361, "kl": 0.7165999822318554, "learning_rate": 1.805712393157745e-05, "loss": -0.0042, "num_tokens": 33422585.0, "reward": 0.7342089414596558, "reward_std": 0.5112618207931519, "rewards/rollout_reward_func/mean": 0.7342089414596558, "rewards/rollout_reward_func/std": 0.5310125946998596, "sampling/importance_sampling_ratio/max": 1.1190756559371948, "sampling/importance_sampling_ratio/mean": 1.0006577968597412, "sampling/importance_sampling_ratio/min": 0.9241052865982056, "sampling/sampling_logp_difference/max": 0.1125040054321289, "sampling/sampling_logp_difference/mean": 0.002487227553501725, "step": 757, "step_time": 8.372381385999688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 408.8125, "completions/mean_terminated_length": 408.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09278180543333292, "epoch": 0.01516, "frac_reward_zero_std": 0.0, "grad_norm": 0.03642945736646652, "kl": 0.5847600041888654, "learning_rate": 1.8002348581853176e-05, "loss": -0.0095, "num_tokens": 33466823.0, "reward": 0.5747978687286377, "reward_std": 0.6666852235794067, "rewards/rollout_reward_func/mean": 0.5747978687286377, "rewards/rollout_reward_func/std": 0.6833681464195251, "sampling/importance_sampling_ratio/max": 1.106086015701294, "sampling/importance_sampling_ratio/mean": 1.0065302848815918, "sampling/importance_sampling_ratio/min": 0.9923562407493591, "sampling/sampling_logp_difference/max": 0.09935283660888672, "sampling/sampling_logp_difference/mean": 0.0020159161649644375, "step": 758, "step_time": 7.991257948004204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 494.59375, "completions/mean_terminated_length": 494.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09997318685054779, "epoch": 0.01518, "frac_reward_zero_std": 0.0, "grad_norm": 0.060292962938547134, "kl": 0.8468060083687305, "learning_rate": 1.794791703680778e-05, "loss": -0.0079, "num_tokens": 33513675.0, "reward": 0.6670295000076294, "reward_std": 0.5659693479537964, "rewards/rollout_reward_func/mean": 0.6670295000076294, "rewards/rollout_reward_func/std": 0.5628648996353149, "sampling/importance_sampling_ratio/max": 1.1105235815048218, "sampling/importance_sampling_ratio/mean": 1.00018310546875, "sampling/importance_sampling_ratio/min": 0.8893080353736877, "sampling/sampling_logp_difference/max": 0.11731481552124023, "sampling/sampling_logp_difference/mean": 0.0037885704077780247, "step": 759, "step_time": 8.258365493002202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 449.78125, "completions/mean_terminated_length": 449.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08146755071356893, "epoch": 0.0152, "frac_reward_zero_std": 0.0, "grad_norm": 0.0575830303132534, "kl": 0.9958676546812057, "learning_rate": 1.789383007626009e-05, "loss": 0.0027, "num_tokens": 33558863.0, "reward": 0.6404991149902344, "reward_std": 0.626112163066864, "rewards/rollout_reward_func/mean": 0.6404991149902344, "rewards/rollout_reward_func/std": 0.6170802712440491, "sampling/importance_sampling_ratio/max": 1.0124326944351196, "sampling/importance_sampling_ratio/mean": 0.9985641837120056, "sampling/importance_sampling_ratio/min": 0.9680770635604858, "sampling/sampling_logp_difference/max": 0.0314880907535553, "sampling/sampling_logp_difference/mean": 0.0011035484494641423, "step": 760, "step_time": 8.146460957999807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 446.3125, "completions/mean_terminated_length": 446.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07379373954609036, "epoch": 0.01522, "frac_reward_zero_std": 0.25, "grad_norm": 0.062438834458589554, "kl": 0.5660603754222393, "learning_rate": 1.784008847509216e-05, "loss": 0.0055, "num_tokens": 33603130.0, "reward": 0.8036329746246338, "reward_std": 0.406211793422699, "rewards/rollout_reward_func/mean": 0.8036329746246338, "rewards/rollout_reward_func/std": 0.4850512444972992, "sampling/importance_sampling_ratio/max": 1.0193474292755127, "sampling/importance_sampling_ratio/mean": 1.0012242794036865, "sampling/importance_sampling_ratio/min": 0.9862703680992126, "sampling/sampling_logp_difference/max": 0.02058929204940796, "sampling/sampling_logp_difference/mean": 0.0009337689843960106, "step": 761, "step_time": 8.094910053001513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 436.25, "completions/mean_terminated_length": 436.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07941654277965426, "epoch": 0.01524, "frac_reward_zero_std": 0.25, "grad_norm": 0.009089902974665165, "kl": 0.7331058979034424, "learning_rate": 1.7786693003238266e-05, "loss": 0.0028, "num_tokens": 33647239.0, "reward": 0.7960431575775146, "reward_std": 0.37768635153770447, "rewards/rollout_reward_func/mean": 0.7960431575775146, "rewards/rollout_reward_func/std": 0.4314172863960266, "sampling/importance_sampling_ratio/max": 1.0198122262954712, "sampling/importance_sampling_ratio/mean": 0.9985195994377136, "sampling/importance_sampling_ratio/min": 0.8966137766838074, "sampling/sampling_logp_difference/max": 0.10643863677978516, "sampling/sampling_logp_difference/mean": 0.0014669925440102816, "step": 762, "step_time": 9.135317331998522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 391.09375, "completions/mean_terminated_length": 391.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08402724657207727, "epoch": 0.01526, "frac_reward_zero_std": 0.0, "grad_norm": 0.058945078402757645, "kl": 1.0574384815990925, "learning_rate": 1.7733644425673805e-05, "loss": -0.0017, "num_tokens": 33690304.0, "reward": 0.6052941083908081, "reward_std": 0.6359267234802246, "rewards/rollout_reward_func/mean": 0.6052941083908081, "rewards/rollout_reward_func/std": 0.6289322972297668, "sampling/importance_sampling_ratio/max": 1.0159573554992676, "sampling/importance_sampling_ratio/mean": 0.9981439113616943, "sampling/importance_sampling_ratio/min": 0.8971946835517883, "sampling/sampling_logp_difference/max": 0.10840582847595215, "sampling/sampling_logp_difference/mean": 0.0016164081171154976, "step": 763, "step_time": 8.018724090001342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 308.6875, "completions/mean_terminated_length": 308.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06962606671731919, "epoch": 0.01528, "frac_reward_zero_std": 0.0, "grad_norm": 0.020955312997102737, "kl": 1.039126549672801, "learning_rate": 1.768094350240437e-05, "loss": 0.0009, "num_tokens": 33729880.0, "reward": 0.6653138399124146, "reward_std": 0.5666763186454773, "rewards/rollout_reward_func/mean": 0.6653138399124146, "rewards/rollout_reward_func/std": 0.5651322603225708, "sampling/importance_sampling_ratio/max": 1.0455694198608398, "sampling/importance_sampling_ratio/mean": 1.002254605293274, "sampling/importance_sampling_ratio/min": 0.9734240174293518, "sampling/sampling_logp_difference/max": 0.04459899663925171, "sampling/sampling_logp_difference/mean": 0.001515584415756166, "step": 764, "step_time": 7.816642822001086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 444.28125, "completions/mean_terminated_length": 444.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.061393054027576, "epoch": 0.0153, "frac_reward_zero_std": 0.0, "grad_norm": 0.053427327424287796, "kl": 0.9420679844915867, "learning_rate": 1.7628590988454834e-05, "loss": 0.0071, "num_tokens": 33775224.0, "reward": 0.7969150543212891, "reward_std": 0.4429575204849243, "rewards/rollout_reward_func/mean": 0.7969150543212891, "rewards/rollout_reward_func/std": 0.42956292629241943, "sampling/importance_sampling_ratio/max": 1.01096510887146, "sampling/importance_sampling_ratio/mean": 1.0002098083496094, "sampling/importance_sampling_ratio/min": 0.989949107170105, "sampling/sampling_logp_difference/max": 0.012553848326206207, "sampling/sampling_logp_difference/mean": 0.0006633107550442219, "step": 765, "step_time": 8.087547672001165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 324.1875, "completions/mean_terminated_length": 324.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05806259601376951, "epoch": 0.01532, "frac_reward_zero_std": 0.25, "grad_norm": 0.11330641061067581, "kl": 0.9271285720169544, "learning_rate": 1.7576587633858575e-05, "loss": 0.003, "num_tokens": 33814660.0, "reward": 0.7950878143310547, "reward_std": 0.3649955689907074, "rewards/rollout_reward_func/mean": 0.7950878143310547, "rewards/rollout_reward_func/std": 0.43352386355400085, "sampling/importance_sampling_ratio/max": 1.123925805091858, "sampling/importance_sampling_ratio/mean": 0.9999889731407166, "sampling/importance_sampling_ratio/min": 0.8900832533836365, "sampling/sampling_logp_difference/max": 0.11692547798156738, "sampling/sampling_logp_difference/mean": 0.002412480767816305, "step": 766, "step_time": 7.838969225998881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 393.96875, "completions/mean_terminated_length": 393.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06478302576579154, "epoch": 0.01534, "frac_reward_zero_std": 0.25, "grad_norm": 0.0299687497317791, "kl": 0.9800536930561066, "learning_rate": 1.752493418364669e-05, "loss": -0.0002, "num_tokens": 33857258.0, "reward": 0.830620288848877, "reward_std": 0.34638532996177673, "rewards/rollout_reward_func/mean": 0.830620288848877, "rewards/rollout_reward_func/std": 0.399961918592453, "sampling/importance_sampling_ratio/max": 1.1140685081481934, "sampling/importance_sampling_ratio/mean": 1.0026270151138306, "sampling/importance_sampling_ratio/min": 0.9534787535667419, "sampling/sampling_logp_difference/max": 0.10361886024475098, "sampling/sampling_logp_difference/mean": 0.0016611726023256779, "step": 767, "step_time": 8.852820739999515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 385.75, "completions/mean_terminated_length": 385.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0667042201384902, "epoch": 0.01536, "frac_reward_zero_std": 0.0, "grad_norm": 0.048508934676647186, "kl": 0.8169445507228374, "learning_rate": 1.7473631377837325e-05, "loss": 0.0031, "num_tokens": 33898802.0, "reward": 0.6320017576217651, "reward_std": 0.5941572189331055, "rewards/rollout_reward_func/mean": 0.6320017576217651, "rewards/rollout_reward_func/std": 0.5745890140533447, "sampling/importance_sampling_ratio/max": 1.1222001314163208, "sampling/importance_sampling_ratio/mean": 1.0000264644622803, "sampling/importance_sampling_ratio/min": 0.8889807462692261, "sampling/sampling_logp_difference/max": 0.1152658462524414, "sampling/sampling_logp_difference/mean": 0.0023063390981405973, "step": 768, "step_time": 8.039976960999411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 504.5625, "completions/mean_terminated_length": 504.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05532364780083299, "epoch": 0.01538, "frac_reward_zero_std": 0.25, "grad_norm": 0.022359469905495644, "kl": 0.6614800058305264, "learning_rate": 1.7422679951425124e-05, "loss": 0.0056, "num_tokens": 33945644.0, "reward": 0.762317419052124, "reward_std": 0.3672657608985901, "rewards/rollout_reward_func/mean": 0.762317419052124, "rewards/rollout_reward_func/std": 0.4564096927642822, "sampling/importance_sampling_ratio/max": 1.010849952697754, "sampling/importance_sampling_ratio/mean": 0.9960411787033081, "sampling/importance_sampling_ratio/min": 0.8905414342880249, "sampling/sampling_logp_difference/max": 0.11592721939086914, "sampling/sampling_logp_difference/mean": 0.001359292189590633, "step": 769, "step_time": 8.138840421999703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 435.34375, "completions/mean_terminated_length": 435.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.060995446518063545, "epoch": 0.0154, "frac_reward_zero_std": 0.25, "grad_norm": 0.06605660170316696, "kl": 0.7089426033198833, "learning_rate": 1.737208063437062e-05, "loss": 0.0052, "num_tokens": 33989358.0, "reward": 0.7965446710586548, "reward_std": 0.3767530918121338, "rewards/rollout_reward_func/mean": 0.7965446710586548, "rewards/rollout_reward_func/std": 0.4303547143936157, "sampling/importance_sampling_ratio/max": 1.0176151990890503, "sampling/importance_sampling_ratio/mean": 0.9999148845672607, "sampling/importance_sampling_ratio/min": 0.9903237819671631, "sampling/sampling_logp_difference/max": 0.017456427216529846, "sampling/sampling_logp_difference/mean": 0.0005602091550827026, "step": 770, "step_time": 8.084775312001511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 400.96875, "completions/mean_terminated_length": 400.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06476705754175782, "epoch": 0.01542, "frac_reward_zero_std": 0.25, "grad_norm": 0.013997037895023823, "kl": 0.6609172821044922, "learning_rate": 1.732183415158983e-05, "loss": 0.0001, "num_tokens": 34032116.0, "reward": 0.7075601816177368, "reward_std": 0.5029250383377075, "rewards/rollout_reward_func/mean": 0.7075601816177368, "rewards/rollout_reward_func/std": 0.5940497517585754, "sampling/importance_sampling_ratio/max": 1.017808198928833, "sampling/importance_sampling_ratio/mean": 0.9986888766288757, "sampling/importance_sampling_ratio/min": 0.9880044460296631, "sampling/sampling_logp_difference/max": 0.016351550817489624, "sampling/sampling_logp_difference/mean": 0.0007636506343260407, "step": 771, "step_time": 8.328851510003005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 376.625, "completions/mean_terminated_length": 376.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07649446767754853, "epoch": 0.01544, "frac_reward_zero_std": 0.0, "grad_norm": 0.017648104578256607, "kl": 0.7815244942903519, "learning_rate": 1.7271941222943878e-05, "loss": 0.0032, "num_tokens": 34073731.0, "reward": 0.6739544868469238, "reward_std": 0.5606675744056702, "rewards/rollout_reward_func/mean": 0.6739544868469238, "rewards/rollout_reward_func/std": 0.6089154481887817, "sampling/importance_sampling_ratio/max": 1.015665888786316, "sampling/importance_sampling_ratio/mean": 0.9967939853668213, "sampling/importance_sampling_ratio/min": 0.8897490501403809, "sampling/sampling_logp_difference/max": 0.11679410934448242, "sampling/sampling_logp_difference/mean": 0.00166688475292176, "step": 772, "step_time": 9.11596239499886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 435.25, "completions/mean_terminated_length": 435.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06091142166405916, "epoch": 0.01546, "frac_reward_zero_std": 0.0, "grad_norm": 0.06099911779165268, "kl": 0.8555112965404987, "learning_rate": 1.7222402563228617e-05, "loss": 0.0161, "num_tokens": 34117269.0, "reward": 0.6015176773071289, "reward_std": 0.5945396423339844, "rewards/rollout_reward_func/mean": 0.6015176773071289, "rewards/rollout_reward_func/std": 0.579899251461029, "sampling/importance_sampling_ratio/max": 1.0093735456466675, "sampling/importance_sampling_ratio/mean": 0.9994684457778931, "sampling/importance_sampling_ratio/min": 0.9907811284065247, "sampling/sampling_logp_difference/max": 0.011200875043869019, "sampling/sampling_logp_difference/mean": 0.0007386314100585878, "step": 773, "step_time": 8.282531844002733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 481.25, "completions/mean_terminated_length": 481.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.053107390413060784, "epoch": 0.01548, "frac_reward_zero_std": 0.0, "grad_norm": 0.01913008652627468, "kl": 0.6781502477824688, "learning_rate": 1.7173218882164475e-05, "loss": 0.0058, "num_tokens": 34163660.0, "reward": 0.6632349491119385, "reward_std": 0.5034691095352173, "rewards/rollout_reward_func/mean": 0.6632349491119385, "rewards/rollout_reward_func/std": 0.5076708793640137, "sampling/importance_sampling_ratio/max": 1.0083773136138916, "sampling/importance_sampling_ratio/mean": 0.9960179924964905, "sampling/importance_sampling_ratio/min": 0.8877311944961548, "sampling/sampling_logp_difference/max": 0.11910629272460938, "sampling/sampling_logp_difference/mean": 0.001276570837944746, "step": 774, "step_time": 8.220307621002576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 382.53125, "completions/mean_terminated_length": 382.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04948446317575872, "epoch": 0.0155, "frac_reward_zero_std": 0.0, "grad_norm": 0.018215175718069077, "kl": 0.8165777772665024, "learning_rate": 1.712439088438622e-05, "loss": 0.0071, "num_tokens": 34204682.0, "reward": 0.6414945125579834, "reward_std": 0.6100935935974121, "rewards/rollout_reward_func/mean": 0.6414945125579834, "rewards/rollout_reward_func/std": 0.6144280433654785, "sampling/importance_sampling_ratio/max": 1.002885341644287, "sampling/importance_sampling_ratio/mean": 0.9990237951278687, "sampling/importance_sampling_ratio/min": 0.9930909276008606, "sampling/sampling_logp_difference/max": 0.011404678225517273, "sampling/sampling_logp_difference/mean": 0.0004229692567605525, "step": 775, "step_time": 7.932601034997788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 345.9375, "completions/mean_terminated_length": 345.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06247577094472945, "epoch": 0.01552, "frac_reward_zero_std": 0.0, "grad_norm": 0.03096099942922592, "kl": 0.8267674315720797, "learning_rate": 1.7075919269432895e-05, "loss": 0.0021, "num_tokens": 34245184.0, "reward": 0.608827531337738, "reward_std": 0.6446946859359741, "rewards/rollout_reward_func/mean": 0.608827531337738, "rewards/rollout_reward_func/std": 0.6747926473617554, "sampling/importance_sampling_ratio/max": 1.069825530052185, "sampling/importance_sampling_ratio/mean": 1.0009210109710693, "sampling/importance_sampling_ratio/min": 0.9682822227478027, "sampling/sampling_logp_difference/max": 0.06747782230377197, "sampling/sampling_logp_difference/mean": 0.001130637596361339, "step": 776, "step_time": 8.252911868003139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 408.28125, "completions/mean_terminated_length": 408.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04151812847703695, "epoch": 0.01554, "frac_reward_zero_std": 0.25, "grad_norm": 0.0013982332311570644, "kl": 0.6866093641147017, "learning_rate": 1.7027804731737788e-05, "loss": 0.0036, "num_tokens": 34287971.0, "reward": 0.8290935754776001, "reward_std": 0.34937673807144165, "rewards/rollout_reward_func/mean": 0.8290935754776001, "rewards/rollout_reward_func/std": 0.4035678803920746, "sampling/importance_sampling_ratio/max": 1.0073940753936768, "sampling/importance_sampling_ratio/mean": 0.9993148446083069, "sampling/importance_sampling_ratio/min": 0.9767596125602722, "sampling/sampling_logp_difference/max": 0.02353835105895996, "sampling/sampling_logp_difference/mean": 0.000528045347891748, "step": 777, "step_time": 8.582545478000611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 450.0625, "completions/mean_terminated_length": 450.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04753253859234974, "epoch": 0.01556, "frac_reward_zero_std": 0.0, "grad_norm": 0.042555633932352066, "kl": 0.7577363476157188, "learning_rate": 1.698004796061849e-05, "loss": 0.0109, "num_tokens": 34332512.0, "reward": 0.763343334197998, "reward_std": 0.4711299240589142, "rewards/rollout_reward_func/mean": 0.763343334197998, "rewards/rollout_reward_func/std": 0.4544086456298828, "sampling/importance_sampling_ratio/max": 1.1246798038482666, "sampling/importance_sampling_ratio/mean": 1.0042691230773926, "sampling/importance_sampling_ratio/min": 0.9846996068954468, "sampling/sampling_logp_difference/max": 0.11749029159545898, "sampling/sampling_logp_difference/mean": 0.001401805318892002, "step": 778, "step_time": 8.363974392997989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 514.0625, "completions/mean_terminated_length": 514.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05557265831157565, "epoch": 0.01558, "frac_reward_zero_std": 0.25, "grad_norm": 0.010465836152434349, "kl": 0.7719097741064616, "learning_rate": 1.6932649640267003e-05, "loss": 0.0023, "num_tokens": 34380815.0, "reward": 0.701444149017334, "reward_std": 0.43184179067611694, "rewards/rollout_reward_func/mean": 0.701444149017334, "rewards/rollout_reward_func/std": 0.5474402904510498, "sampling/importance_sampling_ratio/max": 1.1279658079147339, "sampling/importance_sampling_ratio/mean": 1.0045151710510254, "sampling/importance_sampling_ratio/min": 0.9920402765274048, "sampling/sampling_logp_difference/max": 0.12041544914245605, "sampling/sampling_logp_difference/mean": 0.0014102304121479392, "step": 779, "step_time": 8.302886846000547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 358.5625, "completions/mean_terminated_length": 358.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06377437943592668, "epoch": 0.0156, "frac_reward_zero_std": 0.0, "grad_norm": 0.05676717311143875, "kl": 0.8316807905212045, "learning_rate": 1.6885610449739964e-05, "loss": -0.0016, "num_tokens": 34422049.0, "reward": 0.6363032460212708, "reward_std": 0.5726150870323181, "rewards/rollout_reward_func/mean": 0.6363032460212708, "rewards/rollout_reward_func/std": 0.6246985197067261, "sampling/importance_sampling_ratio/max": 1.010441780090332, "sampling/importance_sampling_ratio/mean": 0.9955865740776062, "sampling/importance_sampling_ratio/min": 0.8892061710357666, "sampling/sampling_logp_difference/max": 0.117156982421875, "sampling/sampling_logp_difference/mean": 0.001790159847587347, "step": 780, "step_time": 7.937529071996323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 384.28125, "completions/mean_terminated_length": 384.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05013401195174083, "epoch": 0.01562, "frac_reward_zero_std": 0.25, "grad_norm": 0.032154228538274765, "kl": 0.9933641403913498, "learning_rate": 1.6838931062948904e-05, "loss": 0.0071, "num_tokens": 34463311.0, "reward": 0.8365590572357178, "reward_std": 0.396314412355423, "rewards/rollout_reward_func/mean": 0.8365590572357178, "rewards/rollout_reward_func/std": 0.4620205760002136, "sampling/importance_sampling_ratio/max": 1.1267611980438232, "sampling/importance_sampling_ratio/mean": 1.0049428939819336, "sampling/importance_sampling_ratio/min": 0.9921354055404663, "sampling/sampling_logp_difference/max": 0.11934924125671387, "sampling/sampling_logp_difference/mean": 0.0015630170237272978, "step": 781, "step_time": 8.333016112999758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 419.71875, "completions/mean_terminated_length": 419.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0473555147764273, "epoch": 0.01564, "frac_reward_zero_std": 0.0, "grad_norm": 0.01228280644863844, "kl": 0.8163771480321884, "learning_rate": 1.6792612148650557e-05, "loss": 0.0085, "num_tokens": 34505949.0, "reward": 0.7032076716423035, "reward_std": 0.5031043291091919, "rewards/rollout_reward_func/mean": 0.7032076716423035, "rewards/rollout_reward_func/std": 0.5436233282089233, "sampling/importance_sampling_ratio/max": 1.0078779458999634, "sampling/importance_sampling_ratio/mean": 0.998394250869751, "sampling/importance_sampling_ratio/min": 0.9759617447853088, "sampling/sampling_logp_difference/max": 0.025487512350082397, "sampling/sampling_logp_difference/mean": 0.0007662124698981643, "step": 782, "step_time": 8.421324782997544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 489.9375, "completions/mean_terminated_length": 489.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08625463861972094, "epoch": 0.01566, "frac_reward_zero_std": 0.0, "grad_norm": 0.04707939922809601, "kl": 0.754468428902328, "learning_rate": 1.674665437043736e-05, "loss": -0.0007, "num_tokens": 34553194.0, "reward": 0.4010940194129944, "reward_std": 0.6662822365760803, "rewards/rollout_reward_func/mean": 0.4010940194129944, "rewards/rollout_reward_func/std": 0.643752932548523, "sampling/importance_sampling_ratio/max": 1.1249622106552124, "sampling/importance_sampling_ratio/mean": 1.0016181468963623, "sampling/importance_sampling_ratio/min": 0.9346106648445129, "sampling/sampling_logp_difference/max": 0.11774516105651855, "sampling/sampling_logp_difference/mean": 0.0019760315772145987, "step": 783, "step_time": 8.1507631759996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 341.5625, "completions/mean_terminated_length": 341.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.061994995223358274, "epoch": 0.01568, "frac_reward_zero_std": 0.0, "grad_norm": 0.019221363589167595, "kl": 0.7332786619663239, "learning_rate": 1.6701058386727868e-05, "loss": -0.0092, "num_tokens": 34594071.0, "reward": 0.5769693851470947, "reward_std": 0.6962371468544006, "rewards/rollout_reward_func/mean": 0.5769693851470947, "rewards/rollout_reward_func/std": 0.6775482296943665, "sampling/importance_sampling_ratio/max": 1.124122142791748, "sampling/importance_sampling_ratio/mean": 1.0035004615783691, "sampling/importance_sampling_ratio/min": 0.9794101119041443, "sampling/sampling_logp_difference/max": 0.11698222160339355, "sampling/sampling_logp_difference/mean": 0.001597935683093965, "step": 784, "step_time": 7.973482416004117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 509.5, "completions/mean_terminated_length": 509.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07298826007172465, "epoch": 0.0157, "frac_reward_zero_std": 0.0, "grad_norm": 0.03277268633246422, "kl": 0.8591923452913761, "learning_rate": 1.6655824850757368e-05, "loss": 0.0019, "num_tokens": 34641389.0, "reward": 0.735480546951294, "reward_std": 0.5507925152778625, "rewards/rollout_reward_func/mean": 0.735480546951294, "rewards/rollout_reward_func/std": 0.5304496884346008, "sampling/importance_sampling_ratio/max": 1.1203761100769043, "sampling/importance_sampling_ratio/mean": 1.0046133995056152, "sampling/importance_sampling_ratio/min": 0.9804285168647766, "sampling/sampling_logp_difference/max": 0.11072206497192383, "sampling/sampling_logp_difference/mean": 0.0016187557484954596, "step": 785, "step_time": 8.204157931000736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 411.625, "completions/mean_terminated_length": 411.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.060215474339202046, "epoch": 0.01572, "frac_reward_zero_std": 0.25, "grad_norm": 0.05137073993682861, "kl": 0.7205069568008184, "learning_rate": 1.661095441056849e-05, "loss": 0.0032, "num_tokens": 34685968.0, "reward": 0.8703435659408569, "reward_std": 0.3667238652706146, "rewards/rollout_reward_func/mean": 0.8703435659408569, "rewards/rollout_reward_func/std": 0.4312556982040405, "sampling/importance_sampling_ratio/max": 1.008284330368042, "sampling/importance_sampling_ratio/mean": 1.0007961988449097, "sampling/importance_sampling_ratio/min": 0.9917957782745361, "sampling/sampling_logp_difference/max": 0.008240774273872375, "sampling/sampling_logp_difference/mean": 0.0004489162238314748, "step": 786, "step_time": 8.36963431699769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 454.59375, "completions/mean_terminated_length": 454.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05391461402177811, "epoch": 0.01574, "frac_reward_zero_std": 0.25, "grad_norm": 0.013602036982774734, "kl": 0.6341364160180092, "learning_rate": 1.656644770900194e-05, "loss": 0.0085, "num_tokens": 34730905.0, "reward": 0.7624059915542603, "reward_std": 0.37680771946907043, "rewards/rollout_reward_func/mean": 0.7624059915542603, "rewards/rollout_reward_func/std": 0.4562264084815979, "sampling/importance_sampling_ratio/max": 1.0078788995742798, "sampling/importance_sampling_ratio/mean": 0.9942229986190796, "sampling/importance_sampling_ratio/min": 0.8910136818885803, "sampling/sampling_logp_difference/max": 0.11662745475769043, "sampling/sampling_logp_difference/mean": 0.0016141172964125872, "step": 787, "step_time": 8.89533756199853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 422.59375, "completions/mean_terminated_length": 422.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05167287704534829, "epoch": 0.01576, "frac_reward_zero_std": 0.5, "grad_norm": 0.003913382068276405, "kl": 0.7901235949248075, "learning_rate": 1.65223053836873e-05, "loss": -0.0001, "num_tokens": 34774454.0, "reward": 0.9315446615219116, "reward_std": 0.19362089037895203, "rewards/rollout_reward_func/mean": 0.9315446615219116, "rewards/rollout_reward_func/std": 0.2694025933742523, "sampling/importance_sampling_ratio/max": 1.0105103254318237, "sampling/importance_sampling_ratio/mean": 0.9995189905166626, "sampling/importance_sampling_ratio/min": 0.9816344380378723, "sampling/sampling_logp_difference/max": 0.0185299813747406, "sampling/sampling_logp_difference/mean": 0.0005671929102391005, "step": 788, "step_time": 7.86071864799851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 372.0, "completions/mean_terminated_length": 372.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06554656894877553, "epoch": 0.01578, "frac_reward_zero_std": 0.0, "grad_norm": 0.03736132010817528, "kl": 0.7805263977497816, "learning_rate": 1.6478528067033864e-05, "loss": 0.0001, "num_tokens": 34816726.0, "reward": 0.7064270973205566, "reward_std": 0.5576552152633667, "rewards/rollout_reward_func/mean": 0.7064270973205566, "rewards/rollout_reward_func/std": 0.5959733724594116, "sampling/importance_sampling_ratio/max": 1.0915488004684448, "sampling/importance_sampling_ratio/mean": 1.0040838718414307, "sampling/importance_sampling_ratio/min": 0.9852777719497681, "sampling/sampling_logp_difference/max": 0.0875859260559082, "sampling/sampling_logp_difference/mean": 0.0014925911091268063, "step": 789, "step_time": 8.152078069999334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 407.59375, "completions/mean_terminated_length": 407.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.057402313221246004, "epoch": 0.0158, "frac_reward_zero_std": 0.25, "grad_norm": 0.011755947954952717, "kl": 0.6981163583695889, "learning_rate": 1.6435116386221618e-05, "loss": 0.0021, "num_tokens": 34859727.0, "reward": 0.8039124011993408, "reward_std": 0.4055701494216919, "rewards/rollout_reward_func/mean": 0.8039124011993408, "rewards/rollout_reward_func/std": 0.4847460389137268, "sampling/importance_sampling_ratio/max": 1.0080881118774414, "sampling/importance_sampling_ratio/mean": 0.9996989965438843, "sampling/importance_sampling_ratio/min": 0.9801159501075745, "sampling/sampling_logp_difference/max": 0.01893627643585205, "sampling/sampling_logp_difference/mean": 0.0006141515914350748, "step": 790, "step_time": 7.961952832001771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 382.28125, "completions/mean_terminated_length": 382.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.056648278376087546, "epoch": 0.01582, "frac_reward_zero_std": 0.0, "grad_norm": 0.01986757293343544, "kl": 0.8837477378547192, "learning_rate": 1.6392070963192196e-05, "loss": 0.008, "num_tokens": 34900456.0, "reward": 0.8035431504249573, "reward_std": 0.48977863788604736, "rewards/rollout_reward_func/mean": 0.8035431504249573, "rewards/rollout_reward_func/std": 0.4872267544269562, "sampling/importance_sampling_ratio/max": 1.0170392990112305, "sampling/importance_sampling_ratio/mean": 0.9961025714874268, "sampling/importance_sampling_ratio/min": 0.890734076499939, "sampling/sampling_logp_difference/max": 0.11573338508605957, "sampling/sampling_logp_difference/mean": 0.0017884031403809786, "step": 791, "step_time": 8.634185927001454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 452.875, "completions/mean_terminated_length": 452.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0642528694588691, "epoch": 0.01584, "frac_reward_zero_std": 0.25, "grad_norm": 0.0158521868288517, "kl": 0.6496627405285835, "learning_rate": 1.6349392414640034e-05, "loss": 0.0013, "num_tokens": 34944707.0, "reward": 0.7023695707321167, "reward_std": 0.455386757850647, "rewards/rollout_reward_func/mean": 0.7023695707321167, "rewards/rollout_reward_func/std": 0.544752836227417, "sampling/importance_sampling_ratio/max": 1.023719072341919, "sampling/importance_sampling_ratio/mean": 0.997847318649292, "sampling/importance_sampling_ratio/min": 0.9085052013397217, "sampling/sampling_logp_difference/max": 0.09595632553100586, "sampling/sampling_logp_difference/mean": 0.0014935205690562725, "step": 792, "step_time": 8.75390556900129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 402.59375, "completions/mean_terminated_length": 402.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06642402347642928, "epoch": 0.01586, "frac_reward_zero_std": 0.0, "grad_norm": 0.08941247314214706, "kl": 0.8314196765422821, "learning_rate": 1.6307081352003494e-05, "loss": 0.0035, "num_tokens": 34987036.0, "reward": 0.6664844155311584, "reward_std": 0.5540716052055359, "rewards/rollout_reward_func/mean": 0.6664844155311584, "rewards/rollout_reward_func/std": 0.5633814334869385, "sampling/importance_sampling_ratio/max": 1.0439141988754272, "sampling/importance_sampling_ratio/mean": 0.9964668154716492, "sampling/importance_sampling_ratio/min": 0.8946638107299805, "sampling/sampling_logp_difference/max": 0.11220431327819824, "sampling/sampling_logp_difference/mean": 0.002315998077392578, "step": 793, "step_time": 7.976567740999599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 475.78125, "completions/mean_terminated_length": 475.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06237508519552648, "epoch": 0.01588, "frac_reward_zero_std": 0.0, "grad_norm": 0.06793925166130066, "kl": 0.6279394179582596, "learning_rate": 1.62651383814561e-05, "loss": 0.0023, "num_tokens": 35033321.0, "reward": 0.7362751364707947, "reward_std": 0.5319033265113831, "rewards/rollout_reward_func/mean": 0.7362751364707947, "rewards/rollout_reward_func/std": 0.5261227488517761, "sampling/importance_sampling_ratio/max": 1.0023537874221802, "sampling/importance_sampling_ratio/mean": 0.9975550174713135, "sampling/importance_sampling_ratio/min": 0.9670704007148743, "sampling/sampling_logp_difference/max": 0.03351351618766785, "sampling/sampling_logp_difference/mean": 0.0006599896005354822, "step": 794, "step_time": 8.2285776610006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 361.84375, "completions/mean_terminated_length": 361.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.051604877691715956, "epoch": 0.0159, "frac_reward_zero_std": 0.0, "grad_norm": 0.5985586643218994, "kl": 3.979921966791153, "learning_rate": 1.6223564103897908e-05, "loss": 0.0178, "num_tokens": 35073526.0, "reward": 0.6052958369255066, "reward_std": 0.6149159669876099, "rewards/rollout_reward_func/mean": 0.6052958369255066, "rewards/rollout_reward_func/std": 0.6305144429206848, "sampling/importance_sampling_ratio/max": 1.0090422630310059, "sampling/importance_sampling_ratio/mean": 0.9958429336547852, "sampling/importance_sampling_ratio/min": 0.8899779915809631, "sampling/sampling_logp_difference/max": 0.12466192245483398, "sampling/sampling_logp_difference/mean": 0.0014145125169306993, "step": 795, "step_time": 8.378248322003856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 407.46875, "completions/mean_terminated_length": 407.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06452440680004656, "epoch": 0.01592, "frac_reward_zero_std": 0.0, "grad_norm": 0.11366371065378189, "kl": 0.8130898736417294, "learning_rate": 1.618235911494682e-05, "loss": 0.0026, "num_tokens": 35116925.0, "reward": 0.7014188170433044, "reward_std": 0.5232608318328857, "rewards/rollout_reward_func/mean": 0.7014188170433044, "rewards/rollout_reward_func/std": 0.5470775961875916, "sampling/importance_sampling_ratio/max": 1.0063245296478271, "sampling/importance_sampling_ratio/mean": 0.9953609108924866, "sampling/importance_sampling_ratio/min": 0.8832389712333679, "sampling/sampling_logp_difference/max": 0.11936020851135254, "sampling/sampling_logp_difference/mean": 0.0013395115965977311, "step": 796, "step_time": 8.073186461999285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 398.0, "completions/mean_terminated_length": 398.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06008747697342187, "epoch": 0.01594, "frac_reward_zero_std": 0.0, "grad_norm": 0.0804167315363884, "kl": 0.9989961013197899, "learning_rate": 1.614152400493011e-05, "loss": 0.0108, "num_tokens": 35158674.0, "reward": 0.6679733991622925, "reward_std": 0.5204194784164429, "rewards/rollout_reward_func/mean": 0.6679733991622925, "rewards/rollout_reward_func/std": 0.5610801577568054, "sampling/importance_sampling_ratio/max": 1.1256625652313232, "sampling/importance_sampling_ratio/mean": 1.0047290325164795, "sampling/importance_sampling_ratio/min": 0.988968551158905, "sampling/sampling_logp_difference/max": 0.11838054656982422, "sampling/sampling_logp_difference/mean": 0.0014370176941156387, "step": 797, "step_time": 8.388752183998804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 380.40625, "completions/mean_terminated_length": 380.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07223883853293955, "epoch": 0.01596, "frac_reward_zero_std": 0.0, "grad_norm": 0.04579365998506546, "kl": 0.7041960526257753, "learning_rate": 1.6101059358875935e-05, "loss": 0.0069, "num_tokens": 35200216.0, "reward": 0.7353888750076294, "reward_std": 0.5509779453277588, "rewards/rollout_reward_func/mean": 0.7353888750076294, "rewards/rollout_reward_func/std": 0.5306271910667419, "sampling/importance_sampling_ratio/max": 1.019615888595581, "sampling/importance_sampling_ratio/mean": 0.9967086911201477, "sampling/importance_sampling_ratio/min": 0.8910372853279114, "sampling/sampling_logp_difference/max": 0.11537742614746094, "sampling/sampling_logp_difference/mean": 0.001523254089988768, "step": 798, "step_time": 7.798069387996293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 470.4375, "completions/mean_terminated_length": 470.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06855399673804641, "epoch": 0.01598, "frac_reward_zero_std": 0.25, "grad_norm": 0.03794488310813904, "kl": 0.630191083997488, "learning_rate": 1.606096575650497e-05, "loss": 0.0003, "num_tokens": 35245705.0, "reward": 0.7016433477401733, "reward_std": 0.45659032464027405, "rewards/rollout_reward_func/mean": 0.7016433477401733, "rewards/rollout_reward_func/std": 0.5453360676765442, "sampling/importance_sampling_ratio/max": 1.1232162714004517, "sampling/importance_sampling_ratio/mean": 1.001288890838623, "sampling/importance_sampling_ratio/min": 0.906802773475647, "sampling/sampling_logp_difference/max": 0.11619734764099121, "sampling/sampling_logp_difference/mean": 0.0021399694960564375, "step": 799, "step_time": 8.120913160999407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 504.34375, "completions/mean_terminated_length": 504.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08631008910015225, "epoch": 0.016, "frac_reward_zero_std": 0.25, "grad_norm": 0.035973817110061646, "kl": 0.8967455215752125, "learning_rate": 1.60212437722221e-05, "loss": 0.0038, "num_tokens": 35293243.0, "reward": 0.6684175729751587, "reward_std": 0.4524528980255127, "rewards/rollout_reward_func/mean": 0.6684175729751587, "rewards/rollout_reward_func/std": 0.5593339800834656, "sampling/importance_sampling_ratio/max": 1.018982172012329, "sampling/importance_sampling_ratio/mean": 0.9970566630363464, "sampling/importance_sampling_ratio/min": 0.9189255833625793, "sampling/sampling_logp_difference/max": 0.08409261703491211, "sampling/sampling_logp_difference/mean": 0.001475007040426135, "step": 800, "step_time": 8.68343882299996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 424.9375, "completions/mean_terminated_length": 424.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07012075511738658, "epoch": 0.01602, "frac_reward_zero_std": 0.0, "grad_norm": 0.06605147570371628, "kl": 0.8790568895637989, "learning_rate": 1.5981893975108176e-05, "loss": -0.0019, "num_tokens": 35337486.0, "reward": 0.7055404782295227, "reward_std": 0.6166064143180847, "rewards/rollout_reward_func/mean": 0.7055404782295227, "rewards/rollout_reward_func/std": 0.5985879302024841, "sampling/importance_sampling_ratio/max": 1.0125051736831665, "sampling/importance_sampling_ratio/mean": 0.9981537461280823, "sampling/importance_sampling_ratio/min": 0.9034892320632935, "sampling/sampling_logp_difference/max": 0.10364413261413574, "sampling/sampling_logp_difference/mean": 0.0013865145156159997, "step": 801, "step_time": 8.489510819003044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 284.78125, "completions/mean_terminated_length": 284.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06311054457910359, "epoch": 0.01604, "frac_reward_zero_std": 0.5, "grad_norm": 0.011549519374966621, "kl": 0.7263636775314808, "learning_rate": 1.5942916928911885e-05, "loss": -0.0004, "num_tokens": 35374880.0, "reward": 0.7763888835906982, "reward_std": 0.3155328035354614, "rewards/rollout_reward_func/mean": 0.7763888835906982, "rewards/rollout_reward_func/std": 0.5567944049835205, "sampling/importance_sampling_ratio/max": 1.1051552295684814, "sampling/importance_sampling_ratio/mean": 1.002097249031067, "sampling/importance_sampling_ratio/min": 0.9759295582771301, "sampling/sampling_logp_difference/max": 0.10004043579101562, "sampling/sampling_logp_difference/mean": 0.0016478943871334195, "step": 802, "step_time": 7.886306267999316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 351.1875, "completions/mean_terminated_length": 351.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04742398066446185, "epoch": 0.01606, "frac_reward_zero_std": 0.25, "grad_norm": 0.012438046745955944, "kl": 0.92726094648242, "learning_rate": 1.5904313192041646e-05, "loss": 0.0097, "num_tokens": 35414485.0, "reward": 0.7968647480010986, "reward_std": 0.36123281717300415, "rewards/rollout_reward_func/mean": 0.7968647480010986, "rewards/rollout_reward_func/std": 0.4296363890171051, "sampling/importance_sampling_ratio/max": 1.0071756839752197, "sampling/importance_sampling_ratio/mean": 0.9975693821907043, "sampling/importance_sampling_ratio/min": 0.9586537480354309, "sampling/sampling_logp_difference/max": 0.041099607944488525, "sampling/sampling_logp_difference/mean": 0.000864475267007947, "step": 803, "step_time": 7.993106127998544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 486.78125, "completions/mean_terminated_length": 486.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07710423786193132, "epoch": 0.01608, "frac_reward_zero_std": 0.0, "grad_norm": 0.035115886479616165, "kl": 0.9844316244125366, "learning_rate": 1.5866083317557657e-05, "loss": 0.0019, "num_tokens": 35461071.0, "reward": 0.3386470675468445, "reward_std": 0.689888596534729, "rewards/rollout_reward_func/mean": 0.3386470675468445, "rewards/rollout_reward_func/std": 0.680852472782135, "sampling/importance_sampling_ratio/max": 1.0361777544021606, "sampling/importance_sampling_ratio/mean": 0.9990265369415283, "sampling/importance_sampling_ratio/min": 0.8871454000473022, "sampling/sampling_logp_difference/max": 0.11974430084228516, "sampling/sampling_logp_difference/mean": 0.0015851883217692375, "step": 804, "step_time": 8.250393581998651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07304479787126184, "epoch": 0.0161, "frac_reward_zero_std": 0.5, "grad_norm": 0.03416779637336731, "kl": 0.821578711271286, "learning_rate": 1.5828227853163896e-05, "loss": -0.003, "num_tokens": 35498734.0, "reward": 0.6704901456832886, "reward_std": 0.3823014497756958, "rewards/rollout_reward_func/mean": 0.6704901456832886, "rewards/rollout_reward_func/std": 0.6132839322090149, "sampling/importance_sampling_ratio/max": 1.0062108039855957, "sampling/importance_sampling_ratio/mean": 0.9988445043563843, "sampling/importance_sampling_ratio/min": 0.955150842666626, "sampling/sampling_logp_difference/max": 0.046167850494384766, "sampling/sampling_logp_difference/mean": 0.0008236067369580269, "step": 805, "step_time": 8.343354137003189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 510.8125, "completions/mean_terminated_length": 510.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07106104353442788, "epoch": 0.01612, "frac_reward_zero_std": 0.0, "grad_norm": 0.04188307374715805, "kl": 1.1950361505150795, "learning_rate": 1.579074734120036e-05, "loss": -0.0021, "num_tokens": 35546883.0, "reward": 0.6309818029403687, "reward_std": 0.5864956378936768, "rewards/rollout_reward_func/mean": 0.6309818029403687, "rewards/rollout_reward_func/std": 0.577252984046936, "sampling/importance_sampling_ratio/max": 1.1176190376281738, "sampling/importance_sampling_ratio/mean": 1.0036146640777588, "sampling/importance_sampling_ratio/min": 0.9739316701889038, "sampling/sampling_logp_difference/max": 0.12194585800170898, "sampling/sampling_logp_difference/mean": 0.0015788780292496085, "step": 806, "step_time": 8.81240533699929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 447.34375, "completions/mean_terminated_length": 447.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.056206359062343836, "epoch": 0.01614, "frac_reward_zero_std": 0.0, "grad_norm": 0.02296527475118637, "kl": 0.7718424275517464, "learning_rate": 1.5753642318635237e-05, "loss": 0.0169, "num_tokens": 35590191.0, "reward": 0.7644960880279541, "reward_std": 0.4542505145072937, "rewards/rollout_reward_func/mean": 0.7644960880279541, "rewards/rollout_reward_func/std": 0.45220446586608887, "sampling/importance_sampling_ratio/max": 1.0249199867248535, "sampling/importance_sampling_ratio/mean": 1.000868320465088, "sampling/importance_sampling_ratio/min": 0.9887489676475525, "sampling/sampling_logp_difference/max": 0.022709161043167114, "sampling/sampling_logp_difference/mean": 0.0006379460101015866, "step": 807, "step_time": 8.075879277996137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 353.90625, "completions/mean_terminated_length": 353.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04878453677520156, "epoch": 0.01616, "frac_reward_zero_std": 0.5, "grad_norm": 0.0026365460362285376, "kl": 0.5041298680007458, "learning_rate": 1.5716913317057216e-05, "loss": 0.0033, "num_tokens": 35630365.0, "reward": 0.8982484340667725, "reward_std": 0.2213784009218216, "rewards/rollout_reward_func/mean": 0.8982484340667725, "rewards/rollout_reward_func/std": 0.32142913341522217, "sampling/importance_sampling_ratio/max": 1.0061407089233398, "sampling/importance_sampling_ratio/mean": 1.000422716140747, "sampling/importance_sampling_ratio/min": 0.9934672117233276, "sampling/sampling_logp_difference/max": 0.0065551698207855225, "sampling/sampling_logp_difference/mean": 0.00038021206273697317, "step": 808, "step_time": 8.11809142600032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 449.03125, "completions/mean_terminated_length": 449.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07198605639860034, "epoch": 0.01618, "frac_reward_zero_std": 0.0, "grad_norm": 0.030751101672649384, "kl": 1.0916265211999416, "learning_rate": 1.568056086266791e-05, "loss": 0.0001, "num_tokens": 35674953.0, "reward": 0.5989353656768799, "reward_std": 0.5562080144882202, "rewards/rollout_reward_func/mean": 0.5989353656768799, "rewards/rollout_reward_func/std": 0.5845658183097839, "sampling/importance_sampling_ratio/max": 1.0163671970367432, "sampling/importance_sampling_ratio/mean": 0.9971620440483093, "sampling/importance_sampling_ratio/min": 0.9173423647880554, "sampling/sampling_logp_difference/max": 0.0867682695388794, "sampling/sampling_logp_difference/mean": 0.0013596841599792242, "step": 809, "step_time": 8.561567707996801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 432.03125, "completions/mean_terminated_length": 432.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07398681296035647, "epoch": 0.0162, "frac_reward_zero_std": 0.0, "grad_norm": 0.0359635055065155, "kl": 0.603895815089345, "learning_rate": 1.564458547627427e-05, "loss": -0.0077, "num_tokens": 35718903.0, "reward": 0.6790850162506104, "reward_std": 0.6500009298324585, "rewards/rollout_reward_func/mean": 0.6790850162506104, "rewards/rollout_reward_func/std": 0.6519681811332703, "sampling/importance_sampling_ratio/max": 1.1116830110549927, "sampling/importance_sampling_ratio/mean": 1.001267671585083, "sampling/importance_sampling_ratio/min": 0.9460585117340088, "sampling/sampling_logp_difference/max": 0.1039273738861084, "sampling/sampling_logp_difference/mean": 0.0016520100180059671, "step": 810, "step_time": 9.010202614001173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 420.125, "completions/mean_terminated_length": 420.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0560151319950819, "epoch": 0.01622, "frac_reward_zero_std": 0.25, "grad_norm": 0.022545794025063515, "kl": 0.8702238313853741, "learning_rate": 1.5608987673281167e-05, "loss": 0.0031, "num_tokens": 35762303.0, "reward": 0.7360911965370178, "reward_std": 0.4430886507034302, "rewards/rollout_reward_func/mean": 0.7360911965370178, "rewards/rollout_reward_func/std": 0.5280490517616272, "sampling/importance_sampling_ratio/max": 1.0091607570648193, "sampling/importance_sampling_ratio/mean": 0.9965794086456299, "sampling/importance_sampling_ratio/min": 0.889564573764801, "sampling/sampling_logp_difference/max": 0.11792683601379395, "sampling/sampling_logp_difference/mean": 0.0013866155641153455, "step": 811, "step_time": 9.020318988999861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 389.9375, "completions/mean_terminated_length": 389.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06011184933595359, "epoch": 0.01624, "frac_reward_zero_std": 0.25, "grad_norm": 0.020194638520479202, "kl": 0.780786694958806, "learning_rate": 1.5573767963683963e-05, "loss": -0.0033, "num_tokens": 35804885.0, "reward": 0.6651945114135742, "reward_std": 0.46336784958839417, "rewards/rollout_reward_func/mean": 0.6651945114135742, "rewards/rollout_reward_func/std": 0.5635818243026733, "sampling/importance_sampling_ratio/max": 1.015379786491394, "sampling/importance_sampling_ratio/mean": 1.0005251169204712, "sampling/importance_sampling_ratio/min": 0.978553056716919, "sampling/sampling_logp_difference/max": 0.02312570810317993, "sampling/sampling_logp_difference/mean": 0.0007126822602003813, "step": 812, "step_time": 8.564413396998134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 446.15625, "completions/mean_terminated_length": 446.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05588191468268633, "epoch": 0.01626, "frac_reward_zero_std": 0.25, "grad_norm": 0.02821241319179535, "kl": 0.6836834885179996, "learning_rate": 1.5538926852061252e-05, "loss": 0.0096, "num_tokens": 35848787.0, "reward": 0.6673328280448914, "reward_std": 0.45564940571784973, "rewards/rollout_reward_func/mean": 0.6673328280448914, "rewards/rollout_reward_func/std": 0.561802089214325, "sampling/importance_sampling_ratio/max": 1.033838152885437, "sampling/importance_sampling_ratio/mean": 1.001457691192627, "sampling/importance_sampling_ratio/min": 0.9924077391624451, "sampling/sampling_logp_difference/max": 0.033247604966163635, "sampling/sampling_logp_difference/mean": 0.00063021678943187, "step": 813, "step_time": 8.651210995998554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 440.875, "completions/mean_terminated_length": 440.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05827107094228268, "epoch": 0.01628, "frac_reward_zero_std": 0.25, "grad_norm": 0.004112121183425188, "kl": 0.6182333081960678, "learning_rate": 1.550446483756759e-05, "loss": 0.0019, "num_tokens": 35893236.0, "reward": 0.8986976146697998, "reward_std": 0.28652656078338623, "rewards/rollout_reward_func/mean": 0.8986976146697998, "rewards/rollout_reward_func/std": 0.32019907236099243, "sampling/importance_sampling_ratio/max": 1.0212689638137817, "sampling/importance_sampling_ratio/mean": 1.0019484758377075, "sampling/importance_sampling_ratio/min": 0.9920545220375061, "sampling/sampling_logp_difference/max": 0.021143615245819092, "sampling/sampling_logp_difference/mean": 0.0007661266718059778, "step": 814, "step_time": 8.96901049799635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 409.78125, "completions/mean_terminated_length": 409.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0371398557908833, "epoch": 0.0163, "frac_reward_zero_std": 0.0, "grad_norm": 0.011742603965103626, "kl": 0.6919258274137974, "learning_rate": 1.547038241392637e-05, "loss": 0.0099, "num_tokens": 35935372.0, "reward": 0.6976609826087952, "reward_std": 0.49777719378471375, "rewards/rollout_reward_func/mean": 0.6976609826087952, "rewards/rollout_reward_func/std": 0.491085022687912, "sampling/importance_sampling_ratio/max": 1.1263525485992432, "sampling/importance_sampling_ratio/mean": 1.004740834236145, "sampling/importance_sampling_ratio/min": 0.9931029081344604, "sampling/sampling_logp_difference/max": 0.11902379989624023, "sampling/sampling_logp_difference/mean": 0.0012996350415050983, "step": 815, "step_time": 8.519079981000687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 421.3125, "completions/mean_terminated_length": 421.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0573875040281564, "epoch": 0.01632, "frac_reward_zero_std": 0.0, "grad_norm": 0.08515994250774384, "kl": 0.6538042277097702, "learning_rate": 1.5436680069422742e-05, "loss": 0.0023, "num_tokens": 35978939.0, "reward": 0.7677947282791138, "reward_std": 0.5062540769577026, "rewards/rollout_reward_func/mean": 0.7677947282791138, "rewards/rollout_reward_func/std": 0.5134842395782471, "sampling/importance_sampling_ratio/max": 1.109200119972229, "sampling/importance_sampling_ratio/mean": 1.0000571012496948, "sampling/importance_sampling_ratio/min": 0.8837888836860657, "sampling/sampling_logp_difference/max": 0.1235356330871582, "sampling/sampling_logp_difference/mean": 0.0035614706575870514, "step": 816, "step_time": 9.131920240000909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 369.03125, "completions/mean_terminated_length": 369.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0621969816274941, "epoch": 0.01634, "frac_reward_zero_std": 0.25, "grad_norm": 0.07359834015369415, "kl": 0.7743665166199207, "learning_rate": 1.54033582868966e-05, "loss": 0.0033, "num_tokens": 36020746.0, "reward": 0.736119270324707, "reward_std": 0.4416116178035736, "rewards/rollout_reward_func/mean": 0.736119270324707, "rewards/rollout_reward_func/std": 0.5292543172836304, "sampling/importance_sampling_ratio/max": 1.0060442686080933, "sampling/importance_sampling_ratio/mean": 0.9956815838813782, "sampling/importance_sampling_ratio/min": 0.8954839110374451, "sampling/sampling_logp_difference/max": 0.10622930526733398, "sampling/sampling_logp_difference/mean": 0.0015915692783892155, "step": 817, "step_time": 8.065226477998294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 459.25, "completions/mean_terminated_length": 459.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04585908446460962, "epoch": 0.01636, "frac_reward_zero_std": 0.0, "grad_norm": 0.08185821771621704, "kl": 0.6921622566878796, "learning_rate": 1.53704175437357e-05, "loss": 0.0054, "num_tokens": 36066962.0, "reward": 0.728678822517395, "reward_std": 0.4605882465839386, "rewards/rollout_reward_func/mean": 0.728678822517395, "rewards/rollout_reward_func/std": 0.4776410758495331, "sampling/importance_sampling_ratio/max": 1.1282459497451782, "sampling/importance_sampling_ratio/mean": 1.0050780773162842, "sampling/importance_sampling_ratio/min": 0.9958339333534241, "sampling/sampling_logp_difference/max": 0.12065434455871582, "sampling/sampling_logp_difference/mean": 0.0014532688073813915, "step": 818, "step_time": 8.164417069998308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 493.84375, "completions/mean_terminated_length": 493.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05672586685977876, "epoch": 0.01638, "frac_reward_zero_std": 0.0, "grad_norm": 0.02317848987877369, "kl": 1.0809706784784794, "learning_rate": 1.533785831186879e-05, "loss": 0.0128, "num_tokens": 36113487.0, "reward": 0.6353225708007812, "reward_std": 0.580845832824707, "rewards/rollout_reward_func/mean": 0.6353225708007812, "rewards/rollout_reward_func/std": 0.5717714428901672, "sampling/importance_sampling_ratio/max": 1.0092449188232422, "sampling/importance_sampling_ratio/mean": 0.9985250234603882, "sampling/importance_sampling_ratio/min": 0.97586989402771, "sampling/sampling_logp_difference/max": 0.026120617985725403, "sampling/sampling_logp_difference/mean": 0.0007048675906844437, "step": 819, "step_time": 8.769640262004032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 363.8125, "completions/mean_terminated_length": 363.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04550462937913835, "epoch": 0.0164, "frac_reward_zero_std": 0.25, "grad_norm": 0.02104831486940384, "kl": 0.8296581096947193, "learning_rate": 1.5305681057758858e-05, "loss": 0.0079, "num_tokens": 36153811.0, "reward": 0.7957538366317749, "reward_std": 0.3625938296318054, "rewards/rollout_reward_func/mean": 0.7957538366317749, "rewards/rollout_reward_func/std": 0.4320549964904785, "sampling/importance_sampling_ratio/max": 1.0120164155960083, "sampling/importance_sampling_ratio/mean": 0.9958393573760986, "sampling/importance_sampling_ratio/min": 0.8871482610702515, "sampling/sampling_logp_difference/max": 0.11973905563354492, "sampling/sampling_logp_difference/mean": 0.0016409934032708406, "step": 820, "step_time": 8.15920336399904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 479.3125, "completions/mean_terminated_length": 479.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04327695700339973, "epoch": 0.01642, "frac_reward_zero_std": 0.0, "grad_norm": 0.02695535682141781, "kl": 0.86151715926826, "learning_rate": 1.5273886242396457e-05, "loss": 0.0148, "num_tokens": 36198464.0, "reward": 0.6952789425849915, "reward_std": 0.5012289881706238, "rewards/rollout_reward_func/mean": 0.6952789425849915, "rewards/rollout_reward_func/std": 0.49497851729393005, "sampling/importance_sampling_ratio/max": 1.0144306421279907, "sampling/importance_sampling_ratio/mean": 1.0007717609405518, "sampling/importance_sampling_ratio/min": 0.9954867362976074, "sampling/sampling_logp_difference/max": 0.014438897371292114, "sampling/sampling_logp_difference/mean": 0.00040338776307180524, "step": 821, "step_time": 9.336275475001457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 389.71875, "completions/mean_terminated_length": 389.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04808772192336619, "epoch": 0.01644, "frac_reward_zero_std": 0.0, "grad_norm": 0.043658994138240814, "kl": 0.7224203152582049, "learning_rate": 1.5242474321293087e-05, "loss": -0.0036, "num_tokens": 36240899.0, "reward": 0.6643294095993042, "reward_std": 0.5333971977233887, "rewards/rollout_reward_func/mean": 0.6643294095993042, "rewards/rollout_reward_func/std": 0.566520631313324, "sampling/importance_sampling_ratio/max": 1.1179192066192627, "sampling/importance_sampling_ratio/mean": 1.0038456916809082, "sampling/importance_sampling_ratio/min": 0.9787328243255615, "sampling/sampling_logp_difference/max": 0.10895776748657227, "sampling/sampling_logp_difference/mean": 0.0014018472284078598, "step": 822, "step_time": 8.233554148000621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 428.4375, "completions/mean_terminated_length": 428.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05485102999955416, "epoch": 0.01646, "frac_reward_zero_std": 0.0, "grad_norm": 0.1936264932155609, "kl": 2.230668768286705, "learning_rate": 1.5211445744474684e-05, "loss": 0.0015, "num_tokens": 36284788.0, "reward": 0.7064613103866577, "reward_std": 0.595603346824646, "rewards/rollout_reward_func/mean": 0.7064613103866577, "rewards/rollout_reward_func/std": 0.5959100127220154, "sampling/importance_sampling_ratio/max": 1.011829137802124, "sampling/importance_sampling_ratio/mean": 0.9950277805328369, "sampling/importance_sampling_ratio/min": 0.8857820630073547, "sampling/sampling_logp_difference/max": 0.12421369552612305, "sampling/sampling_logp_difference/mean": 0.002468930324539542, "step": 823, "step_time": 8.363725195998995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 385.40625, "completions/mean_terminated_length": 385.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05378544004634023, "epoch": 0.01648, "frac_reward_zero_std": 0.0, "grad_norm": 0.040861863642930984, "kl": 0.703784903511405, "learning_rate": 1.5180800956475153e-05, "loss": -0.002, "num_tokens": 36326480.0, "reward": 0.7335395812988281, "reward_std": 0.5355923175811768, "rewards/rollout_reward_func/mean": 0.7335395812988281, "rewards/rollout_reward_func/std": 0.531883955001831, "sampling/importance_sampling_ratio/max": 1.0065306425094604, "sampling/importance_sampling_ratio/mean": 0.9993828535079956, "sampling/importance_sampling_ratio/min": 0.9787564873695374, "sampling/sampling_logp_difference/max": 0.02201858162879944, "sampling/sampling_logp_difference/mean": 0.0005565286264754832, "step": 824, "step_time": 8.220533427996997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 413.03125, "completions/mean_terminated_length": 413.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04546947847120464, "epoch": 0.0165, "frac_reward_zero_std": 0.0, "grad_norm": 0.08123034983873367, "kl": 1.238835446536541, "learning_rate": 1.5150540396330036e-05, "loss": -0.001, "num_tokens": 36370456.0, "reward": 0.5713375210762024, "reward_std": 0.6173105239868164, "rewards/rollout_reward_func/mean": 0.5713375210762024, "rewards/rollout_reward_func/std": 0.6365612149238586, "sampling/importance_sampling_ratio/max": 1.0048421621322632, "sampling/importance_sampling_ratio/mean": 0.9963082671165466, "sampling/importance_sampling_ratio/min": 0.8884348273277283, "sampling/sampling_logp_difference/max": 0.11827540397644043, "sampling/sampling_logp_difference/mean": 0.001350810518488288, "step": 825, "step_time": 8.307988634001958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 478.21875, "completions/mean_terminated_length": 478.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.059843486873432994, "epoch": 0.01652, "frac_reward_zero_std": 0.25, "grad_norm": 0.05146130919456482, "kl": 0.6200976893305779, "learning_rate": 1.5120664497570161e-05, "loss": 0.0038, "num_tokens": 36416372.0, "reward": 0.8050457239151001, "reward_std": 0.4040268659591675, "rewards/rollout_reward_func/mean": 0.8050457239151001, "rewards/rollout_reward_func/std": 0.4844854474067688, "sampling/importance_sampling_ratio/max": 1.0049134492874146, "sampling/importance_sampling_ratio/mean": 0.9993922114372253, "sampling/importance_sampling_ratio/min": 0.9811071157455444, "sampling/sampling_logp_difference/max": 0.01905156672000885, "sampling/sampling_logp_difference/mean": 0.00048224651254713535, "step": 826, "step_time": 9.124938093998935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.036634857300668955, "epoch": 0.01654, "frac_reward_zero_std": 0.25, "grad_norm": 0.004414405208081007, "kl": 0.8314519077539444, "learning_rate": 1.5091173688215501e-05, "loss": 0.0128, "num_tokens": 36452370.0, "reward": 0.8668214678764343, "reward_std": 0.3124648332595825, "rewards/rollout_reward_func/mean": 0.8668214678764343, "rewards/rollout_reward_func/std": 0.3580608665943146, "sampling/importance_sampling_ratio/max": 1.0049915313720703, "sampling/importance_sampling_ratio/mean": 1.0003620386123657, "sampling/importance_sampling_ratio/min": 0.9951646327972412, "sampling/sampling_logp_difference/max": 0.004964835941791534, "sampling/sampling_logp_difference/mean": 0.0002981925499625504, "step": 827, "step_time": 8.012956541000676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 477.4375, "completions/mean_terminated_length": 477.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05154008255340159, "epoch": 0.01656, "frac_reward_zero_std": 0.0, "grad_norm": 0.15677131712436676, "kl": 1.99698044359684, "learning_rate": 1.5062068390768989e-05, "loss": 0.0075, "num_tokens": 36497954.0, "reward": 0.6041171550750732, "reward_std": 0.622161865234375, "rewards/rollout_reward_func/mean": 0.6041171550750732, "rewards/rollout_reward_func/std": 0.6318795680999756, "sampling/importance_sampling_ratio/max": 1.0141350030899048, "sampling/importance_sampling_ratio/mean": 0.9989094734191895, "sampling/importance_sampling_ratio/min": 0.9787051677703857, "sampling/sampling_logp_difference/max": 0.02192486822605133, "sampling/sampling_logp_difference/mean": 0.0007481951033696532, "step": 828, "step_time": 8.642817738998929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 392.0625, "completions/mean_terminated_length": 392.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08381593693047762, "epoch": 0.01658, "frac_reward_zero_std": 0.0, "grad_norm": 0.059415653347969055, "kl": 0.7543481774628162, "learning_rate": 1.5033349022210494e-05, "loss": -0.0123, "num_tokens": 36540952.0, "reward": 0.450755774974823, "reward_std": 0.7105492353439331, "rewards/rollout_reward_func/mean": 0.450755774974823, "rewards/rollout_reward_func/std": 0.77128666639328, "sampling/importance_sampling_ratio/max": 1.0345838069915771, "sampling/importance_sampling_ratio/mean": 1.0031328201293945, "sampling/importance_sampling_ratio/min": 0.983874499797821, "sampling/sampling_logp_difference/max": 0.03365647792816162, "sampling/sampling_logp_difference/mean": 0.001374849583953619, "step": 829, "step_time": 8.962845025005663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 377.34375, "completions/mean_terminated_length": 377.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05924565810710192, "epoch": 0.0166, "frac_reward_zero_std": 0.0, "grad_norm": 0.05260129272937775, "kl": 0.9830521270632744, "learning_rate": 1.5005015993990828e-05, "loss": 0.0072, "num_tokens": 36581942.0, "reward": 0.574288010597229, "reward_std": 0.6213366985321045, "rewards/rollout_reward_func/mean": 0.574288010597229, "rewards/rollout_reward_func/std": 0.633277952671051, "sampling/importance_sampling_ratio/max": 1.128801941871643, "sampling/importance_sampling_ratio/mean": 0.9980472922325134, "sampling/importance_sampling_ratio/min": 0.887250542640686, "sampling/sampling_logp_difference/max": 0.12115478515625, "sampling/sampling_logp_difference/mean": 0.0033213640563189983, "step": 830, "step_time": 7.9819387489988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 506.4375, "completions/mean_terminated_length": 506.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.060952065978199244, "epoch": 0.01662, "frac_reward_zero_std": 0.0, "grad_norm": 0.09937496483325958, "kl": 0.6879401872865856, "learning_rate": 1.4977069712025868e-05, "loss": -0.0061, "num_tokens": 36629182.0, "reward": 0.6731650829315186, "reward_std": 0.5760180354118347, "rewards/rollout_reward_func/mean": 0.6731650829315186, "rewards/rollout_reward_func/std": 0.6085712909698486, "sampling/importance_sampling_ratio/max": 1.2599050998687744, "sampling/importance_sampling_ratio/mean": 1.0128400325775146, "sampling/importance_sampling_ratio/min": 0.999647855758667, "sampling/sampling_logp_difference/max": 0.1162424087524414, "sampling/sampling_logp_difference/mean": 0.0026949443854391575, "step": 831, "step_time": 8.664811336997445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 493.65625, "completions/mean_terminated_length": 493.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05717154545709491, "epoch": 0.01664, "frac_reward_zero_std": 0.0, "grad_norm": 0.04670167341828346, "kl": 0.8983644973486662, "learning_rate": 1.494951057669073e-05, "loss": 0.005, "num_tokens": 36675841.0, "reward": 0.600641131401062, "reward_std": 0.5857999324798584, "rewards/rollout_reward_func/mean": 0.600641131401062, "rewards/rollout_reward_func/std": 0.5822984576225281, "sampling/importance_sampling_ratio/max": 1.032374620437622, "sampling/importance_sampling_ratio/mean": 0.9981716871261597, "sampling/importance_sampling_ratio/min": 0.9008136987686157, "sampling/sampling_logp_difference/max": 0.1187124252319336, "sampling/sampling_logp_difference/mean": 0.001532580005005002, "step": 832, "step_time": 8.140646728999855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 435.6875, "completions/mean_terminated_length": 435.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06377347186207771, "epoch": 0.01666, "frac_reward_zero_std": 0.0, "grad_norm": 0.07555516064167023, "kl": 1.4016031809151173, "learning_rate": 1.4922338982814038e-05, "loss": 0.0037, "num_tokens": 36719194.0, "reward": 0.6121745109558105, "reward_std": 0.6396194100379944, "rewards/rollout_reward_func/mean": 0.6121745109558105, "rewards/rollout_reward_func/std": 0.6703152060508728, "sampling/importance_sampling_ratio/max": 1.0054054260253906, "sampling/importance_sampling_ratio/mean": 0.995147705078125, "sampling/importance_sampling_ratio/min": 0.884665846824646, "sampling/sampling_logp_difference/max": 0.12304544448852539, "sampling/sampling_logp_difference/mean": 0.0016927642282098532, "step": 833, "step_time": 8.607745955998325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 448.71875, "completions/mean_terminated_length": 448.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.043650972191244364, "epoch": 0.01668, "frac_reward_zero_std": 0.5, "grad_norm": 0.03192811459302902, "kl": 0.4580346199218184, "learning_rate": 1.4895555319672266e-05, "loss": 0.0006, "num_tokens": 36763575.0, "reward": 0.7374066114425659, "reward_std": 0.30330345034599304, "rewards/rollout_reward_func/mean": 0.7374066114425659, "rewards/rollout_reward_func/std": 0.525875449180603, "sampling/importance_sampling_ratio/max": 1.08594810962677, "sampling/importance_sampling_ratio/mean": 1.003523588180542, "sampling/importance_sampling_ratio/min": 0.9952242374420166, "sampling/sampling_logp_difference/max": 0.08226180076599121, "sampling/sampling_logp_difference/mean": 0.0009673017193563282, "step": 834, "step_time": 8.092446775001008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 402.71875, "completions/mean_terminated_length": 402.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05761260469444096, "epoch": 0.0167, "frac_reward_zero_std": 0.0, "grad_norm": 0.015948623418807983, "kl": 0.8238977445289493, "learning_rate": 1.4869159970984158e-05, "loss": 0.0003, "num_tokens": 36805888.0, "reward": 0.6993547677993774, "reward_std": 0.541802167892456, "rewards/rollout_reward_func/mean": 0.6993547677993774, "rewards/rollout_reward_func/std": 0.5492945909500122, "sampling/importance_sampling_ratio/max": 1.042350172996521, "sampling/importance_sampling_ratio/mean": 0.9997262954711914, "sampling/importance_sampling_ratio/min": 0.9734691381454468, "sampling/sampling_logp_difference/max": 0.041470736265182495, "sampling/sampling_logp_difference/mean": 0.0010904786176979542, "step": 835, "step_time": 8.04595576200336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 354.4375, "completions/mean_terminated_length": 354.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05524573102593422, "epoch": 0.01672, "frac_reward_zero_std": 0.0, "grad_norm": 0.07343056052923203, "kl": 0.8190113916061819, "learning_rate": 1.4843153314905235e-05, "loss": 0.0021, "num_tokens": 36846940.0, "reward": 0.6813766360282898, "reward_std": 0.6530385613441467, "rewards/rollout_reward_func/mean": 0.6813766360282898, "rewards/rollout_reward_func/std": 0.6473329663276672, "sampling/importance_sampling_ratio/max": 1.2563180923461914, "sampling/importance_sampling_ratio/mean": 1.007590651512146, "sampling/importance_sampling_ratio/min": 0.9822481870651245, "sampling/sampling_logp_difference/max": 0.2285308837890625, "sampling/sampling_logp_difference/mean": 0.0024209017865359783, "step": 836, "step_time": 8.17139157100064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 462.875, "completions/mean_terminated_length": 462.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04807454743422568, "epoch": 0.01674, "frac_reward_zero_std": 0.0, "grad_norm": 0.1007944792509079, "kl": 1.24337038397789, "learning_rate": 1.4817535724022387e-05, "loss": 0.0089, "num_tokens": 36892420.0, "reward": 0.6680226922035217, "reward_std": 0.5342428684234619, "rewards/rollout_reward_func/mean": 0.6680226922035217, "rewards/rollout_reward_func/std": 0.5614187121391296, "sampling/importance_sampling_ratio/max": 1.0094196796417236, "sampling/importance_sampling_ratio/mean": 0.9931314587593079, "sampling/importance_sampling_ratio/min": 0.8684622645378113, "sampling/sampling_logp_difference/max": 0.12413167953491211, "sampling/sampling_logp_difference/mean": 0.002200852148234844, "step": 837, "step_time": 8.19255076400077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 463.96875, "completions/mean_terminated_length": 463.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04658126179128885, "epoch": 0.01676, "frac_reward_zero_std": 0.0, "grad_norm": 0.023944582790136337, "kl": 0.7683697734028101, "learning_rate": 1.4792307565348491e-05, "loss": 0.0046, "num_tokens": 36937526.0, "reward": 0.8039124011993408, "reward_std": 0.4889276325702667, "rewards/rollout_reward_func/mean": 0.8039124011993408, "rewards/rollout_reward_func/std": 0.4847460389137268, "sampling/importance_sampling_ratio/max": 1.007127046585083, "sampling/importance_sampling_ratio/mean": 0.9999940395355225, "sampling/importance_sampling_ratio/min": 0.9882185459136963, "sampling/sampling_logp_difference/max": 0.011858612298965454, "sampling/sampling_logp_difference/mean": 0.0005011459579691291, "step": 838, "step_time": 8.58798081400164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 434.65625, "completions/mean_terminated_length": 434.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04094124957919121, "epoch": 0.01678, "frac_reward_zero_std": 0.0, "grad_norm": 0.005670307669788599, "kl": 0.7225266769528389, "learning_rate": 1.4767469200317226e-05, "loss": 0.013, "num_tokens": 36980592.0, "reward": 0.7655415534973145, "reward_std": 0.46706655621528625, "rewards/rollout_reward_func/mean": 0.7655415534973145, "rewards/rollout_reward_func/std": 0.4502011835575104, "sampling/importance_sampling_ratio/max": 1.0080193281173706, "sampling/importance_sampling_ratio/mean": 1.00044846534729, "sampling/importance_sampling_ratio/min": 0.9962496757507324, "sampling/sampling_logp_difference/max": 0.00796961784362793, "sampling/sampling_logp_difference/mean": 0.00034821141161955893, "step": 839, "step_time": 8.145435153001017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 462.71875, "completions/mean_terminated_length": 462.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05585383623838425, "epoch": 0.0168, "frac_reward_zero_std": 0.0, "grad_norm": 0.040723539888858795, "kl": 0.9436648264527321, "learning_rate": 1.4743020984777827e-05, "loss": -0.0034, "num_tokens": 37026540.0, "reward": 0.6388609409332275, "reward_std": 0.606910228729248, "rewards/rollout_reward_func/mean": 0.6388609409332275, "rewards/rollout_reward_func/std": 0.6201330423355103, "sampling/importance_sampling_ratio/max": 1.0106072425842285, "sampling/importance_sampling_ratio/mean": 0.9961076974868774, "sampling/importance_sampling_ratio/min": 0.9054762721061707, "sampling/sampling_logp_difference/max": 0.10177850723266602, "sampling/sampling_logp_difference/mean": 0.001617221161723137, "step": 840, "step_time": 8.643873760998758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 325.9375, "completions/mean_terminated_length": 325.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05022431677207351, "epoch": 0.01682, "frac_reward_zero_std": 0.0, "grad_norm": 0.04169469326734543, "kl": 1.0586289837956429, "learning_rate": 1.4718963268990027e-05, "loss": 0.0024, "num_tokens": 37065658.0, "reward": 0.7965763807296753, "reward_std": 0.4420364499092102, "rewards/rollout_reward_func/mean": 0.7965763807296753, "rewards/rollout_reward_func/std": 0.4302918016910553, "sampling/importance_sampling_ratio/max": 1.0099191665649414, "sampling/importance_sampling_ratio/mean": 1.0015918016433716, "sampling/importance_sampling_ratio/min": 0.9975568652153015, "sampling/sampling_logp_difference/max": 0.009815827012062073, "sampling/sampling_logp_difference/mean": 0.0004892681026831269, "step": 841, "step_time": 7.919848783005364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 330.65625, "completions/mean_terminated_length": 330.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.043214283185079694, "epoch": 0.01684, "frac_reward_zero_std": 0.0, "grad_norm": 0.07022053003311157, "kl": 0.9132380187511444, "learning_rate": 1.4695296397619016e-05, "loss": -0.0032, "num_tokens": 37105280.0, "reward": 0.8010431528091431, "reward_std": 0.4968496859073639, "rewards/rollout_reward_func/mean": 0.8010431528091431, "rewards/rollout_reward_func/std": 0.49189919233322144, "sampling/importance_sampling_ratio/max": 1.1232244968414307, "sampling/importance_sampling_ratio/mean": 1.0015716552734375, "sampling/importance_sampling_ratio/min": 0.8950838446617126, "sampling/sampling_logp_difference/max": 0.11600708961486816, "sampling/sampling_logp_difference/mean": 0.002593143144622445, "step": 842, "step_time": 7.98979400200551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 498.34375, "completions/mean_terminated_length": 498.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04697767924517393, "epoch": 0.01686, "frac_reward_zero_std": 0.0, "grad_norm": 0.015168175101280212, "kl": 0.49425575975328684, "learning_rate": 1.4672020709730534e-05, "loss": -0.0005, "num_tokens": 37152415.0, "reward": 0.6731700301170349, "reward_std": 0.6099646091461182, "rewards/rollout_reward_func/mean": 0.6731700301170349, "rewards/rollout_reward_func/std": 0.6082368493080139, "sampling/importance_sampling_ratio/max": 1.0037050247192383, "sampling/importance_sampling_ratio/mean": 0.9996103048324585, "sampling/importance_sampling_ratio/min": 0.9892399907112122, "sampling/sampling_logp_difference/max": 0.010941624641418457, "sampling/sampling_logp_difference/mean": 0.0004125390842091292, "step": 843, "step_time": 8.880945295999481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 399.65625, "completions/mean_terminated_length": 399.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04421049077063799, "epoch": 0.01688, "frac_reward_zero_std": 0.0, "grad_norm": 0.02675473690032959, "kl": 0.7450989261269569, "learning_rate": 1.464913653878597e-05, "loss": -0.0087, "num_tokens": 37195186.0, "reward": 0.6791346669197083, "reward_std": 0.588908314704895, "rewards/rollout_reward_func/mean": 0.6791346669197083, "rewards/rollout_reward_func/std": 0.6513751745223999, "sampling/importance_sampling_ratio/max": 1.0101145505905151, "sampling/importance_sampling_ratio/mean": 1.0009695291519165, "sampling/importance_sampling_ratio/min": 0.9962289333343506, "sampling/sampling_logp_difference/max": 0.010639742016792297, "sampling/sampling_logp_difference/mean": 0.00044328533113002777, "step": 844, "step_time": 8.270190414001263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 412.5625, "completions/mean_terminated_length": 412.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0524726128205657, "epoch": 0.0169, "frac_reward_zero_std": 0.25, "grad_norm": 0.03147036209702492, "kl": 0.5373848862946033, "learning_rate": 1.4626644212637613e-05, "loss": 0.0058, "num_tokens": 37238929.0, "reward": 0.80573570728302, "reward_std": 0.4036408066749573, "rewards/rollout_reward_func/mean": 0.80573570728302, "rewards/rollout_reward_func/std": 0.48146945238113403, "sampling/importance_sampling_ratio/max": 1.133946180343628, "sampling/importance_sampling_ratio/mean": 1.0031602382659912, "sampling/importance_sampling_ratio/min": 0.9895176291465759, "sampling/sampling_logp_difference/max": 0.1259608268737793, "sampling/sampling_logp_difference/mean": 0.0013607682194560766, "step": 845, "step_time": 9.166681446000439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 390.375, "completions/mean_terminated_length": 390.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05222357762977481, "epoch": 0.01692, "frac_reward_zero_std": 0.25, "grad_norm": 0.013464240357279778, "kl": 0.6077217943966389, "learning_rate": 1.4604544053523967e-05, "loss": 0.0018, "num_tokens": 37281335.0, "reward": 0.7609807848930359, "reward_std": 0.3944953680038452, "rewards/rollout_reward_func/mean": 0.7609807848930359, "rewards/rollout_reward_func/std": 0.4590326249599457, "sampling/importance_sampling_ratio/max": 1.0785746574401855, "sampling/importance_sampling_ratio/mean": 1.003387451171875, "sampling/importance_sampling_ratio/min": 0.9963753819465637, "sampling/sampling_logp_difference/max": 0.0749964714050293, "sampling/sampling_logp_difference/mean": 0.0009280926315113902, "step": 846, "step_time": 8.491543504997026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 388.96875, "completions/mean_terminated_length": 388.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06379276188090444, "epoch": 0.01694, "frac_reward_zero_std": 0.0, "grad_norm": 0.15089713037014008, "kl": 0.8631294518709183, "learning_rate": 1.4582836378065086e-05, "loss": -0.0081, "num_tokens": 37322776.0, "reward": 0.7049218416213989, "reward_std": 0.5745493173599243, "rewards/rollout_reward_func/mean": 0.7049218416213989, "rewards/rollout_reward_func/std": 0.599292516708374, "sampling/importance_sampling_ratio/max": 1.09018075466156, "sampling/importance_sampling_ratio/mean": 1.0009486675262451, "sampling/importance_sampling_ratio/min": 0.9308592677116394, "sampling/sampling_logp_difference/max": 0.08608222007751465, "sampling/sampling_logp_difference/mean": 0.0017037256620824337, "step": 847, "step_time": 8.050202769001771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 331.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05099083064123988, "epoch": 0.01696, "frac_reward_zero_std": 0.25, "grad_norm": 0.025171175599098206, "kl": 0.9157573208212852, "learning_rate": 1.45615214972581e-05, "loss": -0.0013, "num_tokens": 37362873.0, "reward": 0.7667670249938965, "reward_std": 0.4167524576187134, "rewards/rollout_reward_func/mean": 0.7667670249938965, "rewards/rollout_reward_func/std": 0.5136274099349976, "sampling/importance_sampling_ratio/max": 1.0138003826141357, "sampling/importance_sampling_ratio/mean": 1.0010859966278076, "sampling/importance_sampling_ratio/min": 0.9916899800300598, "sampling/sampling_logp_difference/max": 0.013708680868148804, "sampling/sampling_logp_difference/mean": 0.0006789830513298512, "step": 848, "step_time": 8.456566117003604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05961883720010519, "epoch": 0.01698, "frac_reward_zero_std": 0.0, "grad_norm": 0.08678673207759857, "kl": 1.0517778885550797, "learning_rate": 1.4540599716472713e-05, "loss": -0.002, "num_tokens": 37399670.0, "reward": 0.7132236361503601, "reward_std": 0.6608710289001465, "rewards/rollout_reward_func/mean": 0.7132236361503601, "rewards/rollout_reward_func/std": 0.6368502974510193, "sampling/importance_sampling_ratio/max": 1.0103387832641602, "sampling/importance_sampling_ratio/mean": 1.0002052783966064, "sampling/importance_sampling_ratio/min": 0.9895550608634949, "sampling/sampling_logp_difference/max": 0.010523796081542969, "sampling/sampling_logp_difference/mean": 0.0005592514062300324, "step": 849, "step_time": 8.299303259997032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04746263800188899, "epoch": 0.017, "frac_reward_zero_std": 0.25, "grad_norm": 0.02343401499092579, "kl": 0.9515262953937054, "learning_rate": 1.4520071335446839e-05, "loss": 0.0003, "num_tokens": 37437868.0, "reward": 0.7954296469688416, "reward_std": 0.3648753762245178, "rewards/rollout_reward_func/mean": 0.7954296469688416, "rewards/rollout_reward_func/std": 0.43279969692230225, "sampling/importance_sampling_ratio/max": 1.0248712301254272, "sampling/importance_sampling_ratio/mean": 1.0015501976013184, "sampling/importance_sampling_ratio/min": 0.9976759552955627, "sampling/sampling_logp_difference/max": 0.02456599473953247, "sampling/sampling_logp_difference/mean": 0.0004622183623723686, "step": 850, "step_time": 9.130841248999786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 519.1875, "completions/mean_terminated_length": 519.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.044414767529815435, "epoch": 0.01702, "frac_reward_zero_std": 0.0, "grad_norm": 0.023032767698168755, "kl": 0.7914568157866597, "learning_rate": 1.4499936648282328e-05, "loss": 0.0136, "num_tokens": 37485318.0, "reward": 0.7298493385314941, "reward_std": 0.4702496826648712, "rewards/rollout_reward_func/mean": 0.7298493385314941, "rewards/rollout_reward_func/std": 0.47540751099586487, "sampling/importance_sampling_ratio/max": 1.0106147527694702, "sampling/importance_sampling_ratio/mean": 0.9963542819023132, "sampling/importance_sampling_ratio/min": 0.8859620690345764, "sampling/sampling_logp_difference/max": 0.1210784912109375, "sampling/sampling_logp_difference/mean": 0.001492262352257967, "step": 851, "step_time": 8.641859301000295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 483.90625, "completions/mean_terminated_length": 483.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04822094552218914, "epoch": 0.01704, "frac_reward_zero_std": 0.25, "grad_norm": 0.03208164498209953, "kl": 0.6393531672656536, "learning_rate": 1.4480195943440715e-05, "loss": 0.0009, "num_tokens": 37531153.0, "reward": 0.7696272134780884, "reward_std": 0.41092783212661743, "rewards/rollout_reward_func/mean": 0.7696272134780884, "rewards/rollout_reward_func/std": 0.5084302425384521, "sampling/importance_sampling_ratio/max": 1.127915620803833, "sampling/importance_sampling_ratio/mean": 1.0068135261535645, "sampling/importance_sampling_ratio/min": 0.9945821762084961, "sampling/sampling_logp_difference/max": 0.12024378776550293, "sampling/sampling_logp_difference/mean": 0.0016275814268738031, "step": 852, "step_time": 8.053568354001982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 477.25, "completions/mean_terminated_length": 477.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03557906858623028, "epoch": 0.01706, "frac_reward_zero_std": 0.0, "grad_norm": 0.004894325044006109, "kl": 0.6365198735147715, "learning_rate": 1.4460849503739128e-05, "loss": 0.0051, "num_tokens": 37576625.0, "reward": 0.7299555540084839, "reward_std": 0.48673591017723083, "rewards/rollout_reward_func/mean": 0.7299555540084839, "rewards/rollout_reward_func/std": 0.47527748346328735, "sampling/importance_sampling_ratio/max": 1.0037156343460083, "sampling/importance_sampling_ratio/mean": 0.9996873140335083, "sampling/importance_sampling_ratio/min": 0.9933878779411316, "sampling/sampling_logp_difference/max": 0.0057158879935741425, "sampling/sampling_logp_difference/mean": 0.000284406851278618, "step": 853, "step_time": 9.01274890500099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 372.40625, "completions/mean_terminated_length": 372.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.050829325802624226, "epoch": 0.01708, "frac_reward_zero_std": 0.0, "grad_norm": 0.048435986042022705, "kl": 0.8540756963193417, "learning_rate": 1.4441897606346211e-05, "loss": -0.0022, "num_tokens": 37617759.0, "reward": 0.7059335708618164, "reward_std": 0.5970220565795898, "rewards/rollout_reward_func/mean": 0.7059335708618164, "rewards/rollout_reward_func/std": 0.5977337956428528, "sampling/importance_sampling_ratio/max": 1.129751205444336, "sampling/importance_sampling_ratio/mean": 1.0046567916870117, "sampling/importance_sampling_ratio/min": 0.9961043000221252, "sampling/sampling_logp_difference/max": 0.12200260162353516, "sampling/sampling_logp_difference/mean": 0.0012436731485649943, "step": 854, "step_time": 8.366336537997995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 386.46875, "completions/mean_terminated_length": 386.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05360121792182326, "epoch": 0.0171, "frac_reward_zero_std": 0.25, "grad_norm": 0.10262228548526764, "kl": 1.3554922081530094, "learning_rate": 1.4423340522778158e-05, "loss": -0.0013, "num_tokens": 37660689.0, "reward": 0.8361577987670898, "reward_std": 0.3800540566444397, "rewards/rollout_reward_func/mean": 0.8361577987670898, "rewards/rollout_reward_func/std": 0.46125108003616333, "sampling/importance_sampling_ratio/max": 1.0076367855072021, "sampling/importance_sampling_ratio/mean": 0.9961690902709961, "sampling/importance_sampling_ratio/min": 0.8817474842071533, "sampling/sampling_logp_difference/max": 0.1243906021118164, "sampling/sampling_logp_difference/mean": 0.0013284797314554453, "step": 855, "step_time": 8.309945229000732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 464.15625, "completions/mean_terminated_length": 464.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05188705353066325, "epoch": 0.01712, "frac_reward_zero_std": 0.0, "grad_norm": 0.060470789670944214, "kl": 0.9629034213721752, "learning_rate": 1.4405178518894822e-05, "loss": 0.0027, "num_tokens": 37705633.0, "reward": 0.6678467988967896, "reward_std": 0.578435480594635, "rewards/rollout_reward_func/mean": 0.6678467988967896, "rewards/rollout_reward_func/std": 0.5617048740386963, "sampling/importance_sampling_ratio/max": 1.0317461490631104, "sampling/importance_sampling_ratio/mean": 1.0022194385528564, "sampling/importance_sampling_ratio/min": 0.9956632852554321, "sampling/sampling_logp_difference/max": 0.03125715255737305, "sampling/sampling_logp_difference/mean": 0.00070146971847862, "step": 856, "step_time": 8.091616228999555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 477.78125, "completions/mean_terminated_length": 477.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05378439789637923, "epoch": 0.01714, "frac_reward_zero_std": 0.0, "grad_norm": 0.07607148587703705, "kl": 0.7667591236531734, "learning_rate": 1.4387411854895919e-05, "loss": -0.0013, "num_tokens": 37752104.0, "reward": 0.5723659992218018, "reward_std": 0.6483472585678101, "rewards/rollout_reward_func/mean": 0.5723659992218018, "rewards/rollout_reward_func/std": 0.6352910399436951, "sampling/importance_sampling_ratio/max": 1.0116119384765625, "sampling/importance_sampling_ratio/mean": 0.9998437166213989, "sampling/importance_sampling_ratio/min": 0.9889190196990967, "sampling/sampling_logp_difference/max": 0.011545732617378235, "sampling/sampling_logp_difference/mean": 0.0005791471339762211, "step": 857, "step_time": 8.424893065001015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 390.3125, "completions/mean_terminated_length": 390.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07264073332771659, "epoch": 0.01716, "frac_reward_zero_std": 0.0, "grad_norm": 0.1002451628446579, "kl": 1.3604678958654404, "learning_rate": 1.4370040785317274e-05, "loss": 0.0017, "num_tokens": 37794864.0, "reward": 0.5402854681015015, "reward_std": 0.7014409303665161, "rewards/rollout_reward_func/mean": 0.5402854681015015, "rewards/rollout_reward_func/std": 0.6892061233520508, "sampling/importance_sampling_ratio/max": 1.0589537620544434, "sampling/importance_sampling_ratio/mean": 1.0034546852111816, "sampling/importance_sampling_ratio/min": 0.9565439224243164, "sampling/sampling_logp_difference/max": 0.057480454444885254, "sampling/sampling_logp_difference/mean": 0.0017498506931588054, "step": 858, "step_time": 7.943162333000146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 463.96875, "completions/mean_terminated_length": 463.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.052674413193017244, "epoch": 0.01718, "frac_reward_zero_std": 0.0, "grad_norm": 0.1709037870168686, "kl": 0.6439626067876816, "learning_rate": 1.435306555902721e-05, "loss": 0.0011, "num_tokens": 37840973.0, "reward": 0.6127427816390991, "reward_std": 0.628068208694458, "rewards/rollout_reward_func/mean": 0.6127427816390991, "rewards/rollout_reward_func/std": 0.6695498824119568, "sampling/importance_sampling_ratio/max": 1.011268973350525, "sampling/importance_sampling_ratio/mean": 1.000633716583252, "sampling/importance_sampling_ratio/min": 0.9955971837043762, "sampling/sampling_logp_difference/max": 0.010762766003608704, "sampling/sampling_logp_difference/mean": 0.00042666797526180744, "step": 859, "step_time": 8.591728874000182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 431.96875, "completions/mean_terminated_length": 431.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04269623523578048, "epoch": 0.0172, "frac_reward_zero_std": 0.0, "grad_norm": 0.01739727519452572, "kl": 0.997953400015831, "learning_rate": 1.4336486419222937e-05, "loss": -0.001, "num_tokens": 37885987.0, "reward": 0.5965332984924316, "reward_std": 0.6022471189498901, "rewards/rollout_reward_func/mean": 0.5965332984924316, "rewards/rollout_reward_func/std": 0.5874714255332947, "sampling/importance_sampling_ratio/max": 1.0055713653564453, "sampling/importance_sampling_ratio/mean": 1.000736951828003, "sampling/importance_sampling_ratio/min": 0.9971662759780884, "sampling/sampling_logp_difference/max": 0.005589380860328674, "sampling/sampling_logp_difference/mean": 0.00031864922493696213, "step": 860, "step_time": 9.015058048007631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 358.1875, "completions/mean_terminated_length": 358.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07121313968673348, "epoch": 0.01722, "frac_reward_zero_std": 0.0, "grad_norm": 0.22664706408977509, "kl": 1.4202239215373993, "learning_rate": 1.4320303603427121e-05, "loss": -0.0003, "num_tokens": 37927398.0, "reward": 0.7053022384643555, "reward_std": 0.6163532733917236, "rewards/rollout_reward_func/mean": 0.7053022384643555, "rewards/rollout_reward_func/std": 0.5973548293113708, "sampling/importance_sampling_ratio/max": 1.1283069849014282, "sampling/importance_sampling_ratio/mean": 0.9941366910934448, "sampling/importance_sampling_ratio/min": 0.7149350047111511, "sampling/sampling_logp_difference/max": 0.4162464141845703, "sampling/sampling_logp_difference/mean": 0.006684551481157541, "step": 861, "step_time": 8.395191101002638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04780286364257336, "epoch": 0.01724, "frac_reward_zero_std": 0.0, "grad_norm": 0.010369968600571156, "kl": 0.8003004379570484, "learning_rate": 1.4304517343484437e-05, "loss": 0.0028, "num_tokens": 37966312.0, "reward": 0.6394479274749756, "reward_std": 0.5895068645477295, "rewards/rollout_reward_func/mean": 0.6394479274749756, "rewards/rollout_reward_func/std": 0.6181334853172302, "sampling/importance_sampling_ratio/max": 1.0184519290924072, "sampling/importance_sampling_ratio/mean": 0.9991394877433777, "sampling/importance_sampling_ratio/min": 0.9400967359542847, "sampling/sampling_logp_difference/max": 0.06169700622558594, "sampling/sampling_logp_difference/mean": 0.000982558587566018, "step": 862, "step_time": 8.777638830999422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 466.90625, "completions/mean_terminated_length": 466.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05608996469527483, "epoch": 0.01726, "frac_reward_zero_std": 0.0, "grad_norm": 0.12758006155490875, "kl": 0.7326737502589822, "learning_rate": 1.4289127865558263e-05, "loss": 0.0085, "num_tokens": 38011781.0, "reward": 0.7021796703338623, "reward_std": 0.5458821654319763, "rewards/rollout_reward_func/mean": 0.7021796703338623, "rewards/rollout_reward_func/std": 0.5467676520347595, "sampling/importance_sampling_ratio/max": 1.0086917877197266, "sampling/importance_sampling_ratio/mean": 1.0006980895996094, "sampling/importance_sampling_ratio/min": 0.9910494685173035, "sampling/sampling_logp_difference/max": 0.00900876522064209, "sampling/sampling_logp_difference/mean": 0.0004998704534955323, "step": 863, "step_time": 8.649906713999371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 436.59375, "completions/mean_terminated_length": 436.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04957310878671706, "epoch": 0.01728, "frac_reward_zero_std": 0.25, "grad_norm": 0.004130879417061806, "kl": 0.8295206129550934, "learning_rate": 1.4274135390127455e-05, "loss": 0.0105, "num_tokens": 38055149.0, "reward": 0.7953484058380127, "reward_std": 0.3637661933898926, "rewards/rollout_reward_func/mean": 0.7953484058380127, "rewards/rollout_reward_func/std": 0.43290016055107117, "sampling/importance_sampling_ratio/max": 1.0043760538101196, "sampling/importance_sampling_ratio/mean": 0.998490035533905, "sampling/importance_sampling_ratio/min": 0.964232325553894, "sampling/sampling_logp_difference/max": 0.036591410636901855, "sampling/sampling_logp_difference/mean": 0.0005253720446489751, "step": 864, "step_time": 9.092562814998018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 326.71875, "completions/mean_terminated_length": 326.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.045801938977092505, "epoch": 0.0173, "frac_reward_zero_std": 0.25, "grad_norm": 0.0963851809501648, "kl": 1.0578555930405855, "learning_rate": 1.4259540131983157e-05, "loss": 0.008, "num_tokens": 38093775.0, "reward": 0.8381471633911133, "reward_std": 0.37407630681991577, "rewards/rollout_reward_func/mean": 0.8381471633911133, "rewards/rollout_reward_func/std": 0.45902106165885925, "sampling/importance_sampling_ratio/max": 1.0203030109405518, "sampling/importance_sampling_ratio/mean": 1.001282811164856, "sampling/importance_sampling_ratio/min": 0.9965405464172363, "sampling/sampling_logp_difference/max": 0.027802377939224243, "sampling/sampling_logp_difference/mean": 0.0005983954179100692, "step": 865, "step_time": 8.280549752000297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 509.84375, "completions/mean_terminated_length": 509.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.057668506633490324, "epoch": 0.01732, "frac_reward_zero_std": 0.25, "grad_norm": 0.03941993787884712, "kl": 1.1039548479020596, "learning_rate": 1.4245342300225755e-05, "loss": 0.0067, "num_tokens": 38140865.0, "reward": 0.728663444519043, "reward_std": 0.3964707851409912, "rewards/rollout_reward_func/mean": 0.728663444519043, "rewards/rollout_reward_func/std": 0.4775371551513672, "sampling/importance_sampling_ratio/max": 1.0725431442260742, "sampling/importance_sampling_ratio/mean": 0.9992589354515076, "sampling/importance_sampling_ratio/min": 0.8815878033638, "sampling/sampling_logp_difference/max": 0.12604999542236328, "sampling/sampling_logp_difference/mean": 0.001772777526639402, "step": 866, "step_time": 8.794000227000652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 506.53125, "completions/mean_terminated_length": 506.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04377488512545824, "epoch": 0.01734, "frac_reward_zero_std": 0.0, "grad_norm": 0.03734529763460159, "kl": 1.2247474193572998, "learning_rate": 1.4231542098261859e-05, "loss": 0.0098, "num_tokens": 38187706.0, "reward": 0.5991978645324707, "reward_std": 0.540894627571106, "rewards/rollout_reward_func/mean": 0.5991978645324707, "rewards/rollout_reward_func/std": 0.5842189788818359, "sampling/importance_sampling_ratio/max": 1.1292539834976196, "sampling/importance_sampling_ratio/mean": 1.0035450458526611, "sampling/importance_sampling_ratio/min": 0.9838958978652954, "sampling/sampling_logp_difference/max": 0.12154364585876465, "sampling/sampling_logp_difference/mean": 0.0014823619276285172, "step": 867, "step_time": 9.419915888000105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 493.25, "completions/mean_terminated_length": 493.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04453157912939787, "epoch": 0.01736, "frac_reward_zero_std": 0.0, "grad_norm": 0.11389218270778656, "kl": 1.72265150770545, "learning_rate": 1.4218139723801399e-05, "loss": 0.0087, "num_tokens": 38233873.0, "reward": 0.7965935468673706, "reward_std": 0.4269757866859436, "rewards/rollout_reward_func/mean": 0.7965935468673706, "rewards/rollout_reward_func/std": 0.430256724357605, "sampling/importance_sampling_ratio/max": 1.042456865310669, "sampling/importance_sampling_ratio/mean": 1.0019314289093018, "sampling/importance_sampling_ratio/min": 0.9959359169006348, "sampling/sampling_logp_difference/max": 0.04157465696334839, "sampling/sampling_logp_difference/mean": 0.0006809817859902978, "step": 868, "step_time": 9.121280936999028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 352.84375, "completions/mean_terminated_length": 352.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04315443360246718, "epoch": 0.01738, "frac_reward_zero_std": 0.5, "grad_norm": 0.0029913848266005516, "kl": 0.7455997280776501, "learning_rate": 1.4205135368854796e-05, "loss": 0.0069, "num_tokens": 38273860.0, "reward": 0.9325488805770874, "reward_std": 0.19078059494495392, "rewards/rollout_reward_func/mean": 0.9325488805770874, "rewards/rollout_reward_func/std": 0.265417218208313, "sampling/importance_sampling_ratio/max": 1.0153855085372925, "sampling/importance_sampling_ratio/mean": 1.0009171962738037, "sampling/importance_sampling_ratio/min": 0.9928794503211975, "sampling/sampling_logp_difference/max": 0.015031710267066956, "sampling/sampling_logp_difference/mean": 0.0004331322852522135, "step": 869, "step_time": 9.30818019400067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 389.625, "completions/mean_terminated_length": 389.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.053115498973056674, "epoch": 0.0174, "frac_reward_zero_std": 0.0, "grad_norm": 0.015568199567496777, "kl": 1.0334639996290207, "learning_rate": 1.4192529219730199e-05, "loss": 0.0091, "num_tokens": 38316117.0, "reward": 0.6061040759086609, "reward_std": 0.6355239152908325, "rewards/rollout_reward_func/mean": 0.6061040759086609, "rewards/rollout_reward_func/std": 0.6269366145133972, "sampling/importance_sampling_ratio/max": 1.0052465200424194, "sampling/importance_sampling_ratio/mean": 0.9991503953933716, "sampling/importance_sampling_ratio/min": 0.9846031069755554, "sampling/sampling_logp_difference/max": 0.015525974333286285, "sampling/sampling_logp_difference/mean": 0.0006270684534683824, "step": 870, "step_time": 8.891044950998548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 539.4375, "completions/mean_terminated_length": 539.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.047173695638775826, "epoch": 0.01742, "frac_reward_zero_std": 0.0, "grad_norm": 0.017554491758346558, "kl": 0.7927888818085194, "learning_rate": 1.418032145703082e-05, "loss": 0.0078, "num_tokens": 38364625.0, "reward": 0.7977946996688843, "reward_std": 0.42535388469696045, "rewards/rollout_reward_func/mean": 0.7977946996688843, "rewards/rollout_reward_func/std": 0.4276610016822815, "sampling/importance_sampling_ratio/max": 1.0083258152008057, "sampling/importance_sampling_ratio/mean": 1.001054048538208, "sampling/importance_sampling_ratio/min": 0.9945226907730103, "sampling/sampling_logp_difference/max": 0.00865192711353302, "sampling/sampling_logp_difference/mean": 0.0004962208331562579, "step": 871, "step_time": 8.740224372000739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 383.4375, "completions/mean_terminated_length": 383.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06125869043171406, "epoch": 0.01744, "frac_reward_zero_std": 0.25, "grad_norm": 0.05154076963663101, "kl": 0.9548568492755294, "learning_rate": 1.4168512255652366e-05, "loss": 0.0014, "num_tokens": 38405645.0, "reward": 0.7396450638771057, "reward_std": 0.4948764443397522, "rewards/rollout_reward_func/mean": 0.7396450638771057, "rewards/rollout_reward_func/std": 0.5824181437492371, "sampling/importance_sampling_ratio/max": 1.0412713289260864, "sampling/importance_sampling_ratio/mean": 1.001404047012329, "sampling/importance_sampling_ratio/min": 0.9896023869514465, "sampling/sampling_logp_difference/max": 0.039426565170288086, "sampling/sampling_logp_difference/mean": 0.0008479218231514096, "step": 872, "step_time": 8.904084310999679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 436.75, "completions/mean_terminated_length": 436.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05341095803305507, "epoch": 0.01746, "frac_reward_zero_std": 0.0, "grad_norm": 0.00995608326047659, "kl": 0.7657488100230694, "learning_rate": 1.4157101784780496e-05, "loss": 0.0114, "num_tokens": 38448656.0, "reward": 0.7977458238601685, "reward_std": 0.4401950240135193, "rewards/rollout_reward_func/mean": 0.7977458238601685, "rewards/rollout_reward_func/std": 0.4277644455432892, "sampling/importance_sampling_ratio/max": 1.0190205574035645, "sampling/importance_sampling_ratio/mean": 1.00144624710083, "sampling/importance_sampling_ratio/min": 0.9943660497665405, "sampling/sampling_logp_difference/max": 0.018556013703346252, "sampling/sampling_logp_difference/mean": 0.0006224602111615241, "step": 873, "step_time": 8.442453362997185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 400.96875, "completions/mean_terminated_length": 400.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05853135650977492, "epoch": 0.01748, "frac_reward_zero_std": 0.0, "grad_norm": 0.04335436224937439, "kl": 0.783370852470398, "learning_rate": 1.414609020788843e-05, "loss": 0.0099, "num_tokens": 38491361.0, "reward": 0.6662112474441528, "reward_std": 0.5367963910102844, "rewards/rollout_reward_func/mean": 0.6662112474441528, "rewards/rollout_reward_func/std": 0.5638641715049744, "sampling/importance_sampling_ratio/max": 1.1585272550582886, "sampling/importance_sampling_ratio/mean": 1.0055632591247559, "sampling/importance_sampling_ratio/min": 0.9956498146057129, "sampling/sampling_logp_difference/max": 0.14712190628051758, "sampling/sampling_logp_difference/mean": 0.0015239513013511896, "step": 874, "step_time": 8.851496294000754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 437.28125, "completions/mean_terminated_length": 437.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07005872647278011, "epoch": 0.0175, "frac_reward_zero_std": 0.0, "grad_norm": 0.06921201199293137, "kl": 0.631082154577598, "learning_rate": 1.413547768273459e-05, "loss": -0.0058, "num_tokens": 38535790.0, "reward": 0.6444807052612305, "reward_std": 0.6555719375610352, "rewards/rollout_reward_func/mean": 0.6444807052612305, "rewards/rollout_reward_func/std": 0.6650317311286926, "sampling/importance_sampling_ratio/max": 1.1268599033355713, "sampling/importance_sampling_ratio/mean": 1.0051100254058838, "sampling/importance_sampling_ratio/min": 0.9920827746391296, "sampling/sampling_logp_difference/max": 0.1194298267364502, "sampling/sampling_logp_difference/mean": 0.0018406598828732967, "step": 875, "step_time": 8.2792445699979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 348.5, "completions/mean_terminated_length": 348.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05907839699648321, "epoch": 0.01752, "frac_reward_zero_std": 0.5, "grad_norm": 0.019702445715665817, "kl": 1.0665649697184563, "learning_rate": 1.412526436136035e-05, "loss": 0.006, "num_tokens": 38575453.0, "reward": 0.864093542098999, "reward_std": 0.23818941414356232, "rewards/rollout_reward_func/mean": 0.864093542098999, "rewards/rollout_reward_func/std": 0.3653869926929474, "sampling/importance_sampling_ratio/max": 1.1282665729522705, "sampling/importance_sampling_ratio/mean": 1.000737190246582, "sampling/importance_sampling_ratio/min": 0.8462574481964111, "sampling/sampling_logp_difference/max": 0.17613983154296875, "sampling/sampling_logp_difference/mean": 0.0030816574580967426, "step": 876, "step_time": 8.458799142003045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 462.40625, "completions/mean_terminated_length": 462.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04513375577516854, "epoch": 0.01754, "frac_reward_zero_std": 0.0, "grad_norm": 0.01162679959088564, "kl": 0.8163580186665058, "learning_rate": 1.4115450390087841e-05, "loss": 0.0059, "num_tokens": 38620197.0, "reward": 0.7368367314338684, "reward_std": 0.5163937211036682, "rewards/rollout_reward_func/mean": 0.7368367314338684, "rewards/rollout_reward_func/std": 0.528084933757782, "sampling/importance_sampling_ratio/max": 1.1265822649002075, "sampling/importance_sampling_ratio/mean": 1.0032916069030762, "sampling/importance_sampling_ratio/min": 0.9936600923538208, "sampling/sampling_logp_difference/max": 0.11918330192565918, "sampling/sampling_logp_difference/mean": 0.001344715477898717, "step": 877, "step_time": 8.156169655998383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 353.1875, "completions/mean_terminated_length": 353.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05992277292534709, "epoch": 0.01756, "frac_reward_zero_std": 0.0, "grad_norm": 0.024118073284626007, "kl": 0.5630205329507589, "learning_rate": 1.4106035909517883e-05, "loss": -0.005, "num_tokens": 38660564.0, "reward": 0.6411817073822021, "reward_std": 0.6259673833847046, "rewards/rollout_reward_func/mean": 0.6411817073822021, "rewards/rollout_reward_func/std": 0.6162779927253723, "sampling/importance_sampling_ratio/max": 1.0064905881881714, "sampling/importance_sampling_ratio/mean": 0.9921838045120239, "sampling/importance_sampling_ratio/min": 0.8865051865577698, "sampling/sampling_logp_difference/max": 0.12047624588012695, "sampling/sampling_logp_difference/mean": 0.002522840863093734, "step": 878, "step_time": 8.05568803800088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 328.4375, "completions/mean_terminated_length": 328.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.07164469524286687, "epoch": 0.01758, "frac_reward_zero_std": 0.0, "grad_norm": 0.08554575592279434, "kl": 0.7201458141207695, "learning_rate": 1.4097021054527939e-05, "loss": -0.0046, "num_tokens": 38701339.0, "reward": 0.7118511199951172, "reward_std": 0.6097300052642822, "rewards/rollout_reward_func/mean": 0.7118511199951172, "rewards/rollout_reward_func/std": 0.6409900188446045, "sampling/importance_sampling_ratio/max": 1.0137935876846313, "sampling/importance_sampling_ratio/mean": 0.9993321895599365, "sampling/importance_sampling_ratio/min": 0.9554333686828613, "sampling/sampling_logp_difference/max": 0.046224355697631836, "sampling/sampling_logp_difference/mean": 0.0008179308497346938, "step": 879, "step_time": 8.304176757999812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 356.03125, "completions/mean_terminated_length": 356.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06754741305485368, "epoch": 0.0176, "frac_reward_zero_std": 0.0, "grad_norm": 0.04121682792901993, "kl": 0.857087817043066, "learning_rate": 1.4088405954270204e-05, "loss": 0.0031, "num_tokens": 38742188.0, "reward": 0.6724567413330078, "reward_std": 0.6295918226242065, "rewards/rollout_reward_func/mean": 0.6724567413330078, "rewards/rollout_reward_func/std": 0.6108361482620239, "sampling/importance_sampling_ratio/max": 1.200297474861145, "sampling/importance_sampling_ratio/mean": 1.00481379032135, "sampling/importance_sampling_ratio/min": 0.9366893768310547, "sampling/sampling_logp_difference/max": 0.18110370635986328, "sampling/sampling_logp_difference/mean": 0.0024272282607853413, "step": 880, "step_time": 7.889802684001552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 437.25, "completions/mean_terminated_length": 437.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05715452926233411, "epoch": 0.01762, "frac_reward_zero_std": 0.25, "grad_norm": 0.009180731140077114, "kl": 0.9858060777187347, "learning_rate": 1.4080190732169742e-05, "loss": 0.0047, "num_tokens": 38786196.0, "reward": 0.8001861572265625, "reward_std": 0.3300366997718811, "rewards/rollout_reward_func/mean": 0.8001861572265625, "rewards/rollout_reward_func/std": 0.42261871695518494, "sampling/importance_sampling_ratio/max": 1.0084702968597412, "sampling/importance_sampling_ratio/mean": 0.9928680062294006, "sampling/importance_sampling_ratio/min": 0.8846774101257324, "sampling/sampling_logp_difference/max": 0.12253618240356445, "sampling/sampling_logp_difference/mean": 0.0023839271161705256, "step": 881, "step_time": 8.6647245189979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 438.9375, "completions/mean_terminated_length": 438.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04888146393932402, "epoch": 0.01764, "frac_reward_zero_std": 0.25, "grad_norm": 0.006905862595885992, "kl": 0.9819873608648777, "learning_rate": 1.4072375505922738e-05, "loss": 0.0101, "num_tokens": 38829432.0, "reward": 0.6967222690582275, "reward_std": 0.3905297815799713, "rewards/rollout_reward_func/mean": 0.6967222690582275, "rewards/rollout_reward_func/std": 0.4927016794681549, "sampling/importance_sampling_ratio/max": 1.0081088542938232, "sampling/importance_sampling_ratio/mean": 1.0014756917953491, "sampling/importance_sampling_ratio/min": 0.9967203736305237, "sampling/sampling_logp_difference/max": 0.00747324526309967, "sampling/sampling_logp_difference/mean": 0.0004333067627158016, "step": 882, "step_time": 7.950120964995222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 489.84375, "completions/mean_terminated_length": 489.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08078007586300373, "epoch": 0.01766, "frac_reward_zero_std": 0.0, "grad_norm": 0.08353795111179352, "kl": 0.9566560387611389, "learning_rate": 1.4064960387494779e-05, "loss": -0.0, "num_tokens": 38875974.0, "reward": 0.6743547916412354, "reward_std": 0.6090975999832153, "rewards/rollout_reward_func/mean": 0.6743547916412354, "rewards/rollout_reward_func/std": 0.6054989695549011, "sampling/importance_sampling_ratio/max": 1.0562876462936401, "sampling/importance_sampling_ratio/mean": 1.002262830734253, "sampling/importance_sampling_ratio/min": 0.9765706062316895, "sampling/sampling_logp_difference/max": 0.05314016342163086, "sampling/sampling_logp_difference/mean": 0.0012920633889734745, "step": 883, "step_time": 8.094620166997629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 419.96875, "completions/mean_terminated_length": 419.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0529099078848958, "epoch": 0.01768, "frac_reward_zero_std": 0.25, "grad_norm": 0.044769033789634705, "kl": 0.5199345145374537, "learning_rate": 1.4057945483119285e-05, "loss": 0.003, "num_tokens": 38919073.0, "reward": 0.7357449531555176, "reward_std": 0.4090166985988617, "rewards/rollout_reward_func/mean": 0.7357449531555176, "rewards/rollout_reward_func/std": 0.5283204317092896, "sampling/importance_sampling_ratio/max": 1.007091760635376, "sampling/importance_sampling_ratio/mean": 1.000322937965393, "sampling/importance_sampling_ratio/min": 0.9758345484733582, "sampling/sampling_logp_difference/max": 0.024822622537612915, "sampling/sampling_logp_difference/mean": 0.0007177510997280478, "step": 884, "step_time": 8.572596061996592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 457.6875, "completions/mean_terminated_length": 457.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07068718643859029, "epoch": 0.0177, "frac_reward_zero_std": 0.0, "grad_norm": 0.21792060136795044, "kl": 0.8493473008275032, "learning_rate": 1.4051330893295958e-05, "loss": 0.0023, "num_tokens": 38964669.0, "reward": 0.6810673475265503, "reward_std": 0.6519420146942139, "rewards/rollout_reward_func/mean": 0.6810673475265503, "rewards/rollout_reward_func/std": 0.6479934453964233, "sampling/importance_sampling_ratio/max": 1.136773705482483, "sampling/importance_sampling_ratio/mean": 1.0076251029968262, "sampling/importance_sampling_ratio/min": 0.9849030375480652, "sampling/sampling_logp_difference/max": 0.12481880187988281, "sampling/sampling_logp_difference/mean": 0.002347296569496393, "step": 885, "step_time": 8.166087143999903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 406.15625, "completions/mean_terminated_length": 406.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05841958476230502, "epoch": 0.01772, "frac_reward_zero_std": 0.0, "grad_norm": 0.0899123027920723, "kl": 0.9592459797859192, "learning_rate": 1.4045116712789364e-05, "loss": 0.0088, "num_tokens": 39007441.0, "reward": 0.5734579563140869, "reward_std": 0.5973485708236694, "rewards/rollout_reward_func/mean": 0.5734579563140869, "rewards/rollout_reward_func/std": 0.6337785720825195, "sampling/importance_sampling_ratio/max": 1.0062839984893799, "sampling/importance_sampling_ratio/mean": 0.9985285997390747, "sampling/importance_sampling_ratio/min": 0.9699920415878296, "sampling/sampling_logp_difference/max": 0.03048604726791382, "sampling/sampling_logp_difference/mean": 0.0006993359420448542, "step": 886, "step_time": 8.61325671099803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 349.0, "completions/mean_terminated_length": 349.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04854583041742444, "epoch": 0.01774, "frac_reward_zero_std": 0.25, "grad_norm": 0.030056731775403023, "kl": 0.7724031880497932, "learning_rate": 1.4039303030627564e-05, "loss": -0.0016, "num_tokens": 39047297.0, "reward": 0.7420517206192017, "reward_std": 0.4721496105194092, "rewards/rollout_reward_func/mean": 0.7420517206192017, "rewards/rollout_reward_func/std": 0.5759487748146057, "sampling/importance_sampling_ratio/max": 1.0065734386444092, "sampling/importance_sampling_ratio/mean": 0.9936729669570923, "sampling/importance_sampling_ratio/min": 0.7950231432914734, "sampling/sampling_logp_difference/max": 0.22939085960388184, "sampling/sampling_logp_difference/mean": 0.0022405250929296017, "step": 887, "step_time": 7.940222204002566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 317.1875, "completions/mean_terminated_length": 317.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04435697942972183, "epoch": 0.01776, "frac_reward_zero_std": 0.5, "grad_norm": 0.008729971945285797, "kl": 0.9550469596870244, "learning_rate": 1.4033889930100831e-05, "loss": 0.0064, "num_tokens": 39085812.0, "reward": 0.9334942698478699, "reward_std": 0.1881066858768463, "rewards/rollout_reward_func/mean": 0.9334942698478699, "rewards/rollout_reward_func/std": 0.2617051303386688, "sampling/importance_sampling_ratio/max": 1.0089313983917236, "sampling/importance_sampling_ratio/mean": 1.0003437995910645, "sampling/importance_sampling_ratio/min": 0.9936517477035522, "sampling/sampling_logp_difference/max": 0.008899062871932983, "sampling/sampling_logp_difference/mean": 0.00032432819716632366, "step": 888, "step_time": 8.24901095399946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 499.09375, "completions/mean_terminated_length": 499.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05434965342283249, "epoch": 0.01778, "frac_reward_zero_std": 0.5, "grad_norm": 0.0495731346309185, "kl": 0.7063534818589687, "learning_rate": 1.402887748876048e-05, "loss": 0.0034, "num_tokens": 39132549.0, "reward": 0.770706057548523, "reward_std": 0.32853272557258606, "rewards/rollout_reward_func/mean": 0.770706057548523, "rewards/rollout_reward_func/std": 0.5050870776176453, "sampling/importance_sampling_ratio/max": 1.0105730295181274, "sampling/importance_sampling_ratio/mean": 1.001185417175293, "sampling/importance_sampling_ratio/min": 0.9977434873580933, "sampling/sampling_logp_difference/max": 0.009489655494689941, "sampling/sampling_logp_difference/mean": 0.0003502531617414206, "step": 889, "step_time": 9.41422362999765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 462.6875, "completions/mean_terminated_length": 462.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05285537405870855, "epoch": 0.0178, "frac_reward_zero_std": 0.0, "grad_norm": 0.04630395770072937, "kl": 0.7794124074280262, "learning_rate": 1.4024265778417739e-05, "loss": -0.0008, "num_tokens": 39178705.0, "reward": 0.6986827254295349, "reward_std": 0.5105533599853516, "rewards/rollout_reward_func/mean": 0.6986827254295349, "rewards/rollout_reward_func/std": 0.5518156290054321, "sampling/importance_sampling_ratio/max": 1.1293827295303345, "sampling/importance_sampling_ratio/mean": 1.005160927772522, "sampling/importance_sampling_ratio/min": 0.9932911992073059, "sampling/sampling_logp_difference/max": 0.11982512474060059, "sampling/sampling_logp_difference/mean": 0.0013534517493098974, "step": 890, "step_time": 9.22683045499798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 469.96875, "completions/mean_terminated_length": 469.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.060734499245882034, "epoch": 0.01782, "frac_reward_zero_std": 0.0, "grad_norm": 0.07666626572608948, "kl": 0.8004449536092579, "learning_rate": 1.4020054865142719e-05, "loss": 0.001, "num_tokens": 39225121.0, "reward": 0.67411208152771, "reward_std": 0.5999634265899658, "rewards/rollout_reward_func/mean": 0.67411208152771, "rewards/rollout_reward_func/std": 0.6086531281471252, "sampling/importance_sampling_ratio/max": 1.031135082244873, "sampling/importance_sampling_ratio/mean": 1.001904010772705, "sampling/importance_sampling_ratio/min": 0.9917395114898682, "sampling/sampling_logp_difference/max": 0.022764921188354492, "sampling/sampling_logp_difference/mean": 0.0006018067942932248, "step": 891, "step_time": 8.827996341999096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 437.71875, "completions/mean_terminated_length": 437.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05885353940539062, "epoch": 0.01784, "frac_reward_zero_std": 0.25, "grad_norm": 0.025809116661548615, "kl": 0.6344181224703789, "learning_rate": 1.401624480926348e-05, "loss": -0.0027, "num_tokens": 39270094.0, "reward": 0.7029135227203369, "reward_std": 0.4232480227947235, "rewards/rollout_reward_func/mean": 0.7029135227203369, "rewards/rollout_reward_func/std": 0.5438087582588196, "sampling/importance_sampling_ratio/max": 1.0096378326416016, "sampling/importance_sampling_ratio/mean": 1.000746726989746, "sampling/importance_sampling_ratio/min": 0.9925177097320557, "sampling/sampling_logp_difference/max": 0.007670603692531586, "sampling/sampling_logp_difference/mean": 0.00047006126260384917, "step": 892, "step_time": 8.14710185000149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 445.1875, "completions/mean_terminated_length": 445.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04432955174706876, "epoch": 0.01786, "frac_reward_zero_std": 0.0, "grad_norm": 0.013110603205859661, "kl": 0.7590419389307499, "learning_rate": 1.4012835665365156e-05, "loss": 0.0112, "num_tokens": 39313696.0, "reward": 0.7639631032943726, "reward_std": 0.455127090215683, "rewards/rollout_reward_func/mean": 0.7639631032943726, "rewards/rollout_reward_func/std": 0.4532117247581482, "sampling/importance_sampling_ratio/max": 1.0048694610595703, "sampling/importance_sampling_ratio/mean": 1.000267505645752, "sampling/importance_sampling_ratio/min": 0.9953427314758301, "sampling/sampling_logp_difference/max": 0.0072564370930194855, "sampling/sampling_logp_difference/mean": 0.0003885990590788424, "step": 893, "step_time": 8.188494796002487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 525.3125, "completions/mean_terminated_length": 525.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05240798369050026, "epoch": 0.01788, "frac_reward_zero_std": 0.0, "grad_norm": 0.013714971952140331, "kl": 0.6452549025416374, "learning_rate": 1.4009827482289181e-05, "loss": 0.0101, "num_tokens": 39361392.0, "reward": 0.630859375, "reward_std": 0.530907392501831, "rewards/rollout_reward_func/mean": 0.630859375, "rewards/rollout_reward_func/std": 0.5183380246162415, "sampling/importance_sampling_ratio/max": 1.0166254043579102, "sampling/importance_sampling_ratio/mean": 0.9968096017837524, "sampling/importance_sampling_ratio/min": 0.8888164758682251, "sampling/sampling_logp_difference/max": 0.11787128448486328, "sampling/sampling_logp_difference/mean": 0.0011687726946547627, "step": 894, "step_time": 8.890555424000922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 433.75, "completions/mean_terminated_length": 433.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05964024574495852, "epoch": 0.0179, "frac_reward_zero_std": 0.5, "grad_norm": 0.077114537358284, "kl": 0.5904778242111206, "learning_rate": 1.4007220303132573e-05, "loss": 0.0022, "num_tokens": 39404575.0, "reward": 0.8362500071525574, "reward_std": 0.31350910663604736, "rewards/rollout_reward_func/mean": 0.8362500071525574, "rewards/rollout_reward_func/std": 0.46285054087638855, "sampling/importance_sampling_ratio/max": 1.0171329975128174, "sampling/importance_sampling_ratio/mean": 1.000476598739624, "sampling/importance_sampling_ratio/min": 0.9928604960441589, "sampling/sampling_logp_difference/max": 0.01699650287628174, "sampling/sampling_logp_difference/mean": 0.0006027717608958483, "step": 895, "step_time": 8.69760099299856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 421.46875, "completions/mean_terminated_length": 421.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06155513180419803, "epoch": 0.01792, "frac_reward_zero_std": 0.0, "grad_norm": 0.01932578720152378, "kl": 0.6195799484848976, "learning_rate": 1.4005014165247341e-05, "loss": 0.0061, "num_tokens": 39447275.0, "reward": 0.7354681491851807, "reward_std": 0.5352328419685364, "rewards/rollout_reward_func/mean": 0.7354681491851807, "rewards/rollout_reward_func/std": 0.5305950045585632, "sampling/importance_sampling_ratio/max": 1.0327603816986084, "sampling/importance_sampling_ratio/mean": 1.002079725265503, "sampling/importance_sampling_ratio/min": 0.9927739500999451, "sampling/sampling_logp_difference/max": 0.018377304077148438, "sampling/sampling_logp_difference/mean": 0.0007117460481822491, "step": 896, "step_time": 7.945919504001722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 436.78125, "completions/mean_terminated_length": 436.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05850007780827582, "epoch": 0.01794, "frac_reward_zero_std": 0.0, "grad_norm": 0.0756327211856842, "kl": 0.9146106615662575, "learning_rate": 1.4003209100239935e-05, "loss": 0.0055, "num_tokens": 39491126.0, "reward": 0.571993350982666, "reward_std": 0.621275782585144, "rewards/rollout_reward_func/mean": 0.571993350982666, "rewards/rollout_reward_func/std": 0.636709988117218, "sampling/importance_sampling_ratio/max": 1.0928153991699219, "sampling/importance_sampling_ratio/mean": 1.0030145645141602, "sampling/importance_sampling_ratio/min": 0.9933221340179443, "sampling/sampling_logp_difference/max": 0.10124647617340088, "sampling/sampling_logp_difference/mean": 0.0012909932993352413, "step": 897, "step_time": 8.250444180002887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 385.09375, "completions/mean_terminated_length": 385.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05946746189147234, "epoch": 0.01796, "frac_reward_zero_std": 0.25, "grad_norm": 0.07538504898548126, "kl": 1.2071186676621437, "learning_rate": 1.4001805133970797e-05, "loss": 0.0068, "num_tokens": 39533829.0, "reward": 0.7691256999969482, "reward_std": 0.4125182032585144, "rewards/rollout_reward_func/mean": 0.7691256999969482, "rewards/rollout_reward_func/std": 0.5089782476425171, "sampling/importance_sampling_ratio/max": 1.126849889755249, "sampling/importance_sampling_ratio/mean": 1.0006890296936035, "sampling/importance_sampling_ratio/min": 0.8959082365036011, "sampling/sampling_logp_difference/max": 0.11942601203918457, "sampling/sampling_logp_difference/mean": 0.0022592602763324976, "step": 898, "step_time": 7.793057595996288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 347.71875, "completions/mean_terminated_length": 347.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0710995018016547, "epoch": 0.01798, "frac_reward_zero_std": 0.0, "grad_norm": 0.11539111286401749, "kl": 0.6633780971169472, "learning_rate": 1.400080228655398e-05, "loss": 0.0016, "num_tokens": 39573856.0, "reward": 0.6454638838768005, "reward_std": 0.6534808874130249, "rewards/rollout_reward_func/mean": 0.6454638838768005, "rewards/rollout_reward_func/std": 0.6619250178337097, "sampling/importance_sampling_ratio/max": 1.0325207710266113, "sampling/importance_sampling_ratio/mean": 0.9987700581550598, "sampling/importance_sampling_ratio/min": 0.9534859657287598, "sampling/sampling_logp_difference/max": 0.045864999294281006, "sampling/sampling_logp_difference/mean": 0.0011679312447085977, "step": 899, "step_time": 8.412439352998263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 401.65625, "completions/mean_terminated_length": 401.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05385610228404403, "epoch": 0.018, "frac_reward_zero_std": 0.25, "grad_norm": 0.0417311005294323, "kl": 0.8382021114230156, "learning_rate": 1.4000200572356875e-05, "loss": 0.0042, "num_tokens": 39615777.0, "reward": 0.8375790119171143, "reward_std": 0.37647148966789246, "rewards/rollout_reward_func/mean": 0.8375790119171143, "rewards/rollout_reward_func/std": 0.4582556486129761, "sampling/importance_sampling_ratio/max": 1.0075678825378418, "sampling/importance_sampling_ratio/mean": 0.9997190237045288, "sampling/importance_sampling_ratio/min": 0.9926024079322815, "sampling/sampling_logp_difference/max": 0.007526792585849762, "sampling/sampling_logp_difference/mean": 0.00044625578448176384, "step": 900, "step_time": 8.502976303998366 } ], "logging_steps": 1.0, "max_steps": 900, "num_input_tokens_seen": 39615777, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }