{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0105, "eval_steps": 500, "global_step": 2100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.125, "completions/mean_terminated_length": 6.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.618839144706726, "epoch": 5e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.08699294179677963, "kl": 0.0, "learning_rate": 0.0, "loss": -0.007, "num_tokens": 25554.0, "reward": -0.48725366592407227, "reward_std": 0.6346457600593567, "rewards/rollout_reward_func/mean": -0.48725366592407227, "rewards/rollout_reward_func/std": 0.6346458196640015, "sampling/importance_sampling_ratio/max": 0.4495532512664795, "sampling/importance_sampling_ratio/mean": 0.12279324233531952, "sampling/importance_sampling_ratio/min": 9.408353207618347e-08, "sampling/sampling_logp_difference/max": 2.361050605773926, "sampling/sampling_logp_difference/mean": 0.5148419141769409, "step": 1, "step_time": 12.140594442986185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.618839144706726, "epoch": 1e-05, "grad_norm": 0.08357227593660355, "kl": 0.0, "learning_rate": 2.2857142857142855e-07, "loss": -0.007, "step": 2, "step_time": 5.551788709999528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.8125, "completions/mean_terminated_length": 8.714285850524902, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 3.7492396235466003, "epoch": 1.5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0813305526971817, "kl": 0.000359829849912785, "learning_rate": 4.571428571428571e-07, "loss": -0.0106, "num_tokens": 50759.0, "reward": -0.347565621137619, "reward_std": 0.8163020014762878, "rewards/rollout_reward_func/mean": -0.347565621137619, "rewards/rollout_reward_func/std": 0.8163020610809326, "sampling/importance_sampling_ratio/max": 0.19878223538398743, "sampling/importance_sampling_ratio/mean": 0.05066375434398651, "sampling/importance_sampling_ratio/min": 2.6264839902978565e-07, "sampling/sampling_logp_difference/max": 1.9953398704528809, "sampling/sampling_logp_difference/mean": 0.53300940990448, "step": 3, "step_time": 11.245897928994964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.752246916294098, "epoch": 2e-05, "grad_norm": 0.0810830220580101, "kl": 0.00019437417904555332, "learning_rate": 6.857142857142857e-07, "loss": -0.0107, "step": 4, "step_time": 5.230061589012621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.4375, "completions/mean_terminated_length": 10.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.4843159317970276, "epoch": 2.5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.06004912778735161, "kl": 0.00022356666158884764, "learning_rate": 9.142857142857142e-07, "loss": 0.0008, "num_tokens": 80086.0, "reward": -0.14881815016269684, "reward_std": 0.8204445242881775, "rewards/rollout_reward_func/mean": -0.14881815016269684, "rewards/rollout_reward_func/std": 0.8204445242881775, "sampling/importance_sampling_ratio/max": 0.4790046513080597, "sampling/importance_sampling_ratio/mean": 0.05606182664632797, "sampling/importance_sampling_ratio/min": 1.9032066802537884e-06, "sampling/sampling_logp_difference/max": 2.809126138687134, "sampling/sampling_logp_difference/mean": 0.4504998028278351, "step": 5, "step_time": 12.80139086698182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.481586992740631, "epoch": 3e-05, "grad_norm": 0.061256010085344315, "kl": 0.0002564466412877664, "learning_rate": 1.1428571428571428e-06, "loss": 0.0008, "step": 6, "step_time": 6.268456692996551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 8.571429252624512, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.4951741099357605, "epoch": 3.5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.09843087941408157, "kl": 0.00030145487835397944, "learning_rate": 1.3714285714285715e-06, "loss": -0.0053, "num_tokens": 103445.0, "reward": -0.6058412194252014, "reward_std": 0.43671321868896484, "rewards/rollout_reward_func/mean": -0.6058412194252014, "rewards/rollout_reward_func/std": 0.43671324849128723, "sampling/importance_sampling_ratio/max": 0.45400798320770264, "sampling/importance_sampling_ratio/mean": 0.06037512049078941, "sampling/importance_sampling_ratio/min": 4.726050974568352e-06, "sampling/sampling_logp_difference/max": 1.8020131587982178, "sampling/sampling_logp_difference/mean": 0.4565422236919403, "step": 7, "step_time": 10.858794300002046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.489351212978363, "epoch": 4e-05, "grad_norm": 0.09709738194942474, "kl": 0.0001903390875668265, "learning_rate": 1.6e-06, "loss": -0.0054, "step": 8, "step_time": 5.097989376998157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.1875, "completions/mean_terminated_length": 8.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.814206302165985, "epoch": 4.5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.06861130148172379, "kl": 0.0003588594845496118, "learning_rate": 1.8285714285714284e-06, "loss": -0.0111, "num_tokens": 132856.0, "reward": -0.11104242503643036, "reward_std": 0.8622285723686218, "rewards/rollout_reward_func/mean": -0.11104242503643036, "rewards/rollout_reward_func/std": 0.8622285723686218, "sampling/importance_sampling_ratio/max": 0.499625563621521, "sampling/importance_sampling_ratio/mean": 0.09964904189109802, "sampling/importance_sampling_ratio/min": 7.744648478080762e-09, "sampling/sampling_logp_difference/max": 2.189311981201172, "sampling/sampling_logp_difference/mean": 0.5541694760322571, "step": 9, "step_time": 11.976178045995766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8104697465896606, "epoch": 5e-05, "grad_norm": 0.06816192716360092, "kl": 0.00032017487683333457, "learning_rate": 2.057142857142857e-06, "loss": -0.0111, "step": 10, "step_time": 5.577863938015071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.5625, "completions/mean_terminated_length": 6.636363983154297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.9053401350975037, "epoch": 5.5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.18728980422019958, "kl": 0.00029497972718672827, "learning_rate": 2.2857142857142856e-06, "loss": -0.0194, "num_tokens": 155389.0, "reward": -0.447155237197876, "reward_std": 0.9991055727005005, "rewards/rollout_reward_func/mean": -0.447155237197876, "rewards/rollout_reward_func/std": 0.9991055727005005, "sampling/importance_sampling_ratio/max": 0.5470947623252869, "sampling/importance_sampling_ratio/mean": 0.1494881808757782, "sampling/importance_sampling_ratio/min": 2.470932543019444e-07, "sampling/sampling_logp_difference/max": 2.210676670074463, "sampling/sampling_logp_difference/mean": 0.5973385572433472, "step": 11, "step_time": 9.722241873998428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.903750479221344, "epoch": 6e-05, "grad_norm": 0.17997853457927704, "kl": 0.00034280537511222064, "learning_rate": 2.5142857142857142e-06, "loss": -0.0192, "step": 12, "step_time": 4.925969659991097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.3125, "completions/mean_terminated_length": 7.666666507720947, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.68387371301651, "epoch": 6.5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.024012058973312378, "kl": 0.000299148159683682, "learning_rate": 2.742857142857143e-06, "loss": 0.0005, "num_tokens": 182726.0, "reward": -0.36497294902801514, "reward_std": 0.6446292400360107, "rewards/rollout_reward_func/mean": -0.36497294902801514, "rewards/rollout_reward_func/std": 0.6446292400360107, "sampling/importance_sampling_ratio/max": 0.3401249051094055, "sampling/importance_sampling_ratio/mean": 0.06023309752345085, "sampling/importance_sampling_ratio/min": 5.750763989453844e-07, "sampling/sampling_logp_difference/max": 2.109081506729126, "sampling/sampling_logp_difference/mean": 0.4916974902153015, "step": 13, "step_time": 10.948986565985251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6761744618415833, "epoch": 7e-05, "grad_norm": 0.024295788258314133, "kl": 0.00024739527361816727, "learning_rate": 2.9714285714285716e-06, "loss": 0.0005, "step": 14, "step_time": 5.107966764000594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.6875, "completions/mean_terminated_length": 8.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.242999792098999, "epoch": 7.5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.05834261327981949, "kl": 0.000358408156898804, "learning_rate": 3.2e-06, "loss": -0.0011, "num_tokens": 206650.0, "reward": -0.0848282128572464, "reward_std": 1.1418253183364868, "rewards/rollout_reward_func/mean": -0.0848282128572464, "rewards/rollout_reward_func/std": 1.1418254375457764, "sampling/importance_sampling_ratio/max": 0.357270747423172, "sampling/importance_sampling_ratio/mean": 0.06937427073717117, "sampling/importance_sampling_ratio/min": 8.554634405300021e-07, "sampling/sampling_logp_difference/max": 2.0425078868865967, "sampling/sampling_logp_difference/mean": 0.3942587971687317, "step": 15, "step_time": 11.167076769997948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2440605759620667, "epoch": 8e-05, "grad_norm": 0.05735456198453903, "kl": 0.0003413286030991003, "learning_rate": 3.428571428571428e-06, "loss": -0.001, "step": 16, "step_time": 5.756135673989775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0625, "completions/mean_terminated_length": 6.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.491234838962555, "epoch": 8.5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.11805237084627151, "kl": 0.00026268421061104164, "learning_rate": 3.657142857142857e-06, "loss": -0.0161, "num_tokens": 232863.0, "reward": -0.3052408695220947, "reward_std": 0.9040979743003845, "rewards/rollout_reward_func/mean": -0.3052408695220947, "rewards/rollout_reward_func/std": 0.9040979743003845, "sampling/importance_sampling_ratio/max": 0.24806839227676392, "sampling/importance_sampling_ratio/mean": 0.04308807849884033, "sampling/importance_sampling_ratio/min": 2.201077450081357e-06, "sampling/sampling_logp_difference/max": 2.256753921508789, "sampling/sampling_logp_difference/mean": 0.4650702178478241, "step": 17, "step_time": 11.446523365011672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4874187111854553, "epoch": 9e-05, "grad_norm": 0.11958640813827515, "kl": 0.00025668644957477227, "learning_rate": 3.885714285714286e-06, "loss": -0.0163, "step": 18, "step_time": 5.102313377996325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 10.5625, "completions/mean_terminated_length": 6.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.358939826488495, "epoch": 9.5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.06494084000587463, "kl": 0.0004976810014341027, "learning_rate": 4.114285714285714e-06, "loss": -0.0094, "num_tokens": 256630.0, "reward": -0.7124538421630859, "reward_std": 0.7714344263076782, "rewards/rollout_reward_func/mean": -0.7124538421630859, "rewards/rollout_reward_func/std": 0.771434485912323, "sampling/importance_sampling_ratio/max": 0.37200304865837097, "sampling/importance_sampling_ratio/mean": 0.09988504648208618, "sampling/importance_sampling_ratio/min": 9.004368621390313e-06, "sampling/sampling_logp_difference/max": 2.30588436126709, "sampling/sampling_logp_difference/mean": 0.42304980754852295, "step": 19, "step_time": 10.813039105007192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3528798818588257, "epoch": 0.0001, "grad_norm": 0.06185397133231163, "kl": 0.0006437527772504836, "learning_rate": 4.342857142857142e-06, "loss": -0.0096, "step": 20, "step_time": 5.301291227020556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.625, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.38206547498703, "epoch": 0.000105, "frac_reward_zero_std": 0.0, "grad_norm": 0.08097440749406815, "kl": 0.0006556252264999785, "learning_rate": 4.571428571428571e-06, "loss": 0.0049, "num_tokens": 279471.0, "reward": 0.08501515537500381, "reward_std": 1.2250598669052124, "rewards/rollout_reward_func/mean": 0.08501515537500381, "rewards/rollout_reward_func/std": 1.2250598669052124, "sampling/importance_sampling_ratio/max": 0.46857836842536926, "sampling/importance_sampling_ratio/mean": 0.11567733436822891, "sampling/importance_sampling_ratio/min": 4.6981444938865025e-06, "sampling/sampling_logp_difference/max": 1.8674627542495728, "sampling/sampling_logp_difference/mean": 0.4343419075012207, "step": 21, "step_time": 10.830704566978966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3723844289779663, "epoch": 0.00011, "grad_norm": 0.08179351687431335, "kl": 0.0009589906840119511, "learning_rate": 4.8e-06, "loss": 0.0048, "step": 22, "step_time": 5.094458513020072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.6875, "completions/mean_terminated_length": 7.1666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.522797644138336, "epoch": 0.000115, "frac_reward_zero_std": 0.0, "grad_norm": 0.04824453219771385, "kl": 0.0005546750326175243, "learning_rate": 5.0285714285714285e-06, "loss": -0.0036, "num_tokens": 306711.0, "reward": -0.255246639251709, "reward_std": 0.7140175700187683, "rewards/rollout_reward_func/mean": -0.255246639251709, "rewards/rollout_reward_func/std": 0.7140175700187683, "sampling/importance_sampling_ratio/max": 0.3147832751274109, "sampling/importance_sampling_ratio/mean": 0.04467848315834999, "sampling/importance_sampling_ratio/min": 4.951557457388844e-07, "sampling/sampling_logp_difference/max": 2.1210691928863525, "sampling/sampling_logp_difference/mean": 0.48448923230171204, "step": 23, "step_time": 11.735542836016975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5201520323753357, "epoch": 0.00012, "grad_norm": 0.0468628853559494, "kl": 0.0005192349854041822, "learning_rate": 5.257142857142857e-06, "loss": -0.0035, "step": 24, "step_time": 5.573232855982496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.9375, "completions/mean_terminated_length": 9.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.3272934556007385, "epoch": 0.000125, "frac_reward_zero_std": 0.0, "grad_norm": 0.09216846525669098, "kl": 0.0010824828350450844, "learning_rate": 5.485714285714286e-06, "loss": -0.0159, "num_tokens": 325570.0, "reward": -0.2438037246465683, "reward_std": 1.2448503971099854, "rewards/rollout_reward_func/mean": -0.2438037246465683, "rewards/rollout_reward_func/std": 1.2448503971099854, "sampling/importance_sampling_ratio/max": 0.446321040391922, "sampling/importance_sampling_ratio/mean": 0.06207210198044777, "sampling/importance_sampling_ratio/min": 1.5564401110168546e-05, "sampling/sampling_logp_difference/max": 2.075477123260498, "sampling/sampling_logp_difference/mean": 0.4323262572288513, "step": 25, "step_time": 9.031012546023703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3195645809173584, "epoch": 0.00013, "grad_norm": 0.09361906349658966, "kl": 0.0013673034991370514, "learning_rate": 5.7142857142857145e-06, "loss": -0.0161, "step": 26, "step_time": 4.841802776019904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1871469020843506, "epoch": 0.000135, "frac_reward_zero_std": 0.0, "grad_norm": 0.14157578349113464, "kl": 0.0010580126254353672, "learning_rate": 5.942857142857143e-06, "loss": -0.0122, "num_tokens": 350345.0, "reward": -0.44401228427886963, "reward_std": 0.9283527731895447, "rewards/rollout_reward_func/mean": -0.44401228427886963, "rewards/rollout_reward_func/std": 0.9283528327941895, "sampling/importance_sampling_ratio/max": 0.46044430136680603, "sampling/importance_sampling_ratio/mean": 0.08445648849010468, "sampling/importance_sampling_ratio/min": 1.578010051161982e-05, "sampling/sampling_logp_difference/max": 1.819822072982788, "sampling/sampling_logp_difference/mean": 0.4050140976905823, "step": 27, "step_time": 11.624645467003575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.185097575187683, "epoch": 0.00014, "grad_norm": 0.14499130845069885, "kl": 0.001592590255313553, "learning_rate": 6.171428571428571e-06, "loss": -0.0128, "step": 28, "step_time": 5.2401266929955455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 9.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.505832076072693, "epoch": 0.000145, "frac_reward_zero_std": 0.0, "grad_norm": 0.024386486038565636, "kl": 0.0013383222976699471, "learning_rate": 6.4e-06, "loss": -0.0079, "num_tokens": 378400.0, "reward": -0.23850096762180328, "reward_std": 0.6592814922332764, "rewards/rollout_reward_func/mean": -0.23850096762180328, "rewards/rollout_reward_func/std": 0.6592814922332764, "sampling/importance_sampling_ratio/max": 0.4002411663532257, "sampling/importance_sampling_ratio/mean": 0.04062509164214134, "sampling/importance_sampling_ratio/min": 6.318232044577599e-05, "sampling/sampling_logp_difference/max": 1.8825738430023193, "sampling/sampling_logp_difference/mean": 0.44726309180259705, "step": 29, "step_time": 11.895195536999381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5026503205299377, "epoch": 0.00015, "grad_norm": 0.02403084561228752, "kl": 0.0016790347581263632, "learning_rate": 6.628571428571428e-06, "loss": -0.0079, "step": 30, "step_time": 5.598286813998129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 6.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.502009868621826, "epoch": 0.000155, "frac_reward_zero_std": 0.0, "grad_norm": 0.0942278578877449, "kl": 0.00773620477411896, "learning_rate": 6.857142857142856e-06, "loss": -0.0077, "num_tokens": 405857.0, "reward": 0.005912564694881439, "reward_std": 1.0260698795318604, "rewards/rollout_reward_func/mean": 0.005912564694881439, "rewards/rollout_reward_func/std": 1.0260698795318604, "sampling/importance_sampling_ratio/max": 0.5333632230758667, "sampling/importance_sampling_ratio/mean": 0.15840838849544525, "sampling/importance_sampling_ratio/min": 4.970889477817764e-09, "sampling/sampling_logp_difference/max": 2.4841055870056152, "sampling/sampling_logp_difference/mean": 0.5592151284217834, "step": 31, "step_time": 11.217114308994496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4781785011291504, "epoch": 0.00016, "grad_norm": 0.09546854346990585, "kl": 0.009256463265046477, "learning_rate": 7.085714285714285e-06, "loss": -0.0079, "step": 32, "step_time": 5.60567553799774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 6.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.211784601211548, "epoch": 0.000165, "frac_reward_zero_std": 0.0, "grad_norm": 0.18669430911540985, "kl": 0.020159987965598702, "learning_rate": 7.314285714285714e-06, "loss": -0.0141, "num_tokens": 427049.0, "reward": -0.5814807415008545, "reward_std": 1.0412291288375854, "rewards/rollout_reward_func/mean": -0.5814807415008545, "rewards/rollout_reward_func/std": 1.0412291288375854, "sampling/importance_sampling_ratio/max": 0.6470649242401123, "sampling/importance_sampling_ratio/mean": 0.24677863717079163, "sampling/importance_sampling_ratio/min": 1.0467537322256248e-05, "sampling/sampling_logp_difference/max": 1.916571855545044, "sampling/sampling_logp_difference/mean": 0.404464453458786, "step": 33, "step_time": 8.77632075699512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1773334741592407, "epoch": 0.00017, "grad_norm": 0.19191032648086548, "kl": 0.024724145885556936, "learning_rate": 7.542857142857142e-06, "loss": -0.0153, "step": 34, "step_time": 4.21874022600241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.3125, "completions/mean_terminated_length": 8.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.282309412956238, "epoch": 0.000175, "frac_reward_zero_std": 0.0, "grad_norm": 0.11831216514110565, "kl": 0.007848776993341744, "learning_rate": 7.771428571428572e-06, "loss": -0.0229, "num_tokens": 454345.0, "reward": -0.26938897371292114, "reward_std": 0.8480720520019531, "rewards/rollout_reward_func/mean": -0.26938897371292114, "rewards/rollout_reward_func/std": 0.8480720520019531, "sampling/importance_sampling_ratio/max": 0.6484279632568359, "sampling/importance_sampling_ratio/mean": 0.12214528769254684, "sampling/importance_sampling_ratio/min": 1.4133784134173766e-05, "sampling/sampling_logp_difference/max": 1.974698543548584, "sampling/sampling_logp_difference/mean": 0.4151390492916107, "step": 35, "step_time": 11.639402778004296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2672371864318848, "epoch": 0.00018, "grad_norm": 0.11834896355867386, "kl": 0.010479938704520464, "learning_rate": 8e-06, "loss": -0.0234, "step": 36, "step_time": 6.801901350001572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 6.099999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0937100052833557, "epoch": 0.000185, "frac_reward_zero_std": 0.0, "grad_norm": 0.30967622995376587, "kl": 0.04250570060685277, "learning_rate": 7.999999999907456e-06, "loss": -0.0531, "num_tokens": 479732.0, "reward": -0.013984471559524536, "reward_std": 1.1227997541427612, "rewards/rollout_reward_func/mean": -0.013984471559524536, "rewards/rollout_reward_func/std": 1.1227998733520508, "sampling/importance_sampling_ratio/max": 0.9487277865409851, "sampling/importance_sampling_ratio/mean": 0.2916378974914551, "sampling/importance_sampling_ratio/min": 1.2853899278297831e-08, "sampling/sampling_logp_difference/max": 2.4361321926116943, "sampling/sampling_logp_difference/mean": 0.4385579824447632, "step": 37, "step_time": 11.417842144015594 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 3.0549317002296448, "epoch": 0.00019, "grad_norm": 0.2481231987476349, "kl": 0.05892368918284774, "learning_rate": 7.999999999629824e-06, "loss": -0.0573, "step": 38, "step_time": 5.573631760009448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 7.090909481048584, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.340528130531311, "epoch": 0.000195, "frac_reward_zero_std": 0.0, "grad_norm": 0.2425091713666916, "kl": 0.053665585815906525, "learning_rate": 7.999999999167105e-06, "loss": -0.0574, "num_tokens": 508674.0, "reward": 0.004862666130065918, "reward_std": 0.9697778820991516, "rewards/rollout_reward_func/mean": 0.004862666130065918, "rewards/rollout_reward_func/std": 0.9697780013084412, "sampling/importance_sampling_ratio/max": 0.8507989048957825, "sampling/importance_sampling_ratio/mean": 0.27573829889297485, "sampling/importance_sampling_ratio/min": 1.8499168419339185e-08, "sampling/sampling_logp_difference/max": 2.291719436645508, "sampling/sampling_logp_difference/mean": 0.524437665939331, "step": 39, "step_time": 11.005085692013381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 3.280657947063446, "epoch": 0.0002, "grad_norm": 0.24927949905395508, "kl": 0.07985470350831747, "learning_rate": 7.9999999985193e-06, "loss": -0.0596, "step": 40, "step_time": 5.590164245993947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 6.222222328186035, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.9935288429260254, "epoch": 0.000205, "frac_reward_zero_std": 0.0, "grad_norm": 0.26412102580070496, "kl": 0.10147721786051989, "learning_rate": 7.999999997686407e-06, "loss": -0.0672, "num_tokens": 533690.0, "reward": 0.021267808973789215, "reward_std": 1.216078758239746, "rewards/rollout_reward_func/mean": 0.021267808973789215, "rewards/rollout_reward_func/std": 1.216078758239746, "sampling/importance_sampling_ratio/max": 1.5073410272598267, "sampling/importance_sampling_ratio/mean": 0.31088709831237793, "sampling/importance_sampling_ratio/min": 2.1980116798658855e-05, "sampling/sampling_logp_difference/max": 1.8962770700454712, "sampling/sampling_logp_difference/mean": 0.4174283742904663, "step": 41, "step_time": 11.95448043900251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010156250093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010156250093132257, "entropy": 2.9296656250953674, "epoch": 0.00021, "grad_norm": 0.23916105926036835, "kl": 0.15788778942078352, "learning_rate": 7.999999996668426e-06, "loss": -0.0703, "step": 42, "step_time": 5.783974229998421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 6.818181991577148, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.907139778137207, "epoch": 0.000215, "frac_reward_zero_std": 0.0, "grad_norm": 0.2655614912509918, "kl": 0.33065513521432877, "learning_rate": 7.999999995465356e-06, "loss": -0.0322, "num_tokens": 561687.0, "reward": 0.15289351344108582, "reward_std": 1.1562931537628174, "rewards/rollout_reward_func/mean": 0.15289351344108582, "rewards/rollout_reward_func/std": 1.1562931537628174, "sampling/importance_sampling_ratio/max": 2.423182964324951, "sampling/importance_sampling_ratio/mean": 0.45248985290527344, "sampling/importance_sampling_ratio/min": 1.158371787823853e-06, "sampling/sampling_logp_difference/max": 2.161879062652588, "sampling/sampling_logp_difference/mean": 0.4960351586341858, "step": 43, "step_time": 11.044518653012346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03624999988824129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03624999988824129, "entropy": 2.8489501774311066, "epoch": 0.00022, "grad_norm": 0.27932360768318176, "kl": 0.5207085013389587, "learning_rate": 7.9999999940772e-06, "loss": -0.0353, "step": 44, "step_time": 5.577848651984823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6336894631385803, "epoch": 0.000225, "frac_reward_zero_std": 0.0, "grad_norm": 0.25369834899902344, "kl": 0.428902730345726, "learning_rate": 7.999999992503956e-06, "loss": -0.047, "num_tokens": 591491.0, "reward": 0.08510953187942505, "reward_std": 0.8426344990730286, "rewards/rollout_reward_func/mean": 0.08510953187942505, "rewards/rollout_reward_func/std": 0.8426345586776733, "sampling/importance_sampling_ratio/max": 1.7334537506103516, "sampling/importance_sampling_ratio/mean": 0.5610501766204834, "sampling/importance_sampling_ratio/min": 6.137764074765073e-08, "sampling/sampling_logp_difference/max": 2.0791704654693604, "sampling/sampling_logp_difference/mean": 0.5277097225189209, "step": 45, "step_time": 10.822773861014866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.05697115417569876, "clip_ratio/low_min": 0.01923076994717121, "clip_ratio/region_mean": 0.05697115417569876, "entropy": 2.5829924046993256, "epoch": 0.00023, "grad_norm": 0.2862364649772644, "kl": 0.6178908497095108, "learning_rate": 7.999999990745626e-06, "loss": -0.0483, "step": 46, "step_time": 6.103914271981921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7702410817146301, "epoch": 0.000235, "frac_reward_zero_std": 0.0, "grad_norm": 0.8688968420028687, "kl": 1.0391228646039963, "learning_rate": 7.999999988802207e-06, "loss": -0.0915, "num_tokens": 614391.0, "reward": 0.8436746597290039, "reward_std": 1.2372126579284668, "rewards/rollout_reward_func/mean": 0.8436746597290039, "rewards/rollout_reward_func/std": 1.2372126579284668, "sampling/importance_sampling_ratio/max": 1.818477749824524, "sampling/importance_sampling_ratio/mean": 0.8275373578071594, "sampling/importance_sampling_ratio/min": 3.129646211164072e-05, "sampling/sampling_logp_difference/max": 2.6257879734039307, "sampling/sampling_logp_difference/mean": 0.399885356426239, "step": 47, "step_time": 10.915732345005381 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022727273404598236, "entropy": 1.7050312459468842, "epoch": 0.00024, "grad_norm": 0.3098304569721222, "kl": 1.117918148636818, "learning_rate": 7.999999986673701e-06, "loss": -0.0942, "step": 48, "step_time": 5.247294752000016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 5.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7538358867168427, "epoch": 0.000245, "frac_reward_zero_std": 0.0, "grad_norm": 0.15132863819599152, "kl": 0.8644853234291077, "learning_rate": 7.999999984360109e-06, "loss": 0.0089, "num_tokens": 640292.0, "reward": -0.17994050681591034, "reward_std": 1.017762303352356, "rewards/rollout_reward_func/mean": -0.17994050681591034, "rewards/rollout_reward_func/std": 1.017762303352356, "sampling/importance_sampling_ratio/max": 1.8239409923553467, "sampling/importance_sampling_ratio/mean": 0.5390455722808838, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.5540661811828613, "sampling/sampling_logp_difference/mean": 0.5716352462768555, "step": 49, "step_time": 10.129451501998119 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.7175495624542236, "epoch": 0.00025, "grad_norm": 0.13997916877269745, "kl": 0.9512917995452881, "learning_rate": 7.999999981861428e-06, "loss": 0.0084, "step": 50, "step_time": 5.345895232996554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.133333683013916, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2715553045272827, "epoch": 0.000255, "frac_reward_zero_std": 0.0, "grad_norm": 0.13666048645973206, "kl": 0.8083999156951904, "learning_rate": 7.99999997917766e-06, "loss": -0.0754, "num_tokens": 670571.0, "reward": 0.19444765150547028, "reward_std": 1.0064643621444702, "rewards/rollout_reward_func/mean": 0.19444765150547028, "rewards/rollout_reward_func/std": 1.0064643621444702, "sampling/importance_sampling_ratio/max": 1.8997082710266113, "sampling/importance_sampling_ratio/mean": 0.6597294211387634, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.4830574989318848, "sampling/sampling_logp_difference/mean": 0.49247685074806213, "step": 51, "step_time": 10.910591095002019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.027472528629004955, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027472528629004955, "entropy": 2.253412127494812, "epoch": 0.00026, "grad_norm": 0.11894695460796356, "kl": 0.8297851160168648, "learning_rate": 7.999999976308803e-06, "loss": -0.0766, "step": 52, "step_time": 5.663442446020781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1767938435077667, "epoch": 0.000265, "frac_reward_zero_std": 0.0, "grad_norm": 0.4445633292198181, "kl": 0.5223463028669357, "learning_rate": 7.99999997325486e-06, "loss": -0.0578, "num_tokens": 690019.0, "reward": 0.0023356080055236816, "reward_std": 1.2736786603927612, "rewards/rollout_reward_func/mean": 0.0023356080055236816, "rewards/rollout_reward_func/std": 1.2736786603927612, "sampling/importance_sampling_ratio/max": 1.509162425994873, "sampling/importance_sampling_ratio/mean": 1.0684648752212524, "sampling/importance_sampling_ratio/min": 1.0682077117962763e-05, "sampling/sampling_logp_difference/max": 2.0892140865325928, "sampling/sampling_logp_difference/mean": 0.34903475642204285, "step": 53, "step_time": 7.223258704994805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.1670817136764526, "epoch": 0.00027, "grad_norm": 0.3218787610530853, "kl": 0.4831201434135437, "learning_rate": 7.99999997001583e-06, "loss": -0.0616, "step": 54, "step_time": 3.8909975430142367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 5.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5648984014987946, "epoch": 0.000275, "frac_reward_zero_std": 0.0, "grad_norm": 0.5415921807289124, "kl": 5.0238602459430695, "learning_rate": 7.999999966591712e-06, "loss": -0.0593, "num_tokens": 714142.0, "reward": 0.5715083479881287, "reward_std": 1.370612382888794, "rewards/rollout_reward_func/mean": 0.5715083479881287, "rewards/rollout_reward_func/std": 1.370612382888794, "sampling/importance_sampling_ratio/max": 1.9699432849884033, "sampling/importance_sampling_ratio/mean": 0.7008839845657349, "sampling/importance_sampling_ratio/min": 0.00010662744170986116, "sampling/sampling_logp_difference/max": 3.02402925491333, "sampling/sampling_logp_difference/mean": 0.44043922424316406, "step": 55, "step_time": 10.146357072982937 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 1.5758208334445953, "epoch": 0.00028, "grad_norm": 0.5270129442214966, "kl": 4.927402533590794, "learning_rate": 7.999999962982505e-06, "loss": -0.0608, "step": 56, "step_time": 5.7822370100038825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.475797325372696, "epoch": 0.000285, "frac_reward_zero_std": 0.0, "grad_norm": 0.13737110793590546, "kl": 0.4191647917032242, "learning_rate": 7.999999959188212e-06, "loss": -0.0219, "num_tokens": 734087.0, "reward": -0.21763867139816284, "reward_std": 1.2465332746505737, "rewards/rollout_reward_func/mean": -0.21763867139816284, "rewards/rollout_reward_func/std": 1.2465331554412842, "sampling/importance_sampling_ratio/max": 1.5919784307479858, "sampling/importance_sampling_ratio/mean": 0.799771249294281, "sampling/importance_sampling_ratio/min": 3.5337816370883957e-05, "sampling/sampling_logp_difference/max": 1.8651719093322754, "sampling/sampling_logp_difference/mean": 0.2781135141849518, "step": 57, "step_time": 8.926246038012323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4877009689807892, "epoch": 0.00029, "grad_norm": 0.16777756810188293, "kl": 0.40917886048555374, "learning_rate": 7.999999955208831e-06, "loss": -0.0226, "step": 58, "step_time": 4.054286021011649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.818181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7664325386285782, "epoch": 0.000295, "frac_reward_zero_std": 0.0, "grad_norm": 0.17451724410057068, "kl": 1.4128608629107475, "learning_rate": 7.999999951044363e-06, "loss": -0.1117, "num_tokens": 753692.0, "reward": 0.33725547790527344, "reward_std": 1.4197239875793457, "rewards/rollout_reward_func/mean": 0.33725547790527344, "rewards/rollout_reward_func/std": 1.4197239875793457, "sampling/importance_sampling_ratio/max": 1.5088815689086914, "sampling/importance_sampling_ratio/mean": 0.579778790473938, "sampling/importance_sampling_ratio/min": 1.2897928172606044e-05, "sampling/sampling_logp_difference/max": 3.3929643630981445, "sampling/sampling_logp_difference/mean": 0.374864786863327, "step": 59, "step_time": 7.797605139989173 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 1.777587652206421, "epoch": 0.0003, "grad_norm": 0.16237565875053406, "kl": 1.2042857259511948, "learning_rate": 7.999999946694808e-06, "loss": -0.1127, "step": 60, "step_time": 3.8766456120065413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0625, "completions/mean_terminated_length": 7.363636493682861, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.116677761077881, "epoch": 0.000305, "frac_reward_zero_std": 0.0, "grad_norm": 0.27961206436157227, "kl": 0.13057581894099712, "learning_rate": 7.999999942160165e-06, "loss": -0.0251, "num_tokens": 783706.0, "reward": -0.06590871512889862, "reward_std": 0.7924865484237671, "rewards/rollout_reward_func/mean": -0.06590871512889862, "rewards/rollout_reward_func/std": 0.7924866080284119, "sampling/importance_sampling_ratio/max": 1.76718270778656, "sampling/importance_sampling_ratio/mean": 0.3437884449958801, "sampling/importance_sampling_ratio/min": 9.585548468749039e-06, "sampling/sampling_logp_difference/max": 2.1494877338409424, "sampling/sampling_logp_difference/mean": 0.5220329165458679, "step": 61, "step_time": 11.595578460983234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 3.10959792137146, "epoch": 0.00031, "grad_norm": 0.25085335969924927, "kl": 0.12237115763127804, "learning_rate": 7.999999937440435e-06, "loss": -0.0258, "step": 62, "step_time": 5.85015894199023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.5625, "completions/mean_terminated_length": 6.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.550024852156639, "epoch": 0.000315, "frac_reward_zero_std": 0.0, "grad_norm": 0.16905802488327026, "kl": 0.16063841804862022, "learning_rate": 7.999999932535616e-06, "loss": -0.1001, "num_tokens": 811392.0, "reward": 1.012237787246704, "reward_std": 1.1680511236190796, "rewards/rollout_reward_func/mean": 1.012237787246704, "rewards/rollout_reward_func/std": 1.1680512428283691, "sampling/importance_sampling_ratio/max": 1.4980132579803467, "sampling/importance_sampling_ratio/mean": 0.5479334592819214, "sampling/importance_sampling_ratio/min": 7.490131974918768e-05, "sampling/sampling_logp_difference/max": 2.089621067047119, "sampling/sampling_logp_difference/mean": 0.4422675371170044, "step": 63, "step_time": 10.735987472013221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5385036915540695, "epoch": 0.00032, "grad_norm": 0.16711822152137756, "kl": 0.1639714613556862, "learning_rate": 7.99999992744571e-06, "loss": -0.1004, "step": 64, "step_time": 5.563023869006429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4572045132517815, "epoch": 0.000325, "frac_reward_zero_std": 0.0, "grad_norm": 0.08291274309158325, "kl": 0.29666997492313385, "learning_rate": 7.999999922170718e-06, "loss": -0.0233, "num_tokens": 840401.0, "reward": 0.3162999451160431, "reward_std": 1.1226943731307983, "rewards/rollout_reward_func/mean": 0.3162999451160431, "rewards/rollout_reward_func/std": 1.1226943731307983, "sampling/importance_sampling_ratio/max": 1.7699309587478638, "sampling/importance_sampling_ratio/mean": 0.9860155582427979, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.505434513092041, "sampling/sampling_logp_difference/mean": 0.30703654885292053, "step": 65, "step_time": 10.264016875022207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4486078321933746, "epoch": 0.00033, "grad_norm": 0.08129528909921646, "kl": 0.2991799861192703, "learning_rate": 7.999999916710638e-06, "loss": -0.0231, "step": 66, "step_time": 5.163016411999706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 6.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.557106375694275, "epoch": 0.000335, "frac_reward_zero_std": 0.0, "grad_norm": 0.16990511119365692, "kl": 0.8256735000759363, "learning_rate": 7.99999991106547e-06, "loss": -0.024, "num_tokens": 869580.0, "reward": 0.008662812411785126, "reward_std": 0.9481741189956665, "rewards/rollout_reward_func/mean": 0.008662812411785126, "rewards/rollout_reward_func/std": 0.9481741189956665, "sampling/importance_sampling_ratio/max": 1.3763420581817627, "sampling/importance_sampling_ratio/mean": 0.228988915681839, "sampling/importance_sampling_ratio/min": 1.1480760520043987e-07, "sampling/sampling_logp_difference/max": 3.6383626461029053, "sampling/sampling_logp_difference/mean": 0.5084672570228577, "step": 67, "step_time": 11.738531046998105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5497167706489563, "epoch": 0.00034, "grad_norm": 0.1719851940870285, "kl": 0.760779207572341, "learning_rate": 7.999999905235214e-06, "loss": -0.0254, "step": 68, "step_time": 5.3576790139923105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.310907542705536, "epoch": 0.000345, "frac_reward_zero_std": 0.0, "grad_norm": 0.09970749169588089, "kl": 0.3257657252252102, "learning_rate": 7.999999899219872e-06, "loss": 0.0041, "num_tokens": 892018.0, "reward": 0.4541316628456116, "reward_std": 1.275683879852295, "rewards/rollout_reward_func/mean": 0.4541316628456116, "rewards/rollout_reward_func/std": 1.275683879852295, "sampling/importance_sampling_ratio/max": 1.7416625022888184, "sampling/importance_sampling_ratio/mean": 0.6226587295532227, "sampling/importance_sampling_ratio/min": 3.019329142261995e-06, "sampling/sampling_logp_difference/max": 2.5729503631591797, "sampling/sampling_logp_difference/mean": 0.4103279709815979, "step": 69, "step_time": 10.167831579994527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3060953319072723, "epoch": 0.00035, "grad_norm": 0.1001364216208458, "kl": 0.3259161598980427, "learning_rate": 7.999999893019442e-06, "loss": 0.0042, "step": 70, "step_time": 5.075901952004642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.692307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6109026968479156, "epoch": 0.000355, "frac_reward_zero_std": 0.0, "grad_norm": 0.2883712947368622, "kl": 1.0062016695737839, "learning_rate": 7.999999886633925e-06, "loss": -0.0648, "num_tokens": 916766.0, "reward": 0.6553001999855042, "reward_std": 1.1842234134674072, "rewards/rollout_reward_func/mean": 0.6553001999855042, "rewards/rollout_reward_func/std": 1.1842234134674072, "sampling/importance_sampling_ratio/max": 1.7310456037521362, "sampling/importance_sampling_ratio/mean": 0.6126095056533813, "sampling/importance_sampling_ratio/min": 0.00041248169145546854, "sampling/sampling_logp_difference/max": 3.276310443878174, "sampling/sampling_logp_difference/mean": 0.3782631754875183, "step": 71, "step_time": 11.116174976996263 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 1.6082947179675102, "epoch": 0.00036, "grad_norm": 0.3040386140346527, "kl": 0.9023978784680367, "learning_rate": 7.999999880063319e-06, "loss": -0.0654, "step": 72, "step_time": 5.847364125002059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.868049681186676, "epoch": 0.000365, "frac_reward_zero_std": 0.0, "grad_norm": 0.21784476935863495, "kl": 0.31415240094065666, "learning_rate": 7.999999873307627e-06, "loss": -0.0177, "num_tokens": 943702.0, "reward": -0.2619380056858063, "reward_std": 0.8571597337722778, "rewards/rollout_reward_func/mean": -0.2619380056858063, "rewards/rollout_reward_func/std": 0.8571597933769226, "sampling/importance_sampling_ratio/max": 2.342257022857666, "sampling/importance_sampling_ratio/mean": 0.8629866242408752, "sampling/importance_sampling_ratio/min": 4.1699000576045364e-05, "sampling/sampling_logp_difference/max": 2.3843579292297363, "sampling/sampling_logp_difference/mean": 0.3527480363845825, "step": 73, "step_time": 11.185577766984352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8633493483066559, "epoch": 0.00037, "grad_norm": 0.21683810651302338, "kl": 0.3252618536353111, "learning_rate": 7.999999866366846e-06, "loss": -0.0186, "step": 74, "step_time": 5.631567882010131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3304058611392975, "epoch": 0.000375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18078136444091797, "kl": 0.27876242995262146, "learning_rate": 7.99999985924098e-06, "loss": -0.0177, "num_tokens": 974113.0, "reward": 0.3658149242401123, "reward_std": 1.0018119812011719, "rewards/rollout_reward_func/mean": 0.3658149242401123, "rewards/rollout_reward_func/std": 1.0018119812011719, "sampling/importance_sampling_ratio/max": 1.9597283601760864, "sampling/importance_sampling_ratio/mean": 1.1048033237457275, "sampling/importance_sampling_ratio/min": 1.7851345546660013e-05, "sampling/sampling_logp_difference/max": 1.6446932554244995, "sampling/sampling_logp_difference/mean": 0.32367849349975586, "step": 75, "step_time": 11.069871647981927 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 1.3212616071105003, "epoch": 0.00038, "grad_norm": 0.13193053007125854, "kl": 0.27500518411397934, "learning_rate": 7.999999851930024e-06, "loss": -0.0183, "step": 76, "step_time": 5.801636003991007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.572871781885624, "epoch": 0.000385, "frac_reward_zero_std": 0.0, "grad_norm": 0.11087971180677414, "kl": 0.33498915657401085, "learning_rate": 7.999999844433981e-06, "loss": -0.0366, "num_tokens": 996246.0, "reward": 0.29951608180999756, "reward_std": 1.3855551481246948, "rewards/rollout_reward_func/mean": 0.29951608180999756, "rewards/rollout_reward_func/std": 1.3855551481246948, "sampling/importance_sampling_ratio/max": 1.6037315130233765, "sampling/importance_sampling_ratio/mean": 0.8600223064422607, "sampling/importance_sampling_ratio/min": 1.0204901457200322e-07, "sampling/sampling_logp_difference/max": 2.2778091430664062, "sampling/sampling_logp_difference/mean": 0.32860317826271057, "step": 77, "step_time": 10.493392065996886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5782704800367355, "epoch": 0.00039, "grad_norm": 0.10724244266748428, "kl": 0.3348260521888733, "learning_rate": 7.99999983675285e-06, "loss": -0.037, "step": 78, "step_time": 5.31696029601153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 4.700000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9161702692508698, "epoch": 0.000395, "frac_reward_zero_std": 0.0, "grad_norm": 0.20374318957328796, "kl": 0.45806949213147163, "learning_rate": 7.999999828886634e-06, "loss": -0.1509, "num_tokens": 1019615.0, "reward": -0.13890454173088074, "reward_std": 1.0576627254486084, "rewards/rollout_reward_func/mean": -0.13890454173088074, "rewards/rollout_reward_func/std": 1.0576627254486084, "sampling/importance_sampling_ratio/max": 1.9034223556518555, "sampling/importance_sampling_ratio/mean": 0.7401782870292664, "sampling/importance_sampling_ratio/min": 0.0001864219957496971, "sampling/sampling_logp_difference/max": 1.6842912435531616, "sampling/sampling_logp_difference/mean": 0.2863841652870178, "step": 79, "step_time": 10.869116213012603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9216732382774353, "epoch": 0.0004, "grad_norm": 0.1971929967403412, "kl": 0.45086969435214996, "learning_rate": 7.99999982083533e-06, "loss": -0.1515, "step": 80, "step_time": 5.255412196987891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4794869124889374, "epoch": 0.000405, "frac_reward_zero_std": 0.0, "grad_norm": 0.139199361205101, "kl": 1.4272459521889687, "learning_rate": 7.999999812598937e-06, "loss": -0.0682, "num_tokens": 1048573.0, "reward": 0.4487930238246918, "reward_std": 1.1909410953521729, "rewards/rollout_reward_func/mean": 0.4487930238246918, "rewards/rollout_reward_func/std": 1.1909412145614624, "sampling/importance_sampling_ratio/max": 1.961728811264038, "sampling/importance_sampling_ratio/mean": 0.7677186131477356, "sampling/importance_sampling_ratio/min": 2.800346010189969e-06, "sampling/sampling_logp_difference/max": 2.71303653717041, "sampling/sampling_logp_difference/mean": 0.35586023330688477, "step": 81, "step_time": 11.00766267698782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4733954966068268, "epoch": 0.00041, "grad_norm": 0.130975142121315, "kl": 1.3993178382515907, "learning_rate": 7.999999804177458e-06, "loss": -0.0687, "step": 82, "step_time": 5.746074337002938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.965522974729538, "epoch": 0.000415, "frac_reward_zero_std": 0.0, "grad_norm": 0.22717516124248505, "kl": 0.8641153946518898, "learning_rate": 7.99999979557089e-06, "loss": -0.0892, "num_tokens": 1072799.0, "reward": 0.37994736433029175, "reward_std": 1.1819311380386353, "rewards/rollout_reward_func/mean": 0.37994736433029175, "rewards/rollout_reward_func/std": 1.1819312572479248, "sampling/importance_sampling_ratio/max": 2.261889934539795, "sampling/importance_sampling_ratio/mean": 1.0485408306121826, "sampling/importance_sampling_ratio/min": 0.0013271195348352194, "sampling/sampling_logp_difference/max": 2.430570602416992, "sampling/sampling_logp_difference/mean": 0.2503541111946106, "step": 83, "step_time": 9.662476960977074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9568755850195885, "epoch": 0.00042, "grad_norm": 0.2391654998064041, "kl": 0.8347163498401642, "learning_rate": 7.999999786779235e-06, "loss": -0.0903, "step": 84, "step_time": 5.23636244199588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 4.888888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.259150594472885, "epoch": 0.000425, "frac_reward_zero_std": 0.0, "grad_norm": 0.34706202149391174, "kl": 0.5373355783522129, "learning_rate": 7.999999777802493e-06, "loss": -0.0046, "num_tokens": 1101284.0, "reward": 0.10869640856981277, "reward_std": 0.9612010717391968, "rewards/rollout_reward_func/mean": 0.10869640856981277, "rewards/rollout_reward_func/std": 0.9612010717391968, "sampling/importance_sampling_ratio/max": 1.6218687295913696, "sampling/importance_sampling_ratio/mean": 0.5094164609909058, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.600797653198242, "sampling/sampling_logp_difference/mean": 0.5007122755050659, "step": 85, "step_time": 10.76962549201562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 2.2539064586162567, "epoch": 0.00043, "grad_norm": 0.3408925533294678, "kl": 0.5595884285867214, "learning_rate": 7.999999768640663e-06, "loss": -0.0064, "step": 86, "step_time": 5.331013093018555 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.007352941203862429, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4296603947877884, "epoch": 0.000435, "frac_reward_zero_std": 0.0, "grad_norm": 0.12969310581684113, "kl": 0.5981673076748848, "learning_rate": 7.999999759293746e-06, "loss": -0.0322, "num_tokens": 1124076.0, "reward": 0.6085399389266968, "reward_std": 1.3017218112945557, "rewards/rollout_reward_func/mean": 0.6085399389266968, "rewards/rollout_reward_func/std": 1.3017218112945557, "sampling/importance_sampling_ratio/max": 1.688742995262146, "sampling/importance_sampling_ratio/mean": 0.8340816497802734, "sampling/importance_sampling_ratio/min": 0.00017124981968663633, "sampling/sampling_logp_difference/max": 2.3283238410949707, "sampling/sampling_logp_difference/mean": 0.27063944935798645, "step": 87, "step_time": 10.827418914006557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4354943707585335, "epoch": 0.00044, "grad_norm": 0.11711790412664413, "kl": 0.5654629021883011, "learning_rate": 7.99999974976174e-06, "loss": -0.0323, "step": 88, "step_time": 5.316992973006563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.543670266866684, "epoch": 0.000445, "frac_reward_zero_std": 0.0, "grad_norm": 0.04334703087806702, "kl": 0.6220175698399544, "learning_rate": 7.99999974004465e-06, "loss": -0.1235, "num_tokens": 1147205.0, "reward": 1.2738475799560547, "reward_std": 1.2259465456008911, "rewards/rollout_reward_func/mean": 1.2738475799560547, "rewards/rollout_reward_func/std": 1.2259466648101807, "sampling/importance_sampling_ratio/max": 1.5080175399780273, "sampling/importance_sampling_ratio/mean": 0.9383698105812073, "sampling/importance_sampling_ratio/min": 0.0003038280410692096, "sampling/sampling_logp_difference/max": 1.7632684707641602, "sampling/sampling_logp_difference/mean": 0.3275758624076843, "step": 89, "step_time": 9.474415049000527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5439908057451248, "epoch": 0.00045, "grad_norm": 0.043424442410469055, "kl": 0.6005135849118233, "learning_rate": 7.99999973014247e-06, "loss": -0.1235, "step": 90, "step_time": 4.889323844006867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 6.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1980614364147186, "epoch": 0.000455, "frac_reward_zero_std": 0.0, "grad_norm": 0.242161363363266, "kl": 0.6146153658628464, "learning_rate": 7.999999720055204e-06, "loss": -0.0336, "num_tokens": 1169414.0, "reward": 0.41809016466140747, "reward_std": 1.4040031433105469, "rewards/rollout_reward_func/mean": 0.41809016466140747, "rewards/rollout_reward_func/std": 1.4040032625198364, "sampling/importance_sampling_ratio/max": 2.812469482421875, "sampling/importance_sampling_ratio/mean": 0.7353912591934204, "sampling/importance_sampling_ratio/min": 2.6739571694633923e-06, "sampling/sampling_logp_difference/max": 2.278257369995117, "sampling/sampling_logp_difference/mean": 0.48828256130218506, "step": 91, "step_time": 10.434453733992996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00657894741743803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 2.199955493211746, "epoch": 0.00046, "grad_norm": 0.12184280157089233, "kl": 0.5862277448177338, "learning_rate": 7.99999970978285e-06, "loss": -0.0351, "step": 92, "step_time": 5.532462081988342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7348355278372765, "epoch": 0.000465, "frac_reward_zero_std": 0.0, "grad_norm": 0.13526687026023865, "kl": 0.5223072394728661, "learning_rate": 7.999999699325408e-06, "loss": 0.0004, "num_tokens": 1194346.0, "reward": -0.034711360931396484, "reward_std": 1.1768467426300049, "rewards/rollout_reward_func/mean": -0.034711360931396484, "rewards/rollout_reward_func/std": 1.1768468618392944, "sampling/importance_sampling_ratio/max": 1.6688870191574097, "sampling/importance_sampling_ratio/mean": 0.7293574213981628, "sampling/importance_sampling_ratio/min": 1.2546785910672043e-05, "sampling/sampling_logp_difference/max": 2.359471321105957, "sampling/sampling_logp_difference/mean": 0.4059450626373291, "step": 93, "step_time": 10.545615202005138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7421624809503555, "epoch": 0.00047, "grad_norm": 0.13750562071800232, "kl": 0.4693038836121559, "learning_rate": 7.999999688682879e-06, "loss": -0.0005, "step": 94, "step_time": 5.234378547000233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0796432495117188, "epoch": 0.000475, "frac_reward_zero_std": 0.0, "grad_norm": 0.1829412281513214, "kl": 0.5938045233488083, "learning_rate": 7.999999677855262e-06, "loss": -0.0962, "num_tokens": 1218790.0, "reward": 1.5130460262298584, "reward_std": 0.7642645239830017, "rewards/rollout_reward_func/mean": 1.5130460262298584, "rewards/rollout_reward_func/std": 0.7642645239830017, "sampling/importance_sampling_ratio/max": 1.9616343975067139, "sampling/importance_sampling_ratio/mean": 0.995913028717041, "sampling/importance_sampling_ratio/min": 8.301049092551693e-05, "sampling/sampling_logp_difference/max": 2.329237937927246, "sampling/sampling_logp_difference/mean": 0.29325541853904724, "step": 95, "step_time": 10.438160272999085 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.0059523810632526875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "entropy": 1.0862488113343716, "epoch": 0.00048, "grad_norm": 0.16001036763191223, "kl": 0.5419805496931076, "learning_rate": 7.999999666842558e-06, "loss": -0.0965, "step": 96, "step_time": 5.329658855000162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.875, "completions/mean_terminated_length": 6.888888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.944686472415924, "epoch": 0.000485, "frac_reward_zero_std": 0.0, "grad_norm": 0.1629626452922821, "kl": 0.26988714188337326, "learning_rate": 7.999999655644765e-06, "loss": -0.0668, "num_tokens": 1248048.0, "reward": 0.5272194147109985, "reward_std": 1.0484296083450317, "rewards/rollout_reward_func/mean": 0.5272194147109985, "rewards/rollout_reward_func/std": 1.0484297275543213, "sampling/importance_sampling_ratio/max": 1.6387481689453125, "sampling/importance_sampling_ratio/mean": 0.4404284656047821, "sampling/importance_sampling_ratio/min": 5.288916180012926e-11, "sampling/sampling_logp_difference/max": 2.874612331390381, "sampling/sampling_logp_difference/mean": 0.5351565480232239, "step": 97, "step_time": 11.67732242801867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.943123161792755, "epoch": 0.00049, "grad_norm": 0.16933315992355347, "kl": 0.26080257445573807, "learning_rate": 7.999999644261886e-06, "loss": -0.0665, "step": 98, "step_time": 6.028228823997779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 5.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.557965725660324, "epoch": 0.000495, "frac_reward_zero_std": 0.0, "grad_norm": 0.22958242893218994, "kl": 0.26445848122239113, "learning_rate": 7.99999963269392e-06, "loss": -0.1481, "num_tokens": 1277985.0, "reward": 0.6440425515174866, "reward_std": 0.9762083888053894, "rewards/rollout_reward_func/mean": 0.6440425515174866, "rewards/rollout_reward_func/std": 0.9762084484100342, "sampling/importance_sampling_ratio/max": 1.6114506721496582, "sampling/importance_sampling_ratio/mean": 0.7233901619911194, "sampling/importance_sampling_ratio/min": 1.3150906852388289e-06, "sampling/sampling_logp_difference/max": 1.8411524295806885, "sampling/sampling_logp_difference/mean": 0.4505821466445923, "step": 99, "step_time": 11.54986814998847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.552103638648987, "epoch": 0.0005, "grad_norm": 0.22437652945518494, "kl": 0.26965125650167465, "learning_rate": 7.999999620940867e-06, "loss": -0.1481, "step": 100, "step_time": 5.630242771017947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7084199339151382, "epoch": 0.000505, "frac_reward_zero_std": 0.0, "grad_norm": 0.16095083951950073, "kl": 0.7893666103482246, "learning_rate": 7.999999609002725e-06, "loss": -0.1014, "num_tokens": 1303371.0, "reward": 0.6850444078445435, "reward_std": 1.2042073011398315, "rewards/rollout_reward_func/mean": 0.6850444078445435, "rewards/rollout_reward_func/std": 1.204207420349121, "sampling/importance_sampling_ratio/max": 2.1294894218444824, "sampling/importance_sampling_ratio/mean": 0.7503684759140015, "sampling/importance_sampling_ratio/min": 8.682707994012162e-05, "sampling/sampling_logp_difference/max": 2.272430419921875, "sampling/sampling_logp_difference/mean": 0.4393223226070404, "step": 101, "step_time": 10.661392059992068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6967554241418839, "epoch": 0.00051, "grad_norm": 0.15570706129074097, "kl": 0.8061883375048637, "learning_rate": 7.999999596879495e-06, "loss": -0.1017, "step": 102, "step_time": 5.438090675015701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.375, "completions/mean_terminated_length": 6.363636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.281813144683838, "epoch": 0.000515, "frac_reward_zero_std": 0.0, "grad_norm": 0.1426232010126114, "kl": 0.5162532106041908, "learning_rate": 7.99999958457118e-06, "loss": -0.0703, "num_tokens": 1328794.0, "reward": 0.47969627380371094, "reward_std": 1.0974889993667603, "rewards/rollout_reward_func/mean": 0.47969627380371094, "rewards/rollout_reward_func/std": 1.0974891185760498, "sampling/importance_sampling_ratio/max": 1.744999885559082, "sampling/importance_sampling_ratio/mean": 0.5041226148605347, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.3221445083618164, "sampling/sampling_logp_difference/mean": 0.48255079984664917, "step": 103, "step_time": 9.826100211008452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.27250137925148, "epoch": 0.00052, "grad_norm": 0.14545027911663055, "kl": 0.5577533915638924, "learning_rate": 7.999999572077776e-06, "loss": -0.0705, "step": 104, "step_time": 4.995798963995185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 5.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8221285939216614, "epoch": 0.000525, "frac_reward_zero_std": 0.0, "grad_norm": 0.07815376669168472, "kl": 0.9731422960758209, "learning_rate": 7.999999559399285e-06, "loss": -0.1393, "num_tokens": 1357327.0, "reward": 0.9302961826324463, "reward_std": 1.0617681741714478, "rewards/rollout_reward_func/mean": 0.9302961826324463, "rewards/rollout_reward_func/std": 1.0617681741714478, "sampling/importance_sampling_ratio/max": 2.230060338973999, "sampling/importance_sampling_ratio/mean": 0.7788149118423462, "sampling/importance_sampling_ratio/min": 2.579577085271012e-05, "sampling/sampling_logp_difference/max": 2.0274910926818848, "sampling/sampling_logp_difference/mean": 0.40013444423675537, "step": 105, "step_time": 10.314820110012079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.8126737177371979, "epoch": 0.00053, "grad_norm": 0.07978535443544388, "kl": 1.0231345370411873, "learning_rate": 7.999999546535705e-06, "loss": -0.1393, "step": 106, "step_time": 5.319854475994362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7994420528411865, "epoch": 0.000535, "frac_reward_zero_std": 0.0, "grad_norm": 0.062221426516771317, "kl": 0.2165021374821663, "learning_rate": 7.99999953348704e-06, "loss": -0.093, "num_tokens": 1375276.0, "reward": 1.2061383724212646, "reward_std": 1.3664226531982422, "rewards/rollout_reward_func/mean": 1.2061383724212646, "rewards/rollout_reward_func/std": 1.3664226531982422, "sampling/importance_sampling_ratio/max": 1.312779426574707, "sampling/importance_sampling_ratio/mean": 0.7980822324752808, "sampling/importance_sampling_ratio/min": 2.4842636776156723e-05, "sampling/sampling_logp_difference/max": 1.9277920722961426, "sampling/sampling_logp_difference/mean": 0.27928823232650757, "step": 107, "step_time": 7.874082577996887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7996599674224854, "epoch": 0.00054, "grad_norm": 0.05675205960869789, "kl": 0.22677529975771904, "learning_rate": 7.999999520253286e-06, "loss": -0.0932, "step": 108, "step_time": 4.062309216984431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4255062341690063, "epoch": 0.000545, "frac_reward_zero_std": 0.0, "grad_norm": 0.40212827920913696, "kl": 0.7610947266221046, "learning_rate": 7.999999506834446e-06, "loss": -0.0805, "num_tokens": 1401190.0, "reward": 0.3900020718574524, "reward_std": 0.9305210113525391, "rewards/rollout_reward_func/mean": 0.3900020718574524, "rewards/rollout_reward_func/std": 0.9305210113525391, "sampling/importance_sampling_ratio/max": 2.1810426712036133, "sampling/importance_sampling_ratio/mean": 0.8975775837898254, "sampling/importance_sampling_ratio/min": 0.0005465354770421982, "sampling/sampling_logp_difference/max": 1.9672224521636963, "sampling/sampling_logp_difference/mean": 0.30051374435424805, "step": 109, "step_time": 10.780840703024296 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.0193452388048172, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0193452388048172, "entropy": 1.4155230820178986, "epoch": 0.00055, "grad_norm": 0.18430496752262115, "kl": 0.764761496335268, "learning_rate": 7.999999493230518e-06, "loss": -0.0829, "step": 110, "step_time": 5.642612785988604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.299570955336094, "epoch": 0.000555, "frac_reward_zero_std": 0.0, "grad_norm": 0.1704164296388626, "kl": 0.5442881435155869, "learning_rate": 7.9999994794415e-06, "loss": 0.0652, "num_tokens": 1427451.0, "reward": -0.5835646986961365, "reward_std": 0.4753926694393158, "rewards/rollout_reward_func/mean": -0.5835646986961365, "rewards/rollout_reward_func/std": 0.4753926694393158, "sampling/importance_sampling_ratio/max": 2.2271270751953125, "sampling/importance_sampling_ratio/mean": 0.8203881978988647, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.099860668182373, "sampling/sampling_logp_difference/mean": 0.3253498673439026, "step": 111, "step_time": 9.94036687699554 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.2990195378661156, "epoch": 0.00056, "grad_norm": 0.08860377222299576, "kl": 0.5052449591457844, "learning_rate": 7.999999465467398e-06, "loss": 0.0647, "step": 112, "step_time": 5.189346256011049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2674781680107117, "epoch": 0.000565, "frac_reward_zero_std": 0.0, "grad_norm": 0.38748565316200256, "kl": 0.5398770347237587, "learning_rate": 7.999999451308208e-06, "loss": -0.0912, "num_tokens": 1456634.0, "reward": 0.633227527141571, "reward_std": 1.1978548765182495, "rewards/rollout_reward_func/mean": 0.633227527141571, "rewards/rollout_reward_func/std": 1.197854995727539, "sampling/importance_sampling_ratio/max": 2.0017926692962646, "sampling/importance_sampling_ratio/mean": 0.7993462681770325, "sampling/importance_sampling_ratio/min": 2.2156227714731358e-05, "sampling/sampling_logp_difference/max": 2.518662929534912, "sampling/sampling_logp_difference/mean": 0.28971725702285767, "step": 113, "step_time": 11.240512824020698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.26498843729496, "epoch": 0.00057, "grad_norm": 0.36718928813934326, "kl": 0.5102036781609058, "learning_rate": 7.99999943696393e-06, "loss": -0.0922, "step": 114, "step_time": 5.7275406639964785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 5.600000381469727, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6058227866888046, "epoch": 0.000575, "frac_reward_zero_std": 0.0, "grad_norm": 0.37918004393577576, "kl": 2.01989559084177, "learning_rate": 7.999999422434564e-06, "loss": -0.0071, "num_tokens": 1487241.0, "reward": 0.27582085132598877, "reward_std": 0.8370612263679504, "rewards/rollout_reward_func/mean": 0.27582085132598877, "rewards/rollout_reward_func/std": 0.8370612859725952, "sampling/importance_sampling_ratio/max": 1.8096799850463867, "sampling/importance_sampling_ratio/mean": 0.6177927851676941, "sampling/importance_sampling_ratio/min": 3.648512210929766e-05, "sampling/sampling_logp_difference/max": 2.45412540435791, "sampling/sampling_logp_difference/mean": 0.46527770161628723, "step": 115, "step_time": 11.055422292018193 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 1.6244221925735474, "epoch": 0.00058, "grad_norm": 0.32484501600265503, "kl": 1.8558364138007164, "learning_rate": 7.99999940772011e-06, "loss": -0.0073, "step": 116, "step_time": 5.8781528679828625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0520108938217163, "epoch": 0.000585, "frac_reward_zero_std": 0.0, "grad_norm": 0.25670742988586426, "kl": 0.40035679191350937, "learning_rate": 7.99999939282057e-06, "loss": -0.0523, "num_tokens": 1516510.0, "reward": 0.6029979586601257, "reward_std": 0.9091008305549622, "rewards/rollout_reward_func/mean": 0.6029979586601257, "rewards/rollout_reward_func/std": 0.9091008305549622, "sampling/importance_sampling_ratio/max": 1.6936149597167969, "sampling/importance_sampling_ratio/mean": 0.7515994906425476, "sampling/importance_sampling_ratio/min": 8.254737622337416e-06, "sampling/sampling_logp_difference/max": 2.429328203201294, "sampling/sampling_logp_difference/mean": 0.45095181465148926, "step": 117, "step_time": 10.692627311014803 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.0672963857650757, "epoch": 0.00059, "grad_norm": 0.22718799114227295, "kl": 0.37278471887111664, "learning_rate": 7.999999377735942e-06, "loss": -0.0522, "step": 118, "step_time": 6.399188036011765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3849885761737823, "epoch": 0.000595, "frac_reward_zero_std": 0.0, "grad_norm": 0.056321896612644196, "kl": 0.33927178010344505, "learning_rate": 7.999999362466227e-06, "loss": -0.0792, "num_tokens": 1535532.0, "reward": 1.3484954833984375, "reward_std": 0.9996035695075989, "rewards/rollout_reward_func/mean": 1.3484954833984375, "rewards/rollout_reward_func/std": 0.9996036291122437, "sampling/importance_sampling_ratio/max": 1.327034831047058, "sampling/importance_sampling_ratio/mean": 0.8872377872467041, "sampling/importance_sampling_ratio/min": 4.3210089643253013e-05, "sampling/sampling_logp_difference/max": 1.889549970626831, "sampling/sampling_logp_difference/mean": 0.27211159467697144, "step": 119, "step_time": 7.211238159012282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3951306343078613, "epoch": 0.0006, "grad_norm": 0.05795726552605629, "kl": 0.3269429914653301, "learning_rate": 7.999999347011425e-06, "loss": -0.0792, "step": 120, "step_time": 3.7516103529924294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 5.600000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8657833263278008, "epoch": 0.000605, "frac_reward_zero_std": 0.0, "grad_norm": 0.07142860442399979, "kl": 0.43994101136922836, "learning_rate": 7.999999331371534e-06, "loss": -0.091, "num_tokens": 1559115.0, "reward": 1.3275398015975952, "reward_std": 1.0262062549591064, "rewards/rollout_reward_func/mean": 1.3275398015975952, "rewards/rollout_reward_func/std": 1.026206374168396, "sampling/importance_sampling_ratio/max": 1.7059682607650757, "sampling/importance_sampling_ratio/mean": 0.8803335428237915, "sampling/importance_sampling_ratio/min": 0.005690920166671276, "sampling/sampling_logp_difference/max": 2.2258453369140625, "sampling/sampling_logp_difference/mean": 0.21557903289794922, "step": 121, "step_time": 10.352384333993541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.863434448838234, "epoch": 0.00061, "grad_norm": 0.07608739286661148, "kl": 0.42110211215913296, "learning_rate": 7.999999315546556e-06, "loss": -0.0909, "step": 122, "step_time": 5.107282773999032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 5.538461685180664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6892493292689323, "epoch": 0.000615, "frac_reward_zero_std": 0.0, "grad_norm": 0.2637943923473358, "kl": 0.33836739510297775, "learning_rate": 7.999999299536492e-06, "loss": -0.0843, "num_tokens": 1584155.0, "reward": 0.13283555209636688, "reward_std": 0.7603104710578918, "rewards/rollout_reward_func/mean": 0.13283555209636688, "rewards/rollout_reward_func/std": 0.7603104710578918, "sampling/importance_sampling_ratio/max": 2.031399726867676, "sampling/importance_sampling_ratio/mean": 0.8453797101974487, "sampling/importance_sampling_ratio/min": 2.812975594679301e-07, "sampling/sampling_logp_difference/max": 2.51373028755188, "sampling/sampling_logp_difference/mean": 0.353754460811615, "step": 123, "step_time": 10.932341295992956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 1.6891283690929413, "epoch": 0.00062, "grad_norm": 0.15672801434993744, "kl": 0.34601226449012756, "learning_rate": 7.999999283341339e-06, "loss": -0.0848, "step": 124, "step_time": 5.603316144988639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 5.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.151180349290371, "epoch": 0.000625, "frac_reward_zero_std": 0.0, "grad_norm": 0.062388040125370026, "kl": 0.8879400119185448, "learning_rate": 7.999999266961099e-06, "loss": -0.0881, "num_tokens": 1603586.0, "reward": 1.021225094795227, "reward_std": 1.0384376049041748, "rewards/rollout_reward_func/mean": 1.021225094795227, "rewards/rollout_reward_func/std": 1.0384376049041748, "sampling/importance_sampling_ratio/max": 1.3948408365249634, "sampling/importance_sampling_ratio/mean": 0.8152002692222595, "sampling/importance_sampling_ratio/min": 1.2878188499598764e-05, "sampling/sampling_logp_difference/max": 2.067627429962158, "sampling/sampling_logp_difference/mean": 0.3047482371330261, "step": 125, "step_time": 7.008393719006563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1422325447201729, "epoch": 0.00063, "grad_norm": 0.05982079729437828, "kl": 0.8434962779283524, "learning_rate": 7.999999250395772e-06, "loss": -0.0881, "step": 126, "step_time": 3.7613188329996774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 6.142857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7715769410133362, "epoch": 0.000635, "frac_reward_zero_std": 0.0, "grad_norm": 0.15183234214782715, "kl": 0.5913110002875328, "learning_rate": 7.999999233645358e-06, "loss": 0.0168, "num_tokens": 1632467.0, "reward": 0.253756582736969, "reward_std": 0.9491588473320007, "rewards/rollout_reward_func/mean": 0.253756582736969, "rewards/rollout_reward_func/std": 0.9491589069366455, "sampling/importance_sampling_ratio/max": 1.6401346921920776, "sampling/importance_sampling_ratio/mean": 0.6423428654670715, "sampling/importance_sampling_ratio/min": 4.7421070803466137e-07, "sampling/sampling_logp_difference/max": 2.2367568016052246, "sampling/sampling_logp_difference/mean": 0.38369065523147583, "step": 127, "step_time": 10.37102431099629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7671479880809784, "epoch": 0.00064, "grad_norm": 0.14804087579250336, "kl": 0.5754344388842583, "learning_rate": 7.999999216709854e-06, "loss": 0.0164, "step": 128, "step_time": 6.232113076010137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 6.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.432909518480301, "epoch": 0.000645, "frac_reward_zero_std": 0.0, "grad_norm": 0.24918630719184875, "kl": 0.4018862657248974, "learning_rate": 7.999999199589265e-06, "loss": 0.0308, "num_tokens": 1657913.0, "reward": -0.21710282564163208, "reward_std": 0.9533239006996155, "rewards/rollout_reward_func/mean": -0.21710282564163208, "rewards/rollout_reward_func/std": 0.9533239006996155, "sampling/importance_sampling_ratio/max": 1.6209540367126465, "sampling/importance_sampling_ratio/mean": 0.5772700309753418, "sampling/importance_sampling_ratio/min": 2.5009774617501535e-05, "sampling/sampling_logp_difference/max": 2.3640027046203613, "sampling/sampling_logp_difference/mean": 0.40492501854896545, "step": 129, "step_time": 10.571965150011238 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.0059523810632526875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "entropy": 2.4417407512664795, "epoch": 0.00065, "grad_norm": 0.1457263082265854, "kl": 0.37626173347234726, "learning_rate": 7.999999182283588e-06, "loss": 0.0297, "step": 130, "step_time": 5.253936442983104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0885881185531616, "epoch": 0.000655, "frac_reward_zero_std": 0.0, "grad_norm": 0.22942520678043365, "kl": 0.6062246337532997, "learning_rate": 7.999999164792823e-06, "loss": -0.0704, "num_tokens": 1682227.0, "reward": 0.9923819899559021, "reward_std": 1.1457533836364746, "rewards/rollout_reward_func/mean": 0.9923819899559021, "rewards/rollout_reward_func/std": 1.1457533836364746, "sampling/importance_sampling_ratio/max": 1.4305908679962158, "sampling/importance_sampling_ratio/mean": 0.7308158874511719, "sampling/importance_sampling_ratio/min": 0.0002712007553782314, "sampling/sampling_logp_difference/max": 2.3636956214904785, "sampling/sampling_logp_difference/mean": 0.23673351109027863, "step": 131, "step_time": 10.081881753008929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0919404029846191, "epoch": 0.00066, "grad_norm": 0.21578961610794067, "kl": 0.5900304913520813, "learning_rate": 7.999999147116972e-06, "loss": -0.0713, "step": 132, "step_time": 5.110072571013006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9384838044643402, "epoch": 0.000665, "frac_reward_zero_std": 0.0, "grad_norm": 0.11865980178117752, "kl": 0.32989924401044846, "learning_rate": 7.999999129256032e-06, "loss": -0.0843, "num_tokens": 1711658.0, "reward": 0.41500428318977356, "reward_std": 1.1302977800369263, "rewards/rollout_reward_func/mean": 0.41500428318977356, "rewards/rollout_reward_func/std": 1.1302978992462158, "sampling/importance_sampling_ratio/max": 1.9996552467346191, "sampling/importance_sampling_ratio/mean": 0.6816721558570862, "sampling/importance_sampling_ratio/min": 1.370166410197271e-05, "sampling/sampling_logp_difference/max": 2.035564422607422, "sampling/sampling_logp_difference/mean": 0.3762866258621216, "step": 133, "step_time": 10.519254915008787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9376183450222015, "epoch": 0.00067, "grad_norm": 0.11914806067943573, "kl": 0.3182927221059799, "learning_rate": 7.999999111210005e-06, "loss": -0.0844, "step": 134, "step_time": 5.380936524001299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.076923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.326256774365902, "epoch": 0.000675, "frac_reward_zero_std": 0.0, "grad_norm": 0.1568683683872223, "kl": 0.25318950042128563, "learning_rate": 7.999999092978893e-06, "loss": -0.1081, "num_tokens": 1737135.0, "reward": 0.27099519968032837, "reward_std": 1.012181282043457, "rewards/rollout_reward_func/mean": 0.27099519968032837, "rewards/rollout_reward_func/std": 1.012181282043457, "sampling/importance_sampling_ratio/max": 2.049621105194092, "sampling/importance_sampling_ratio/mean": 0.9866907596588135, "sampling/importance_sampling_ratio/min": 8.326254032908764e-07, "sampling/sampling_logp_difference/max": 2.1079556941986084, "sampling/sampling_logp_difference/mean": 0.2864449620246887, "step": 135, "step_time": 9.989032299010432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3224895521998405, "epoch": 0.00068, "grad_norm": 0.16339419782161713, "kl": 0.2521179523319006, "learning_rate": 7.999999074562691e-06, "loss": -0.1085, "step": 136, "step_time": 5.159156209003413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 5.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9949153065681458, "epoch": 0.000685, "frac_reward_zero_std": 0.0, "grad_norm": 0.13245484232902527, "kl": 0.5161089822649956, "learning_rate": 7.999999055961402e-06, "loss": -0.0608, "num_tokens": 1759860.0, "reward": 0.9270025491714478, "reward_std": 1.2408831119537354, "rewards/rollout_reward_func/mean": 0.9270025491714478, "rewards/rollout_reward_func/std": 1.2408831119537354, "sampling/importance_sampling_ratio/max": 1.6555811166763306, "sampling/importance_sampling_ratio/mean": 0.6745444536209106, "sampling/importance_sampling_ratio/min": 1.3409190557922557e-07, "sampling/sampling_logp_difference/max": 2.3057405948638916, "sampling/sampling_logp_difference/mean": 0.3946884274482727, "step": 137, "step_time": 10.030656296978123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9818265438079834, "epoch": 0.00069, "grad_norm": 0.12508007884025574, "kl": 0.5395314395427704, "learning_rate": 7.999999037175024e-06, "loss": -0.0612, "step": 138, "step_time": 5.6152675619814545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5486100614070892, "epoch": 0.000695, "frac_reward_zero_std": 0.0, "grad_norm": 0.24393755197525024, "kl": 0.5115091651678085, "learning_rate": 7.999999018203562e-06, "loss": -0.1156, "num_tokens": 1787327.0, "reward": 0.740088939666748, "reward_std": 1.1930359601974487, "rewards/rollout_reward_func/mean": 0.740088939666748, "rewards/rollout_reward_func/std": 1.1930359601974487, "sampling/importance_sampling_ratio/max": 1.9546622037887573, "sampling/importance_sampling_ratio/mean": 0.8430712223052979, "sampling/importance_sampling_ratio/min": 0.0001287483173655346, "sampling/sampling_logp_difference/max": 2.1418356895446777, "sampling/sampling_logp_difference/mean": 0.3104485869407654, "step": 139, "step_time": 10.580353190976894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.541521891951561, "epoch": 0.0007, "grad_norm": 0.2321566492319107, "kl": 0.5418380498886108, "learning_rate": 7.99999899904701e-06, "loss": -0.1164, "step": 140, "step_time": 5.269347638008185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0900275334715843, "epoch": 0.000705, "frac_reward_zero_std": 0.0, "grad_norm": 0.19942595064640045, "kl": 0.46472400426864624, "learning_rate": 7.99999897970537e-06, "loss": -0.0339, "num_tokens": 1811574.0, "reward": -0.39852896332740784, "reward_std": 0.8885053396224976, "rewards/rollout_reward_func/mean": -0.39852896332740784, "rewards/rollout_reward_func/std": 0.8885052800178528, "sampling/importance_sampling_ratio/max": 1.492458701133728, "sampling/importance_sampling_ratio/mean": 0.8734211325645447, "sampling/importance_sampling_ratio/min": 0.0002966042666230351, "sampling/sampling_logp_difference/max": 2.1858062744140625, "sampling/sampling_logp_difference/mean": 0.27620014548301697, "step": 141, "step_time": 10.353894897998543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0811446234583855, "epoch": 0.00071, "grad_norm": 0.1970970332622528, "kl": 0.47249414771795273, "learning_rate": 7.999998960178645e-06, "loss": -0.0339, "step": 142, "step_time": 5.579770384996664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7352206856012344, "epoch": 0.000715, "frac_reward_zero_std": 0.0, "grad_norm": 0.15444688498973846, "kl": 0.7006551399827003, "learning_rate": 7.999998940466832e-06, "loss": -0.075, "num_tokens": 1839876.0, "reward": 0.3310365080833435, "reward_std": 1.148134708404541, "rewards/rollout_reward_func/mean": 0.3310365080833435, "rewards/rollout_reward_func/std": 1.148134708404541, "sampling/importance_sampling_ratio/max": 1.675083875656128, "sampling/importance_sampling_ratio/mean": 0.6959143280982971, "sampling/importance_sampling_ratio/min": 1.1225338312215172e-05, "sampling/sampling_logp_difference/max": 2.2899086475372314, "sampling/sampling_logp_difference/mean": 0.38214361667633057, "step": 143, "step_time": 10.681089934005286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.730945646762848, "epoch": 0.00072, "grad_norm": 0.15418070554733276, "kl": 0.7367965802550316, "learning_rate": 7.999998920569931e-06, "loss": -0.0751, "step": 144, "step_time": 5.6262689840077655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4533570408821106, "epoch": 0.000725, "frac_reward_zero_std": 0.0, "grad_norm": 0.3227585554122925, "kl": 0.5545799434185028, "learning_rate": 7.999998900487944e-06, "loss": 0.0168, "num_tokens": 1861502.0, "reward": 0.21466529369354248, "reward_std": 1.2533127069473267, "rewards/rollout_reward_func/mean": 0.21466529369354248, "rewards/rollout_reward_func/std": 1.2533127069473267, "sampling/importance_sampling_ratio/max": 2.3051607608795166, "sampling/importance_sampling_ratio/mean": 0.991920530796051, "sampling/importance_sampling_ratio/min": 0.0018182856729254127, "sampling/sampling_logp_difference/max": 1.76255464553833, "sampling/sampling_logp_difference/mean": 0.2388356477022171, "step": 145, "step_time": 8.589714832982281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4593216180801392, "epoch": 0.00073, "grad_norm": 0.32233935594558716, "kl": 0.5195391923189163, "learning_rate": 7.999998880220868e-06, "loss": 0.016, "step": 146, "step_time": 4.411740761002875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 5.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.323715977370739, "epoch": 0.000735, "frac_reward_zero_std": 0.0, "grad_norm": 0.311855286359787, "kl": 0.5567180588841438, "learning_rate": 7.999998859768705e-06, "loss": -0.1139, "num_tokens": 1889770.0, "reward": 0.4985571801662445, "reward_std": 1.3741074800491333, "rewards/rollout_reward_func/mean": 0.4985571801662445, "rewards/rollout_reward_func/std": 1.3741074800491333, "sampling/importance_sampling_ratio/max": 1.5915650129318237, "sampling/importance_sampling_ratio/mean": 0.8472097516059875, "sampling/importance_sampling_ratio/min": 0.00024315355403814465, "sampling/sampling_logp_difference/max": 1.924583911895752, "sampling/sampling_logp_difference/mean": 0.2968475818634033, "step": 147, "step_time": 9.991272856015712 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 1.3336700797080994, "epoch": 0.00074, "grad_norm": 0.19082602858543396, "kl": 0.5928105413913727, "learning_rate": 7.999998839131454e-06, "loss": -0.1147, "step": 148, "step_time": 5.721222754989867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5478863343596458, "epoch": 0.000745, "frac_reward_zero_std": 0.0, "grad_norm": 0.19265514612197876, "kl": 0.9747177362442017, "learning_rate": 7.999998818309116e-06, "loss": -0.0738, "num_tokens": 1911446.0, "reward": 0.4651053547859192, "reward_std": 1.4452497959136963, "rewards/rollout_reward_func/mean": 0.4651053547859192, "rewards/rollout_reward_func/std": 1.4452499151229858, "sampling/importance_sampling_ratio/max": 2.4700236320495605, "sampling/importance_sampling_ratio/mean": 0.7520056962966919, "sampling/importance_sampling_ratio/min": 0.0010270762722939253, "sampling/sampling_logp_difference/max": 2.023202657699585, "sampling/sampling_logp_difference/mean": 0.31669479608535767, "step": 149, "step_time": 9.910644608986331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 1.5572413206100464, "epoch": 0.00075, "grad_norm": 0.1864304393529892, "kl": 0.9711170643568039, "learning_rate": 7.99999879730169e-06, "loss": -0.0746, "step": 150, "step_time": 5.019605240013334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.3125, "completions/mean_terminated_length": 4.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5691404938697815, "epoch": 0.000755, "frac_reward_zero_std": 0.0, "grad_norm": 1.628533124923706, "kl": 9.296067014336586, "learning_rate": 7.999998776109179e-06, "loss": -0.0233, "num_tokens": 1942017.0, "reward": 1.2791532278060913, "reward_std": 0.7505183815956116, "rewards/rollout_reward_func/mean": 1.2791532278060913, "rewards/rollout_reward_func/std": 0.7505183815956116, "sampling/importance_sampling_ratio/max": 1.8821651935577393, "sampling/importance_sampling_ratio/mean": 1.0893218517303467, "sampling/importance_sampling_ratio/min": 0.05715829133987427, "sampling/sampling_logp_difference/max": 3.2489047050476074, "sampling/sampling_logp_difference/mean": 0.17902812361717224, "step": 151, "step_time": 10.87549658898206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.590184360742569, "epoch": 0.00076, "grad_norm": 0.4314332604408264, "kl": 3.0973178818821907, "learning_rate": 7.999998754731578e-06, "loss": -0.0354, "step": 152, "step_time": 5.82444250601111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3135876879096031, "epoch": 0.000765, "frac_reward_zero_std": 0.0, "grad_norm": 0.10647381097078323, "kl": 0.7397106364369392, "learning_rate": 7.99999873316889e-06, "loss": -0.1102, "num_tokens": 1965146.0, "reward": 1.0957226753234863, "reward_std": 1.3381567001342773, "rewards/rollout_reward_func/mean": 1.0957226753234863, "rewards/rollout_reward_func/std": 1.3381567001342773, "sampling/importance_sampling_ratio/max": 1.6392552852630615, "sampling/importance_sampling_ratio/mean": 0.8900735974311829, "sampling/importance_sampling_ratio/min": 5.7676530559547246e-05, "sampling/sampling_logp_difference/max": 1.550917625427246, "sampling/sampling_logp_difference/mean": 0.32126909494400024, "step": 153, "step_time": 9.420556502009276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3276137560606003, "epoch": 0.00077, "grad_norm": 0.10453101992607117, "kl": 0.6804407387971878, "learning_rate": 7.999998711421117e-06, "loss": -0.1103, "step": 154, "step_time": 4.906970107986126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0972765311598778, "epoch": 0.000775, "frac_reward_zero_std": 0.5, "grad_norm": 0.08652235567569733, "kl": 0.38137199729681015, "learning_rate": 7.999998689488254e-06, "loss": -0.0229, "num_tokens": 1989169.0, "reward": 1.2386428117752075, "reward_std": 0.9783482551574707, "rewards/rollout_reward_func/mean": 1.2386428117752075, "rewards/rollout_reward_func/std": 0.9783482551574707, "sampling/importance_sampling_ratio/max": 1.5282601118087769, "sampling/importance_sampling_ratio/mean": 0.9901463389396667, "sampling/importance_sampling_ratio/min": 0.00022460207401309162, "sampling/sampling_logp_difference/max": 2.3735976219177246, "sampling/sampling_logp_difference/mean": 0.20499774813652039, "step": 155, "step_time": 10.518070799997076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1148130148649216, "epoch": 0.00078, "grad_norm": 0.0883607566356659, "kl": 0.36699508130550385, "learning_rate": 7.999998667370304e-06, "loss": -0.0231, "step": 156, "step_time": 5.576950656002737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 5.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3449519872665405, "epoch": 0.000785, "frac_reward_zero_std": 0.0, "grad_norm": 0.38251546025276184, "kl": 0.5405448526144028, "learning_rate": 7.999998645067266e-06, "loss": -0.1126, "num_tokens": 2015143.0, "reward": 0.39510318636894226, "reward_std": 1.1962493658065796, "rewards/rollout_reward_func/mean": 0.39510318636894226, "rewards/rollout_reward_func/std": 1.1962493658065796, "sampling/importance_sampling_ratio/max": 1.98281991481781, "sampling/importance_sampling_ratio/mean": 0.6687020063400269, "sampling/importance_sampling_ratio/min": 0.00014661147724837065, "sampling/sampling_logp_difference/max": 2.281486988067627, "sampling/sampling_logp_difference/mean": 0.32206833362579346, "step": 157, "step_time": 10.258422788974713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3594027161598206, "epoch": 0.00079, "grad_norm": 0.38052529096603394, "kl": 0.5587325468659401, "learning_rate": 7.999998622579143e-06, "loss": -0.1143, "step": 158, "step_time": 5.213063931980287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 6.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6692862212657928, "epoch": 0.000795, "frac_reward_zero_std": 0.0, "grad_norm": 0.21010734140872955, "kl": 0.4247273616492748, "learning_rate": 7.99999859990593e-06, "loss": -0.0733, "num_tokens": 2039146.0, "reward": 0.012054979801177979, "reward_std": 1.1781718730926514, "rewards/rollout_reward_func/mean": 0.012054979801177979, "rewards/rollout_reward_func/std": 1.1781718730926514, "sampling/importance_sampling_ratio/max": 1.3568586111068726, "sampling/importance_sampling_ratio/mean": 0.5452471375465393, "sampling/importance_sampling_ratio/min": 0.00019512017024680972, "sampling/sampling_logp_difference/max": 2.348021984100342, "sampling/sampling_logp_difference/mean": 0.3477447032928467, "step": 159, "step_time": 10.514781461009989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6756320297718048, "epoch": 0.0008, "grad_norm": 0.19912579655647278, "kl": 0.43176092579960823, "learning_rate": 7.999998577047632e-06, "loss": -0.0735, "step": 160, "step_time": 5.118431397015229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 5.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5698262453079224, "epoch": 0.000805, "frac_reward_zero_std": 0.0, "grad_norm": 0.1054619550704956, "kl": 0.9075517691671848, "learning_rate": 7.999998554004246e-06, "loss": -0.0972, "num_tokens": 2067230.0, "reward": 0.7313723564147949, "reward_std": 1.1968051195144653, "rewards/rollout_reward_func/mean": 0.7313723564147949, "rewards/rollout_reward_func/std": 1.1968051195144653, "sampling/importance_sampling_ratio/max": 1.4517724514007568, "sampling/importance_sampling_ratio/mean": 0.5330005884170532, "sampling/importance_sampling_ratio/min": 0.0002435192873235792, "sampling/sampling_logp_difference/max": 1.5478564500808716, "sampling/sampling_logp_difference/mean": 0.3063850998878479, "step": 161, "step_time": 11.163013421013602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5719172209501266, "epoch": 0.00081, "grad_norm": 0.1037355363368988, "kl": 0.909616731107235, "learning_rate": 7.99999853077577e-06, "loss": -0.0973, "step": 162, "step_time": 5.775730723995366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8465642035007477, "epoch": 0.000815, "frac_reward_zero_std": 0.0, "grad_norm": 0.14831629395484924, "kl": 0.6285284757614136, "learning_rate": 7.999998507362211e-06, "loss": -0.1177, "num_tokens": 2092183.0, "reward": 0.4516575336456299, "reward_std": 1.2587978839874268, "rewards/rollout_reward_func/mean": 0.4516575336456299, "rewards/rollout_reward_func/std": 1.2587980031967163, "sampling/importance_sampling_ratio/max": 1.2965846061706543, "sampling/importance_sampling_ratio/mean": 0.6706132292747498, "sampling/importance_sampling_ratio/min": 3.9674611684858974e-07, "sampling/sampling_logp_difference/max": 2.653749465942383, "sampling/sampling_logp_difference/mean": 0.4199655055999756, "step": 163, "step_time": 10.478798107986222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8518941402435303, "epoch": 0.00082, "grad_norm": 0.13697868585586548, "kl": 0.6427172720432281, "learning_rate": 7.999998483763561e-06, "loss": -0.1176, "step": 164, "step_time": 5.2047887169755995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9862514287233353, "epoch": 0.000825, "frac_reward_zero_std": 0.0, "grad_norm": 0.2885926365852356, "kl": 0.41268399357795715, "learning_rate": 7.999998459979827e-06, "loss": -0.0413, "num_tokens": 2121069.0, "reward": 0.30620235204696655, "reward_std": 1.0755419731140137, "rewards/rollout_reward_func/mean": 0.30620235204696655, "rewards/rollout_reward_func/std": 1.0755419731140137, "sampling/importance_sampling_ratio/max": 1.8146857023239136, "sampling/importance_sampling_ratio/mean": 0.8120746612548828, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.064103126525879, "sampling/sampling_logp_difference/mean": 0.23270340263843536, "step": 165, "step_time": 11.006338214996504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.9826839715242386, "epoch": 0.00083, "grad_norm": 0.2558526396751404, "kl": 0.4262583553791046, "learning_rate": 7.999998436011002e-06, "loss": -0.0419, "step": 166, "step_time": 5.768568781990325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4776695519685745, "epoch": 0.000835, "frac_reward_zero_std": 0.0, "grad_norm": 0.13959939777851105, "kl": 0.44488612934947014, "learning_rate": 7.999998411857091e-06, "loss": -0.0909, "num_tokens": 2146768.0, "reward": 0.31666532158851624, "reward_std": 1.3343416452407837, "rewards/rollout_reward_func/mean": 0.31666532158851624, "rewards/rollout_reward_func/std": 1.3343416452407837, "sampling/importance_sampling_ratio/max": 1.5611106157302856, "sampling/importance_sampling_ratio/mean": 0.9706696271896362, "sampling/importance_sampling_ratio/min": 0.0002178488066419959, "sampling/sampling_logp_difference/max": 1.74723219871521, "sampling/sampling_logp_difference/mean": 0.29170113801956177, "step": 167, "step_time": 10.611509366994142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4627514109015465, "epoch": 0.00084, "grad_norm": 0.12546560168266296, "kl": 0.45020807906985283, "learning_rate": 7.999998387518093e-06, "loss": -0.0916, "step": 168, "step_time": 5.611155764985597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.733333587646484, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.904705360531807, "epoch": 0.000845, "frac_reward_zero_std": 0.0, "grad_norm": 0.2586134374141693, "kl": 0.36313341930508614, "learning_rate": 7.999998362994007e-06, "loss": -0.0925, "num_tokens": 2176074.0, "reward": 1.1551895141601562, "reward_std": 1.100745439529419, "rewards/rollout_reward_func/mean": 1.1551895141601562, "rewards/rollout_reward_func/std": 1.1007453203201294, "sampling/importance_sampling_ratio/max": 1.5749634504318237, "sampling/importance_sampling_ratio/mean": 0.9487511515617371, "sampling/importance_sampling_ratio/min": 0.00028595648473128676, "sampling/sampling_logp_difference/max": 1.7006906270980835, "sampling/sampling_logp_difference/mean": 0.2317337840795517, "step": 169, "step_time": 11.33858570700977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8868665397167206, "epoch": 0.00085, "grad_norm": 0.23468899726867676, "kl": 0.3779219090938568, "learning_rate": 7.999998338284834e-06, "loss": -0.0946, "step": 170, "step_time": 5.613528765999945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 6.230769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.146590441465378, "epoch": 0.000855, "frac_reward_zero_std": 0.0, "grad_norm": 0.1569104641675949, "kl": 1.1155143082141876, "learning_rate": 7.999998313390573e-06, "loss": -0.0432, "num_tokens": 2204009.0, "reward": 0.05591858923435211, "reward_std": 0.9841828942298889, "rewards/rollout_reward_func/mean": 0.05591858923435211, "rewards/rollout_reward_func/std": 0.9841829538345337, "sampling/importance_sampling_ratio/max": 1.5214422941207886, "sampling/importance_sampling_ratio/mean": 0.5220916271209717, "sampling/importance_sampling_ratio/min": 7.668570106034167e-06, "sampling/sampling_logp_difference/max": 2.5211760997772217, "sampling/sampling_logp_difference/mean": 0.45504334568977356, "step": 171, "step_time": 11.035878487004084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.133900746703148, "epoch": 0.00086, "grad_norm": 0.15347075462341309, "kl": 1.1190180703997612, "learning_rate": 7.999998288311227e-06, "loss": -0.0435, "step": 172, "step_time": 5.623268894007197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8297575414180756, "epoch": 0.000865, "frac_reward_zero_std": 0.0, "grad_norm": 0.33131417632102966, "kl": 1.8023043721914291, "learning_rate": 7.999998263046792e-06, "loss": -0.0553, "num_tokens": 2233160.0, "reward": -0.06279589235782623, "reward_std": 1.0063202381134033, "rewards/rollout_reward_func/mean": -0.06279589235782623, "rewards/rollout_reward_func/std": 1.0063202381134033, "sampling/importance_sampling_ratio/max": 1.5583964586257935, "sampling/importance_sampling_ratio/mean": 0.6125948429107666, "sampling/importance_sampling_ratio/min": 4.316862941777799e-06, "sampling/sampling_logp_difference/max": 2.4274351596832275, "sampling/sampling_logp_difference/mean": 0.4162282943725586, "step": 173, "step_time": 10.910007779006264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.827655017375946, "epoch": 0.00087, "grad_norm": 0.31803378462791443, "kl": 1.7699005752801895, "learning_rate": 7.999998237597269e-06, "loss": -0.0555, "step": 174, "step_time": 5.605694048004807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.49971675872802734, "epoch": 0.000875, "frac_reward_zero_std": 0.0, "grad_norm": 0.07416312396526337, "kl": 1.0386766865849495, "learning_rate": 7.99999821196266e-06, "loss": -0.0799, "num_tokens": 2257001.0, "reward": 1.5307725667953491, "reward_std": 0.9640592932701111, "rewards/rollout_reward_func/mean": 1.5307725667953491, "rewards/rollout_reward_func/std": 0.9640593528747559, "sampling/importance_sampling_ratio/max": 1.3889833688735962, "sampling/importance_sampling_ratio/mean": 1.0831341743469238, "sampling/importance_sampling_ratio/min": 0.05231684818863869, "sampling/sampling_logp_difference/max": 1.8971333503723145, "sampling/sampling_logp_difference/mean": 0.12288475036621094, "step": 175, "step_time": 9.769935954987886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5028297603130341, "epoch": 0.00088, "grad_norm": 0.07534410059452057, "kl": 1.0374054685235023, "learning_rate": 7.999998186142964e-06, "loss": -0.0797, "step": 176, "step_time": 5.13259251099953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2167628705501556, "epoch": 0.000885, "frac_reward_zero_std": 0.0, "grad_norm": 0.07958617806434631, "kl": 0.9772776588797569, "learning_rate": 7.999998160138178e-06, "loss": -0.0701, "num_tokens": 2279750.0, "reward": 0.5431874394416809, "reward_std": 1.3092989921569824, "rewards/rollout_reward_func/mean": 0.5431874394416809, "rewards/rollout_reward_func/std": 1.3092989921569824, "sampling/importance_sampling_ratio/max": 1.6021147966384888, "sampling/importance_sampling_ratio/mean": 0.5971626043319702, "sampling/importance_sampling_ratio/min": 1.6051012607931625e-07, "sampling/sampling_logp_difference/max": 2.404000759124756, "sampling/sampling_logp_difference/mean": 0.3927628695964813, "step": 177, "step_time": 10.575020681979368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.214975029230118, "epoch": 0.00089, "grad_norm": 0.08320756256580353, "kl": 0.9892929047346115, "learning_rate": 7.999998133948305e-06, "loss": -0.0702, "step": 178, "step_time": 5.036040941995452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 3.6000001430511475, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.021374821662903, "epoch": 0.000895, "frac_reward_zero_std": 0.0, "grad_norm": 0.20825637876987457, "kl": 0.27866392210125923, "learning_rate": 7.999998107573346e-06, "loss": -0.0771, "num_tokens": 2304500.0, "reward": 0.5630372762680054, "reward_std": 0.9090720415115356, "rewards/rollout_reward_func/mean": 0.5630372762680054, "rewards/rollout_reward_func/std": 0.9090719819068909, "sampling/importance_sampling_ratio/max": 1.5411503314971924, "sampling/importance_sampling_ratio/mean": 0.6329948902130127, "sampling/importance_sampling_ratio/min": 1.4278148228186183e-05, "sampling/sampling_logp_difference/max": 2.091010332107544, "sampling/sampling_logp_difference/mean": 0.37004029750823975, "step": 179, "step_time": 11.364560008005355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0289529263973236, "epoch": 0.0009, "grad_norm": 0.21165414154529572, "kl": 0.28073400259017944, "learning_rate": 7.9999980810133e-06, "loss": -0.0771, "step": 180, "step_time": 6.376429278010619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 6.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2570679783821106, "epoch": 0.000905, "frac_reward_zero_std": 0.0, "grad_norm": 0.2150796353816986, "kl": 0.5372538864612579, "learning_rate": 7.999998054268167e-06, "loss": 0.0217, "num_tokens": 2332483.0, "reward": -0.039103537797927856, "reward_std": 0.9563415050506592, "rewards/rollout_reward_func/mean": -0.039103537797927856, "rewards/rollout_reward_func/std": 0.9563415050506592, "sampling/importance_sampling_ratio/max": 1.2727280855178833, "sampling/importance_sampling_ratio/mean": 0.31620925664901733, "sampling/importance_sampling_ratio/min": 2.6350833195465384e-06, "sampling/sampling_logp_difference/max": 3.031341552734375, "sampling/sampling_logp_difference/mean": 0.5031793117523193, "step": 181, "step_time": 11.089442488009809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.250910133123398, "epoch": 0.00091, "grad_norm": 0.2134333699941635, "kl": 0.4985249191522598, "learning_rate": 7.999998027337947e-06, "loss": 0.0215, "step": 182, "step_time": 5.606239158005337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.5625, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9049576446413994, "epoch": 0.000915, "frac_reward_zero_std": 0.0, "grad_norm": 0.07848678529262543, "kl": 0.21378230676054955, "learning_rate": 7.999998000222637e-06, "loss": -0.1041, "num_tokens": 2360994.0, "reward": 0.3604106903076172, "reward_std": 1.0521128177642822, "rewards/rollout_reward_func/mean": 0.3604106903076172, "rewards/rollout_reward_func/std": 1.0521129369735718, "sampling/importance_sampling_ratio/max": 1.4714319705963135, "sampling/importance_sampling_ratio/mean": 0.6066860556602478, "sampling/importance_sampling_ratio/min": 1.034655451803701e-05, "sampling/sampling_logp_difference/max": 1.7001724243164062, "sampling/sampling_logp_difference/mean": 0.352992445230484, "step": 183, "step_time": 11.339542895977502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9038111492991447, "epoch": 0.00092, "grad_norm": 0.07948275655508041, "kl": 0.2091466560959816, "learning_rate": 7.999997972922241e-06, "loss": -0.1042, "step": 184, "step_time": 5.582007326986059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 5.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3915022015571594, "epoch": 0.000925, "frac_reward_zero_std": 0.0, "grad_norm": 0.16815932095050812, "kl": 0.3531005047261715, "learning_rate": 7.999997945436757e-06, "loss": -0.0958, "num_tokens": 2384363.0, "reward": 0.7610887289047241, "reward_std": 1.1642953157424927, "rewards/rollout_reward_func/mean": 0.7610887289047241, "rewards/rollout_reward_func/std": 1.1642954349517822, "sampling/importance_sampling_ratio/max": 1.393028736114502, "sampling/importance_sampling_ratio/mean": 0.663599967956543, "sampling/importance_sampling_ratio/min": 1.4292562866558e-07, "sampling/sampling_logp_difference/max": 2.515740394592285, "sampling/sampling_logp_difference/mean": 0.403983473777771, "step": 185, "step_time": 10.204717061002157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3844029009342194, "epoch": 0.00093, "grad_norm": 0.16708886623382568, "kl": 0.3768719919025898, "learning_rate": 7.999997917766188e-06, "loss": -0.0962, "step": 186, "step_time": 5.041124770999886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.416264370083809, "epoch": 0.000935, "frac_reward_zero_std": 0.0, "grad_norm": 0.27312055230140686, "kl": 0.38119739666581154, "learning_rate": 7.999997889910529e-06, "loss": -0.0675, "num_tokens": 2415401.0, "reward": 0.8580120801925659, "reward_std": 0.8292725086212158, "rewards/rollout_reward_func/mean": 0.8580120801925659, "rewards/rollout_reward_func/std": 0.8292725086212158, "sampling/importance_sampling_ratio/max": 1.7839237451553345, "sampling/importance_sampling_ratio/mean": 0.8475208878517151, "sampling/importance_sampling_ratio/min": 0.00025581082445569336, "sampling/sampling_logp_difference/max": 1.6096148490905762, "sampling/sampling_logp_difference/mean": 0.3417282998561859, "step": 187, "step_time": 10.928875928992056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4168424904346466, "epoch": 0.00094, "grad_norm": 0.2743455171585083, "kl": 0.39186935871839523, "learning_rate": 7.999997861869784e-06, "loss": -0.0676, "step": 188, "step_time": 5.790889573007007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 5.909090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3235363960266113, "epoch": 0.000945, "frac_reward_zero_std": 0.0, "grad_norm": 0.35959863662719727, "kl": 0.39859750121831894, "learning_rate": 7.999997833643951e-06, "loss": -0.0837, "num_tokens": 2440322.0, "reward": -0.3145470917224884, "reward_std": 0.9280606508255005, "rewards/rollout_reward_func/mean": -0.3145470917224884, "rewards/rollout_reward_func/std": 0.9280607104301453, "sampling/importance_sampling_ratio/max": 2.1830880641937256, "sampling/importance_sampling_ratio/mean": 0.6680892109870911, "sampling/importance_sampling_ratio/min": 1.8410844859317876e-05, "sampling/sampling_logp_difference/max": 2.564990282058716, "sampling/sampling_logp_difference/mean": 0.3927008807659149, "step": 189, "step_time": 11.724017455999274 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01785714365541935, "entropy": 2.321988523006439, "epoch": 0.00095, "grad_norm": 0.05268784239888191, "kl": 0.40984347090125084, "learning_rate": 7.999997805233032e-06, "loss": -0.0848, "step": 190, "step_time": 5.8562916240189224 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2123073935508728, "epoch": 0.000955, "frac_reward_zero_std": 0.0, "grad_norm": 0.10335876792669296, "kl": 0.6373326554894447, "learning_rate": 7.999997776637025e-06, "loss": -0.075, "num_tokens": 2470254.0, "reward": 1.1212207078933716, "reward_std": 0.7573785781860352, "rewards/rollout_reward_func/mean": 1.1212207078933716, "rewards/rollout_reward_func/std": 0.7573785781860352, "sampling/importance_sampling_ratio/max": 1.543009638786316, "sampling/importance_sampling_ratio/mean": 0.7852025628089905, "sampling/importance_sampling_ratio/min": 0.0028459932655096054, "sampling/sampling_logp_difference/max": 2.2731943130493164, "sampling/sampling_logp_difference/mean": 0.2649043798446655, "step": 191, "step_time": 10.854305444983765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2084105014801025, "epoch": 0.00096, "grad_norm": 0.12847360968589783, "kl": 0.6696866378188133, "learning_rate": 7.99999774785593e-06, "loss": -0.075, "step": 192, "step_time": 5.582749810026144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.374019831418991, "epoch": 0.000965, "frac_reward_zero_std": 0.0, "grad_norm": 0.14717084169387817, "kl": 0.21658854186534882, "learning_rate": 7.999997718889747e-06, "loss": -0.0911, "num_tokens": 2494567.0, "reward": 1.1881462335586548, "reward_std": 1.0427972078323364, "rewards/rollout_reward_func/mean": 1.1881462335586548, "rewards/rollout_reward_func/std": 1.042797327041626, "sampling/importance_sampling_ratio/max": 1.2554941177368164, "sampling/importance_sampling_ratio/mean": 0.9316425323486328, "sampling/importance_sampling_ratio/min": 2.5827905574260512e-06, "sampling/sampling_logp_difference/max": 1.5795996189117432, "sampling/sampling_logp_difference/mean": 0.2617455720901489, "step": 193, "step_time": 10.032125383993844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.378207728266716, "epoch": 0.00097, "grad_norm": 0.14948391914367676, "kl": 0.2162497229874134, "learning_rate": 7.999997689738478e-06, "loss": -0.0909, "step": 194, "step_time": 5.236792654002784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 6.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2541489005088806, "epoch": 0.000975, "frac_reward_zero_std": 0.0, "grad_norm": 0.27493223547935486, "kl": 0.17739950492978096, "learning_rate": 7.999997660402122e-06, "loss": -0.0975, "num_tokens": 2514680.0, "reward": -0.8766603469848633, "reward_std": 0.11209217458963394, "rewards/rollout_reward_func/mean": -0.8766603469848633, "rewards/rollout_reward_func/std": 0.11209216713905334, "sampling/importance_sampling_ratio/max": 1.2472249269485474, "sampling/importance_sampling_ratio/mean": 0.3993654251098633, "sampling/importance_sampling_ratio/min": 0.000275112921372056, "sampling/sampling_logp_difference/max": 1.7546954154968262, "sampling/sampling_logp_difference/mean": 0.3330591320991516, "step": 195, "step_time": 8.666724204973434 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.008333333767950535, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "entropy": 2.247138738632202, "epoch": 0.00098, "grad_norm": 0.12053297460079193, "kl": 0.1792053785175085, "learning_rate": 7.999997630880678e-06, "loss": -0.0985, "step": 196, "step_time": 4.100722142015002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9824168980121613, "epoch": 0.000985, "frac_reward_zero_std": 0.0, "grad_norm": 0.22760888934135437, "kl": 1.428950160741806, "learning_rate": 7.999997601174145e-06, "loss": -0.0564, "num_tokens": 2542061.0, "reward": 0.07045963406562805, "reward_std": 1.0446678400039673, "rewards/rollout_reward_func/mean": 0.07045963406562805, "rewards/rollout_reward_func/std": 1.0446678400039673, "sampling/importance_sampling_ratio/max": 2.0922844409942627, "sampling/importance_sampling_ratio/mean": 0.7223848104476929, "sampling/importance_sampling_ratio/min": 5.992424689793552e-07, "sampling/sampling_logp_difference/max": 2.7351913452148438, "sampling/sampling_logp_difference/mean": 0.49829980731010437, "step": 197, "step_time": 11.409631133981748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9785683155059814, "epoch": 0.00099, "grad_norm": 0.21101541817188263, "kl": 1.3301977440714836, "learning_rate": 7.999997571282526e-06, "loss": -0.0567, "step": 198, "step_time": 5.605502839011024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6972186155617237, "epoch": 0.000995, "frac_reward_zero_std": 0.0, "grad_norm": 0.18541546165943146, "kl": 0.35523999482393265, "learning_rate": 7.999997541205821e-06, "loss": -0.0479, "num_tokens": 2565464.0, "reward": 1.0742099285125732, "reward_std": 1.277552604675293, "rewards/rollout_reward_func/mean": 1.0742099285125732, "rewards/rollout_reward_func/std": 1.277552604675293, "sampling/importance_sampling_ratio/max": 1.6158726215362549, "sampling/importance_sampling_ratio/mean": 1.138796329498291, "sampling/importance_sampling_ratio/min": 0.012059012427926064, "sampling/sampling_logp_difference/max": 0.8185960054397583, "sampling/sampling_logp_difference/mean": 0.13259238004684448, "step": 199, "step_time": 10.07565474200237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6994814574718475, "epoch": 0.001, "grad_norm": 0.2042582631111145, "kl": 0.3544727712869644, "learning_rate": 7.99999751094403e-06, "loss": -0.0488, "step": 200, "step_time": 5.9009109449980315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.818181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8216793984174728, "epoch": 0.001005, "frac_reward_zero_std": 0.0, "grad_norm": 0.15797406435012817, "kl": 0.33942490071058273, "learning_rate": 7.999997480497147e-06, "loss": -0.0364, "num_tokens": 2592183.0, "reward": 0.3291100561618805, "reward_std": 1.2402606010437012, "rewards/rollout_reward_func/mean": 0.3291100561618805, "rewards/rollout_reward_func/std": 1.2402607202529907, "sampling/importance_sampling_ratio/max": 1.8047727346420288, "sampling/importance_sampling_ratio/mean": 0.7542750835418701, "sampling/importance_sampling_ratio/min": 3.839510100078769e-05, "sampling/sampling_logp_difference/max": 2.4737164974212646, "sampling/sampling_logp_difference/mean": 0.344121515750885, "step": 201, "step_time": 11.151049727006466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8241359889507294, "epoch": 0.00101, "grad_norm": 0.15178857743740082, "kl": 0.3271215744316578, "learning_rate": 7.99999744986518e-06, "loss": -0.0368, "step": 202, "step_time": 5.832710207992932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4508406445384026, "epoch": 0.001015, "frac_reward_zero_std": 0.0, "grad_norm": 0.5538622140884399, "kl": 0.2601737193763256, "learning_rate": 7.999997419048124e-06, "loss": -0.0698, "num_tokens": 2616530.0, "reward": 0.09978680312633514, "reward_std": 1.3786044120788574, "rewards/rollout_reward_func/mean": 0.09978680312633514, "rewards/rollout_reward_func/std": 1.3786044120788574, "sampling/importance_sampling_ratio/max": 1.4826953411102295, "sampling/importance_sampling_ratio/mean": 0.9053951501846313, "sampling/importance_sampling_ratio/min": 6.5535359681234695e-06, "sampling/sampling_logp_difference/max": 1.720056414604187, "sampling/sampling_logp_difference/mean": 0.2937009632587433, "step": 203, "step_time": 9.553545325019513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.04261363670229912, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04261363670229912, "entropy": 1.4480303972959518, "epoch": 0.00102, "grad_norm": 0.1849849969148636, "kl": 0.2541790120303631, "learning_rate": 7.999997388045983e-06, "loss": -0.0723, "step": 204, "step_time": 5.185900883021532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.447839304804802, "epoch": 0.001025, "frac_reward_zero_std": 0.0, "grad_norm": 0.42235496640205383, "kl": 0.41228246316313744, "learning_rate": 7.999997356858753e-06, "loss": -0.007, "num_tokens": 2635999.0, "reward": -0.929474413394928, "reward_std": 0.17002686858177185, "rewards/rollout_reward_func/mean": -0.929474413394928, "rewards/rollout_reward_func/std": 0.17002686858177185, "sampling/importance_sampling_ratio/max": 1.8955391645431519, "sampling/importance_sampling_ratio/mean": 0.7799770832061768, "sampling/importance_sampling_ratio/min": 0.0005607432103715837, "sampling/sampling_logp_difference/max": 2.6387548446655273, "sampling/sampling_logp_difference/mean": 0.3109760880470276, "step": 205, "step_time": 7.8998627529945225 }, { "clip_ratio/high_max": 0.07500000018626451, "clip_ratio/high_mean": 0.03750000009313226, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04791666707023978, "entropy": 1.4456508457660675, "epoch": 0.00103, "grad_norm": 0.12443115562200546, "kl": 0.34868302568793297, "learning_rate": 7.999997325486435e-06, "loss": -0.0093, "step": 206, "step_time": 4.1782409450097475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9593270421028137, "epoch": 0.001035, "frac_reward_zero_std": 0.0, "grad_norm": 0.1536100059747696, "kl": 0.4644118584692478, "learning_rate": 7.99999729392903e-06, "loss": -0.1223, "num_tokens": 2662053.0, "reward": 0.284379780292511, "reward_std": 1.2431942224502563, "rewards/rollout_reward_func/mean": 0.284379780292511, "rewards/rollout_reward_func/std": 1.243194341659546, "sampling/importance_sampling_ratio/max": 2.0099775791168213, "sampling/importance_sampling_ratio/mean": 0.839763879776001, "sampling/importance_sampling_ratio/min": 1.6259486201875006e-08, "sampling/sampling_logp_difference/max": 2.576575517654419, "sampling/sampling_logp_difference/mean": 0.4968186616897583, "step": 207, "step_time": 10.724217404989759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.9740652441978455, "epoch": 0.00104, "grad_norm": 0.14900703728199005, "kl": 0.5806146897375584, "learning_rate": 7.99999726218654e-06, "loss": -0.1226, "step": 208, "step_time": 5.2677610109967645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3040585964918137, "epoch": 0.001045, "frac_reward_zero_std": 0.0, "grad_norm": 0.3478550314903259, "kl": 1.2698786407709122, "learning_rate": 7.999997230258959e-06, "loss": -0.064, "num_tokens": 2688550.0, "reward": 0.8011084198951721, "reward_std": 1.3367671966552734, "rewards/rollout_reward_func/mean": 0.8011084198951721, "rewards/rollout_reward_func/std": 1.3367671966552734, "sampling/importance_sampling_ratio/max": 1.4643746614456177, "sampling/importance_sampling_ratio/mean": 0.6708874702453613, "sampling/importance_sampling_ratio/min": 3.368727630004287e-05, "sampling/sampling_logp_difference/max": 2.2891688346862793, "sampling/sampling_logp_difference/mean": 0.2644292712211609, "step": 209, "step_time": 10.681676357024116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3102612495422363, "epoch": 0.00105, "grad_norm": 0.3084186613559723, "kl": 1.1751005128026009, "learning_rate": 7.999997198146293e-06, "loss": -0.0649, "step": 210, "step_time": 5.541288682987215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5823302417993546, "epoch": 0.001055, "frac_reward_zero_std": 0.0, "grad_norm": 0.330441415309906, "kl": 0.4175164997577667, "learning_rate": 7.99999716584854e-06, "loss": -0.0655, "num_tokens": 2718786.0, "reward": 0.4249646067619324, "reward_std": 0.9716609120368958, "rewards/rollout_reward_func/mean": 0.4249646067619324, "rewards/rollout_reward_func/std": 0.9716609120368958, "sampling/importance_sampling_ratio/max": 1.4341119527816772, "sampling/importance_sampling_ratio/mean": 0.7468906044960022, "sampling/importance_sampling_ratio/min": 1.2419962331478018e-05, "sampling/sampling_logp_difference/max": 2.113673210144043, "sampling/sampling_logp_difference/mean": 0.30504533648490906, "step": 211, "step_time": 11.33863269998983 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.5920384526252747, "epoch": 0.00106, "grad_norm": 0.16097842156887054, "kl": 0.38928814977407455, "learning_rate": 7.9999971333657e-06, "loss": -0.0658, "step": 212, "step_time": 5.776818436992471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6575380861759186, "epoch": 0.001065, "frac_reward_zero_std": 0.0, "grad_norm": 0.329692006111145, "kl": 0.2301468588411808, "learning_rate": 7.99999710069777e-06, "loss": -0.0187, "num_tokens": 2748472.0, "reward": 0.49865448474884033, "reward_std": 0.9549805521965027, "rewards/rollout_reward_func/mean": 0.49865448474884033, "rewards/rollout_reward_func/std": 0.9549804925918579, "sampling/importance_sampling_ratio/max": 2.9181628227233887, "sampling/importance_sampling_ratio/mean": 1.0421583652496338, "sampling/importance_sampling_ratio/min": 7.811626345777256e-10, "sampling/sampling_logp_difference/max": 1.9303134679794312, "sampling/sampling_logp_difference/mean": 0.3242887854576111, "step": 213, "step_time": 11.099832564999815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6689542084932327, "epoch": 0.00107, "grad_norm": 0.3387160003185272, "kl": 0.2190601322799921, "learning_rate": 7.999997067844755e-06, "loss": -0.0202, "step": 214, "step_time": 5.671561017996282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.858823612332344, "epoch": 0.001075, "frac_reward_zero_std": 0.0, "grad_norm": 0.2636239230632782, "kl": 0.408181332051754, "learning_rate": 7.999997034806652e-06, "loss": -0.0933, "num_tokens": 2767617.0, "reward": 0.19116228818893433, "reward_std": 1.4314587116241455, "rewards/rollout_reward_func/mean": 0.19116228818893433, "rewards/rollout_reward_func/std": 1.4314587116241455, "sampling/importance_sampling_ratio/max": 1.9809857606887817, "sampling/importance_sampling_ratio/mean": 1.071839690208435, "sampling/importance_sampling_ratio/min": 0.016885319724678993, "sampling/sampling_logp_difference/max": 1.3602783679962158, "sampling/sampling_logp_difference/mean": 0.16368861496448517, "step": 215, "step_time": 7.580775276001077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8617050126194954, "epoch": 0.00108, "grad_norm": 0.252139151096344, "kl": 0.41270603239536285, "learning_rate": 7.999997001583462e-06, "loss": -0.0933, "step": 216, "step_time": 4.093556762018125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6092962622642517, "epoch": 0.001085, "frac_reward_zero_std": 0.0, "grad_norm": 0.2347511649131775, "kl": 0.4229845926165581, "learning_rate": 7.999996968175185e-06, "loss": -0.0791, "num_tokens": 2796431.0, "reward": 0.6156083941459656, "reward_std": 1.0912361145019531, "rewards/rollout_reward_func/mean": 0.6156083941459656, "rewards/rollout_reward_func/std": 1.0912361145019531, "sampling/importance_sampling_ratio/max": 1.84894859790802, "sampling/importance_sampling_ratio/mean": 0.8623616695404053, "sampling/importance_sampling_ratio/min": 3.217754738216172e-06, "sampling/sampling_logp_difference/max": 1.599365234375, "sampling/sampling_logp_difference/mean": 0.3125346004962921, "step": 217, "step_time": 10.951748341991333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6169039607048035, "epoch": 0.00109, "grad_norm": 0.23657342791557312, "kl": 0.4384237378835678, "learning_rate": 7.99999693458182e-06, "loss": -0.0797, "step": 218, "step_time": 5.5885160520119825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 5.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2900914251804352, "epoch": 0.001095, "frac_reward_zero_std": 0.0, "grad_norm": 0.194688081741333, "kl": 0.25448746606707573, "learning_rate": 7.999996900803368e-06, "loss": -0.0808, "num_tokens": 2820144.0, "reward": 0.6194009780883789, "reward_std": 1.3555092811584473, "rewards/rollout_reward_func/mean": 0.6194009780883789, "rewards/rollout_reward_func/std": 1.3555091619491577, "sampling/importance_sampling_ratio/max": 1.7295619249343872, "sampling/importance_sampling_ratio/mean": 0.7465708255767822, "sampling/importance_sampling_ratio/min": 0.0018999263411387801, "sampling/sampling_logp_difference/max": 1.3544964790344238, "sampling/sampling_logp_difference/mean": 0.2154594510793686, "step": 219, "step_time": 10.285461702005705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2921043634414673, "epoch": 0.0011, "grad_norm": 0.1885593980550766, "kl": 0.26284773275256157, "learning_rate": 7.999996866839829e-06, "loss": -0.0811, "step": 220, "step_time": 5.246435197987012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 7.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.670769453048706, "epoch": 0.001105, "frac_reward_zero_std": 0.0, "grad_norm": 0.7613117694854736, "kl": 0.19551441352814436, "learning_rate": 7.999996832691203e-06, "loss": -0.1343, "num_tokens": 2845321.0, "reward": -0.508375883102417, "reward_std": 0.7063706517219543, "rewards/rollout_reward_func/mean": -0.508375883102417, "rewards/rollout_reward_func/std": 0.7063706517219543, "sampling/importance_sampling_ratio/max": 2.2972910404205322, "sampling/importance_sampling_ratio/mean": 0.5249801874160767, "sampling/importance_sampling_ratio/min": 3.85756266041426e-06, "sampling/sampling_logp_difference/max": 2.5533299446105957, "sampling/sampling_logp_difference/mean": 0.45862290263175964, "step": 221, "step_time": 11.941129170983913 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015530303586274385, "entropy": 2.650566026568413, "epoch": 0.00111, "grad_norm": 0.235989511013031, "kl": 0.21628504432737827, "learning_rate": 7.999996798357488e-06, "loss": -0.14, "step": 222, "step_time": 5.593466545993579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.188902795314789, "epoch": 0.001115, "frac_reward_zero_std": 0.0, "grad_norm": 0.17180640995502472, "kl": 0.23570674657821655, "learning_rate": 7.999996763838688e-06, "loss": -0.0421, "num_tokens": 2876085.0, "reward": 0.3736106753349304, "reward_std": 0.8860535025596619, "rewards/rollout_reward_func/mean": 0.3736106753349304, "rewards/rollout_reward_func/std": 0.8860534429550171, "sampling/importance_sampling_ratio/max": 1.3890936374664307, "sampling/importance_sampling_ratio/mean": 0.5931527614593506, "sampling/importance_sampling_ratio/min": 3.2376888725593744e-08, "sampling/sampling_logp_difference/max": 2.8913986682891846, "sampling/sampling_logp_difference/mean": 0.4142003655433655, "step": 223, "step_time": 11.080082510015927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.179533302783966, "epoch": 0.00112, "grad_norm": 0.18605512380599976, "kl": 0.24448700994253159, "learning_rate": 7.9999967291348e-06, "loss": -0.0423, "step": 224, "step_time": 5.682992677015136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6806839406490326, "epoch": 0.001125, "frac_reward_zero_std": 0.0, "grad_norm": 0.12484953552484512, "kl": 0.5829749405384064, "learning_rate": 7.999996694245824e-06, "loss": -0.1323, "num_tokens": 2897844.0, "reward": 0.6144769191741943, "reward_std": 1.4123649597167969, "rewards/rollout_reward_func/mean": 0.6144769191741943, "rewards/rollout_reward_func/std": 1.4123649597167969, "sampling/importance_sampling_ratio/max": 2.232877731323242, "sampling/importance_sampling_ratio/mean": 0.7130535840988159, "sampling/importance_sampling_ratio/min": 0.0008326822426170111, "sampling/sampling_logp_difference/max": 2.121084690093994, "sampling/sampling_logp_difference/mean": 0.3428789973258972, "step": 225, "step_time": 9.82744074598304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03437500027939677, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03437500027939677, "entropy": 1.6837027370929718, "epoch": 0.00113, "grad_norm": 0.08928071707487106, "kl": 0.7437407225370407, "learning_rate": 7.99999665917176e-06, "loss": -0.1325, "step": 226, "step_time": 4.980188425004599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 5.363636493682861, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.267116665840149, "epoch": 0.001135, "frac_reward_zero_std": 0.0, "grad_norm": 0.21544763445854187, "kl": 0.48154356330633163, "learning_rate": 7.999996623912611e-06, "loss": -0.0782, "num_tokens": 2928783.0, "reward": 0.2652406692504883, "reward_std": 0.8377887606620789, "rewards/rollout_reward_func/mean": 0.2652406692504883, "rewards/rollout_reward_func/std": 0.8377888202667236, "sampling/importance_sampling_ratio/max": 1.4800795316696167, "sampling/importance_sampling_ratio/mean": 0.605501115322113, "sampling/importance_sampling_ratio/min": 1.859221665512223e-10, "sampling/sampling_logp_difference/max": 2.701185464859009, "sampling/sampling_logp_difference/mean": 0.39189136028289795, "step": 227, "step_time": 11.693135106994305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.257229447364807, "epoch": 0.00114, "grad_norm": 0.21247778832912445, "kl": 0.5195800233632326, "learning_rate": 7.999996588468373e-06, "loss": -0.0788, "step": 228, "step_time": 5.880113592007547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1896789222955704, "epoch": 0.001145, "frac_reward_zero_std": 0.0, "grad_norm": 0.38016706705093384, "kl": 0.26313307881355286, "learning_rate": 7.999996552839049e-06, "loss": 0.0285, "num_tokens": 2958962.0, "reward": 0.22076112031936646, "reward_std": 0.7118640542030334, "rewards/rollout_reward_func/mean": 0.22076112031936646, "rewards/rollout_reward_func/std": 0.7118640542030334, "sampling/importance_sampling_ratio/max": 1.6239290237426758, "sampling/importance_sampling_ratio/mean": 0.798032283782959, "sampling/importance_sampling_ratio/min": 5.402190708991839e-06, "sampling/sampling_logp_difference/max": 2.1631176471710205, "sampling/sampling_logp_difference/mean": 0.30585330724716187, "step": 229, "step_time": 10.669435356001486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1946916431188583, "epoch": 0.00115, "grad_norm": 0.39750245213508606, "kl": 0.26612865552306175, "learning_rate": 7.999996517024637e-06, "loss": 0.0265, "step": 230, "step_time": 5.356448913997156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 5.363636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8471387922763824, "epoch": 0.001155, "frac_reward_zero_std": 0.0, "grad_norm": 0.0699273869395256, "kl": 0.3139121364802122, "learning_rate": 7.999996481025137e-06, "loss": -0.0974, "num_tokens": 2980926.0, "reward": 0.1000637412071228, "reward_std": 1.172484040260315, "rewards/rollout_reward_func/mean": 0.1000637412071228, "rewards/rollout_reward_func/std": 1.172484040260315, "sampling/importance_sampling_ratio/max": 1.2288416624069214, "sampling/importance_sampling_ratio/mean": 0.597937822341919, "sampling/importance_sampling_ratio/min": 0.00047270426875911653, "sampling/sampling_logp_difference/max": 1.4154832363128662, "sampling/sampling_logp_difference/mean": 0.2819620966911316, "step": 231, "step_time": 9.95180364800035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8447609841823578, "epoch": 0.00116, "grad_norm": 0.0727645754814148, "kl": 0.29940796457231045, "learning_rate": 7.99999644484055e-06, "loss": -0.0974, "step": 232, "step_time": 4.82232644900796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.777403011918068, "epoch": 0.001165, "frac_reward_zero_std": 0.0, "grad_norm": 0.36788177490234375, "kl": 0.35352158546447754, "learning_rate": 7.999996408470877e-06, "loss": -0.0504, "num_tokens": 3009180.0, "reward": 0.5980522632598877, "reward_std": 1.1114522218704224, "rewards/rollout_reward_func/mean": 0.5980522632598877, "rewards/rollout_reward_func/std": 1.1114522218704224, "sampling/importance_sampling_ratio/max": 1.4484837055206299, "sampling/importance_sampling_ratio/mean": 0.7523267269134521, "sampling/importance_sampling_ratio/min": 2.8849572117906064e-06, "sampling/sampling_logp_difference/max": 1.943435549736023, "sampling/sampling_logp_difference/mean": 0.4140724837779999, "step": 233, "step_time": 10.231780272020842 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.785219445824623, "epoch": 0.00117, "grad_norm": 0.3403241038322449, "kl": 0.33868465200066566, "learning_rate": 7.999996371916116e-06, "loss": -0.0525, "step": 234, "step_time": 5.391081601017504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8646888136863708, "epoch": 0.001175, "frac_reward_zero_std": 0.0, "grad_norm": 0.12764638662338257, "kl": 0.18677347339689732, "learning_rate": 7.999996335176269e-06, "loss": -0.0948, "num_tokens": 3034061.0, "reward": -0.05084031820297241, "reward_std": 1.1976794004440308, "rewards/rollout_reward_func/mean": -0.05084031820297241, "rewards/rollout_reward_func/std": 1.1976794004440308, "sampling/importance_sampling_ratio/max": 1.5182585716247559, "sampling/importance_sampling_ratio/mean": 0.6278291940689087, "sampling/importance_sampling_ratio/min": 9.160958870779723e-05, "sampling/sampling_logp_difference/max": 1.584362506866455, "sampling/sampling_logp_difference/mean": 0.2673160433769226, "step": 235, "step_time": 10.44895540800644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.871199607849121, "epoch": 0.00118, "grad_norm": 0.12652702629566193, "kl": 0.18027149699628353, "learning_rate": 7.999996298251333e-06, "loss": -0.0953, "step": 236, "step_time": 5.101086324022617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 5.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9510152339935303, "epoch": 0.001185, "frac_reward_zero_std": 0.0, "grad_norm": 0.16005262732505798, "kl": 0.4537510685622692, "learning_rate": 7.99999626114131e-06, "loss": -0.0346, "num_tokens": 3063314.0, "reward": 0.06652283668518066, "reward_std": 1.0345544815063477, "rewards/rollout_reward_func/mean": 0.06652283668518066, "rewards/rollout_reward_func/std": 1.0345546007156372, "sampling/importance_sampling_ratio/max": 1.526026964187622, "sampling/importance_sampling_ratio/mean": 0.5132787227630615, "sampling/importance_sampling_ratio/min": 2.5752558940439485e-05, "sampling/sampling_logp_difference/max": 1.645466923713684, "sampling/sampling_logp_difference/mean": 0.3029615581035614, "step": 237, "step_time": 11.502920427999925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9481127560138702, "epoch": 0.00119, "grad_norm": 0.16091206669807434, "kl": 0.44866692274808884, "learning_rate": 7.9999962238462e-06, "loss": -0.0345, "step": 238, "step_time": 5.619397064001532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 6.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8593705892562866, "epoch": 0.001195, "frac_reward_zero_std": 0.0, "grad_norm": 0.07207290083169937, "kl": 0.7105748802423477, "learning_rate": 7.999996186366002e-06, "loss": -0.061, "num_tokens": 3087572.0, "reward": 0.6417911052703857, "reward_std": 1.4272098541259766, "rewards/rollout_reward_func/mean": 0.6417911052703857, "rewards/rollout_reward_func/std": 1.4272098541259766, "sampling/importance_sampling_ratio/max": 1.5021177530288696, "sampling/importance_sampling_ratio/mean": 0.5956293344497681, "sampling/importance_sampling_ratio/min": 4.18612262365059e-06, "sampling/sampling_logp_difference/max": 2.5283689498901367, "sampling/sampling_logp_difference/mean": 0.4110857844352722, "step": 239, "step_time": 10.309698556986405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8662011325359344, "epoch": 0.0012, "grad_norm": 0.09728999435901642, "kl": 0.7887490913271904, "learning_rate": 7.999996148700719e-06, "loss": -0.061, "step": 240, "step_time": 5.105003124990617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 5.363636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7423527091741562, "epoch": 0.001205, "frac_reward_zero_std": 0.0, "grad_norm": 0.13316544890403748, "kl": 0.2540152072906494, "learning_rate": 7.999996110850347e-06, "loss": -0.041, "num_tokens": 3112470.0, "reward": 0.6128761768341064, "reward_std": 1.3822180032730103, "rewards/rollout_reward_func/mean": 0.6128761768341064, "rewards/rollout_reward_func/std": 1.3822181224822998, "sampling/importance_sampling_ratio/max": 1.5613194704055786, "sampling/importance_sampling_ratio/mean": 0.5588198900222778, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9420166015625, "sampling/sampling_logp_difference/mean": 0.3035857677459717, "step": 241, "step_time": 10.537815503004822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7466114908456802, "epoch": 0.00121, "grad_norm": 0.1413937360048294, "kl": 0.25381311029195786, "learning_rate": 7.999996072814888e-06, "loss": -0.0403, "step": 242, "step_time": 4.906154252006672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 10.0625, "completions/mean_terminated_length": 4.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6902716159820557, "epoch": 0.001215, "frac_reward_zero_std": 0.0, "grad_norm": 0.26292479038238525, "kl": 0.23144599236547947, "learning_rate": 7.999996034594342e-06, "loss": -0.1102, "num_tokens": 3141701.0, "reward": 0.1574999988079071, "reward_std": 0.9185966849327087, "rewards/rollout_reward_func/mean": 0.1574999988079071, "rewards/rollout_reward_func/std": 0.9185967445373535, "sampling/importance_sampling_ratio/max": 1.9667589664459229, "sampling/importance_sampling_ratio/mean": 0.5538327693939209, "sampling/importance_sampling_ratio/min": 7.372660260074326e-10, "sampling/sampling_logp_difference/max": 2.680985689163208, "sampling/sampling_logp_difference/mean": 0.5362745523452759, "step": 243, "step_time": 11.6806640040013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.683416724205017, "epoch": 0.00122, "grad_norm": 0.2615632712841034, "kl": 0.23494082875549793, "learning_rate": 7.99999599618871e-06, "loss": -0.1109, "step": 244, "step_time": 5.832673219018034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 5.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8420428931713104, "epoch": 0.001225, "frac_reward_zero_std": 0.0, "grad_norm": 0.15692739188671112, "kl": 0.46559081226587296, "learning_rate": 7.999995957597988e-06, "loss": -0.0979, "num_tokens": 3161261.0, "reward": 1.1872198581695557, "reward_std": 1.2836792469024658, "rewards/rollout_reward_func/mean": 1.1872198581695557, "rewards/rollout_reward_func/std": 1.2836792469024658, "sampling/importance_sampling_ratio/max": 1.41400945186615, "sampling/importance_sampling_ratio/mean": 0.6746201515197754, "sampling/importance_sampling_ratio/min": 1.6134454199345782e-05, "sampling/sampling_logp_difference/max": 2.2568562030792236, "sampling/sampling_logp_difference/mean": 0.3484262228012085, "step": 245, "step_time": 7.407037457000115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8394230902194977, "epoch": 0.00123, "grad_norm": 0.1467793583869934, "kl": 0.48687357548624277, "learning_rate": 7.99999591882218e-06, "loss": -0.0983, "step": 246, "step_time": 3.769606793008279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.125, "completions/mean_terminated_length": 5.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.157475918531418, "epoch": 0.001235, "frac_reward_zero_std": 0.0, "grad_norm": 0.09687697887420654, "kl": 0.2646772563457489, "learning_rate": 7.999995879861286e-06, "loss": -0.1045, "num_tokens": 3190019.0, "reward": 0.6926649808883667, "reward_std": 1.1061100959777832, "rewards/rollout_reward_func/mean": 0.6926649808883667, "rewards/rollout_reward_func/std": 1.1061100959777832, "sampling/importance_sampling_ratio/max": 1.7377722263336182, "sampling/importance_sampling_ratio/mean": 0.6021935939788818, "sampling/importance_sampling_ratio/min": 1.9430598285907763e-07, "sampling/sampling_logp_difference/max": 2.6663451194763184, "sampling/sampling_logp_difference/mean": 0.4211345613002777, "step": 247, "step_time": 11.026698448011302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1496640145778656, "epoch": 0.00124, "grad_norm": 0.10055594891309738, "kl": 0.27879996225237846, "learning_rate": 7.999995840715304e-06, "loss": -0.1049, "step": 248, "step_time": 5.3061516510060756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 6.615385055541992, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7570375800132751, "epoch": 0.001245, "frac_reward_zero_std": 0.0, "grad_norm": 0.27006036043167114, "kl": 0.36185211315751076, "learning_rate": 7.999995801384234e-06, "loss": -0.0502, "num_tokens": 3220051.0, "reward": 0.031009051948785782, "reward_std": 0.7932913899421692, "rewards/rollout_reward_func/mean": 0.031009051948785782, "rewards/rollout_reward_func/std": 0.7932913899421692, "sampling/importance_sampling_ratio/max": 1.4356775283813477, "sampling/importance_sampling_ratio/mean": 0.5278638601303101, "sampling/importance_sampling_ratio/min": 8.387453272007406e-05, "sampling/sampling_logp_difference/max": 2.2720842361450195, "sampling/sampling_logp_difference/mean": 0.3254013955593109, "step": 249, "step_time": 11.491477820003638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009615384973585606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "entropy": 1.7461607605218887, "epoch": 0.00125, "grad_norm": 0.27883872389793396, "kl": 0.3700099512934685, "learning_rate": 7.999995761868076e-06, "loss": -0.0509, "step": 250, "step_time": 5.988081552000949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3184734284877777, "epoch": 0.001255, "frac_reward_zero_std": 0.0, "grad_norm": 0.1697387844324112, "kl": 0.46602077409625053, "learning_rate": 7.999995722166832e-06, "loss": -0.1038, "num_tokens": 3243701.0, "reward": 0.45106789469718933, "reward_std": 1.2248862981796265, "rewards/rollout_reward_func/mean": 0.45106789469718933, "rewards/rollout_reward_func/std": 1.224886178970337, "sampling/importance_sampling_ratio/max": 1.3628265857696533, "sampling/importance_sampling_ratio/mean": 0.5323365926742554, "sampling/importance_sampling_ratio/min": 8.202395292755682e-06, "sampling/sampling_logp_difference/max": 1.9791086912155151, "sampling/sampling_logp_difference/mean": 0.37616151571273804, "step": 251, "step_time": 10.581285218009725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.307445228099823, "epoch": 0.00126, "grad_norm": 0.1552981585264206, "kl": 0.45217957627028227, "learning_rate": 7.999995682280502e-06, "loss": -0.1045, "step": 252, "step_time": 5.133333435995155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2860594764351845, "epoch": 0.001265, "frac_reward_zero_std": 0.0, "grad_norm": 0.27134740352630615, "kl": 0.3490382470190525, "learning_rate": 7.999995642209084e-06, "loss": -0.041, "num_tokens": 3271789.0, "reward": 0.5547933578491211, "reward_std": 1.2988783121109009, "rewards/rollout_reward_func/mean": 0.5547933578491211, "rewards/rollout_reward_func/std": 1.2988784313201904, "sampling/importance_sampling_ratio/max": 1.4184446334838867, "sampling/importance_sampling_ratio/mean": 0.8109256029129028, "sampling/importance_sampling_ratio/min": 0.0004770495870616287, "sampling/sampling_logp_difference/max": 1.688154935836792, "sampling/sampling_logp_difference/mean": 0.214344322681427, "step": 253, "step_time": 10.236597531984444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2876719906926155, "epoch": 0.00127, "grad_norm": 0.27261045575141907, "kl": 0.34831427596509457, "learning_rate": 7.999995601952578e-06, "loss": -0.0411, "step": 254, "step_time": 5.311843744988437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.542632520198822, "epoch": 0.001275, "frac_reward_zero_std": 0.0, "grad_norm": 0.2530660033226013, "kl": 0.3399149626493454, "learning_rate": 7.999995561510986e-06, "loss": -0.0187, "num_tokens": 3297850.0, "reward": 0.004683598875999451, "reward_std": 0.9269005060195923, "rewards/rollout_reward_func/mean": 0.004683598875999451, "rewards/rollout_reward_func/std": 0.9269005060195923, "sampling/importance_sampling_ratio/max": 1.3038924932479858, "sampling/importance_sampling_ratio/mean": 0.6961873769760132, "sampling/importance_sampling_ratio/min": 2.190445911764982e-06, "sampling/sampling_logp_difference/max": 2.014331340789795, "sampling/sampling_logp_difference/mean": 0.23915962874889374, "step": 255, "step_time": 11.247671618984896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5525882840156555, "epoch": 0.00128, "grad_norm": 0.251810759305954, "kl": 0.33408671244978905, "learning_rate": 7.999995520884306e-06, "loss": -0.0198, "step": 256, "step_time": 5.519446749007329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3634364306926727, "epoch": 0.001285, "frac_reward_zero_std": 0.0, "grad_norm": 0.050544995814561844, "kl": 0.21641655266284943, "learning_rate": 7.99999548007254e-06, "loss": -0.1166, "num_tokens": 3321401.0, "reward": 0.305082768201828, "reward_std": 1.3236279487609863, "rewards/rollout_reward_func/mean": 0.305082768201828, "rewards/rollout_reward_func/std": 1.3236280679702759, "sampling/importance_sampling_ratio/max": 1.4028531312942505, "sampling/importance_sampling_ratio/mean": 0.8013430833816528, "sampling/importance_sampling_ratio/min": 0.0001402806956321001, "sampling/sampling_logp_difference/max": 1.8980402946472168, "sampling/sampling_logp_difference/mean": 0.2726154327392578, "step": 257, "step_time": 9.610294591009733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3663774579763412, "epoch": 0.00129, "grad_norm": 0.052494436502456665, "kl": 0.21605664119124413, "learning_rate": 7.999995439075685e-06, "loss": -0.1165, "step": 258, "step_time": 4.892771772007109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3621182180941105, "epoch": 0.001295, "frac_reward_zero_std": 0.0, "grad_norm": 0.1275741308927536, "kl": 0.21479728817939758, "learning_rate": 7.999995397893743e-06, "loss": -0.1119, "num_tokens": 3343382.0, "reward": 1.58572256565094, "reward_std": 0.9918658137321472, "rewards/rollout_reward_func/mean": 1.58572256565094, "rewards/rollout_reward_func/std": 0.9918658137321472, "sampling/importance_sampling_ratio/max": 1.2791683673858643, "sampling/importance_sampling_ratio/mean": 0.8634887933731079, "sampling/importance_sampling_ratio/min": 1.368550965707982e-05, "sampling/sampling_logp_difference/max": 1.8574490547180176, "sampling/sampling_logp_difference/mean": 0.2739109992980957, "step": 259, "step_time": 9.742048508007429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3671377561986446, "epoch": 0.0013, "grad_norm": 0.1379302740097046, "kl": 0.21543843112885952, "learning_rate": 7.999995356526716e-06, "loss": -0.112, "step": 260, "step_time": 5.220866580988513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 5.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4922121614217758, "epoch": 0.001305, "frac_reward_zero_std": 0.0, "grad_norm": 0.4242532253265381, "kl": 2.977185278199613, "learning_rate": 7.9999953149746e-06, "loss": -0.0906, "num_tokens": 3366509.0, "reward": 0.8154324889183044, "reward_std": 1.324123740196228, "rewards/rollout_reward_func/mean": 0.8154324889183044, "rewards/rollout_reward_func/std": 1.3241238594055176, "sampling/importance_sampling_ratio/max": 1.1749870777130127, "sampling/importance_sampling_ratio/mean": 0.6759510040283203, "sampling/importance_sampling_ratio/min": 0.0029170357156544924, "sampling/sampling_logp_difference/max": 3.0181963443756104, "sampling/sampling_logp_difference/mean": 0.23584312200546265, "step": 261, "step_time": 10.886400592004065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4915464296936989, "epoch": 0.00131, "grad_norm": 0.38303253054618835, "kl": 2.4797389302402735, "learning_rate": 7.999995273237395e-06, "loss": -0.0921, "step": 262, "step_time": 5.259191079996526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 4.090909004211426, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9340612292289734, "epoch": 0.001315, "frac_reward_zero_std": 0.0, "grad_norm": 0.19741232693195343, "kl": 0.571202140301466, "learning_rate": 7.999995231315105e-06, "loss": -0.0778, "num_tokens": 3396468.0, "reward": 0.5183576941490173, "reward_std": 0.9524552822113037, "rewards/rollout_reward_func/mean": 0.5183576941490173, "rewards/rollout_reward_func/std": 0.9524552822113037, "sampling/importance_sampling_ratio/max": 1.759539246559143, "sampling/importance_sampling_ratio/mean": 0.7496333122253418, "sampling/importance_sampling_ratio/min": 9.344095985852618e-08, "sampling/sampling_logp_difference/max": 2.517127275466919, "sampling/sampling_logp_difference/mean": 0.443209171295166, "step": 263, "step_time": 11.205427364999196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.946344718337059, "epoch": 0.00132, "grad_norm": 0.1933220475912094, "kl": 0.4944137744605541, "learning_rate": 7.999995189207729e-06, "loss": -0.0784, "step": 264, "step_time": 5.650861339978292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9165979623794556, "epoch": 0.001325, "frac_reward_zero_std": 0.0, "grad_norm": 0.21388721466064453, "kl": 0.564408540725708, "learning_rate": 7.999995146915264e-06, "loss": -0.0504, "num_tokens": 3425406.0, "reward": 0.2027512937784195, "reward_std": 1.0146524906158447, "rewards/rollout_reward_func/mean": 0.2027512937784195, "rewards/rollout_reward_func/std": 1.0146524906158447, "sampling/importance_sampling_ratio/max": 1.4431047439575195, "sampling/importance_sampling_ratio/mean": 0.5360437631607056, "sampling/importance_sampling_ratio/min": 0.0005560913705267012, "sampling/sampling_logp_difference/max": 1.7929612398147583, "sampling/sampling_logp_difference/mean": 0.3172757029533386, "step": 265, "step_time": 10.869505600989214 }, { "clip_ratio/high_max": 0.05048076994717121, "clip_ratio/high_mean": 0.025240384973585606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025240384973585606, "entropy": 1.9225394427776337, "epoch": 0.00133, "grad_norm": 0.1624724417924881, "kl": 0.4886353388428688, "learning_rate": 7.999995104437712e-06, "loss": -0.0508, "step": 266, "step_time": 5.39498805598123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5072721540927887, "epoch": 0.001335, "frac_reward_zero_std": 0.0, "grad_norm": 0.11921078711748123, "kl": 0.55628277733922, "learning_rate": 7.999995061775074e-06, "loss": -0.0155, "num_tokens": 3452883.0, "reward": 0.10426148772239685, "reward_std": 1.0035289525985718, "rewards/rollout_reward_func/mean": 0.10426148772239685, "rewards/rollout_reward_func/std": 1.0035289525985718, "sampling/importance_sampling_ratio/max": 1.3998394012451172, "sampling/importance_sampling_ratio/mean": 0.6794506907463074, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.130772590637207, "sampling/sampling_logp_difference/mean": 0.3111976087093353, "step": 267, "step_time": 10.005820960985147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5231688022613525, "epoch": 0.00134, "grad_norm": 0.12856177985668182, "kl": 0.5054917521774769, "learning_rate": 7.999995018927347e-06, "loss": -0.0159, "step": 268, "step_time": 5.282121950003784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 5.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.148701637983322, "epoch": 0.001345, "frac_reward_zero_std": 0.0, "grad_norm": 0.24326962232589722, "kl": 0.9832627922296524, "learning_rate": 7.999994975894534e-06, "loss": -0.0441, "num_tokens": 3478957.0, "reward": -0.230757474899292, "reward_std": 0.8765544295310974, "rewards/rollout_reward_func/mean": -0.230757474899292, "rewards/rollout_reward_func/std": 0.8765544295310974, "sampling/importance_sampling_ratio/max": 1.4538041353225708, "sampling/importance_sampling_ratio/mean": 0.6288132667541504, "sampling/importance_sampling_ratio/min": 1.141610397326076e-07, "sampling/sampling_logp_difference/max": 2.447627305984497, "sampling/sampling_logp_difference/mean": 0.4202445149421692, "step": 269, "step_time": 10.387154703988926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1558148860931396, "epoch": 0.00135, "grad_norm": 0.2207786589860916, "kl": 0.8019575551152229, "learning_rate": 7.999994932676635e-06, "loss": -0.0444, "step": 270, "step_time": 5.30813373401179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7649448774755001, "epoch": 0.001355, "frac_reward_zero_std": 0.0, "grad_norm": 0.04678642749786377, "kl": 0.3885515667498112, "learning_rate": 7.999994889273647e-06, "loss": -0.0859, "num_tokens": 3497789.0, "reward": 1.6186199188232422, "reward_std": 1.003922462463379, "rewards/rollout_reward_func/mean": 1.6186199188232422, "rewards/rollout_reward_func/std": 1.0039225816726685, "sampling/importance_sampling_ratio/max": 1.3583191633224487, "sampling/importance_sampling_ratio/mean": 0.9865636825561523, "sampling/importance_sampling_ratio/min": 9.571083501214162e-05, "sampling/sampling_logp_difference/max": 1.319462776184082, "sampling/sampling_logp_difference/mean": 0.18784859776496887, "step": 271, "step_time": 7.500699674987118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7674944810569286, "epoch": 0.00136, "grad_norm": 0.048121411353349686, "kl": 0.38524676859378815, "learning_rate": 7.999994845685572e-06, "loss": -0.0858, "step": 272, "step_time": 4.062612732988782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.928571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.890156239271164, "epoch": 0.001365, "frac_reward_zero_std": 0.0, "grad_norm": 0.19381022453308105, "kl": 0.2677001953125, "learning_rate": 7.99999480191241e-06, "loss": -0.0597, "num_tokens": 3520973.0, "reward": 1.0841317176818848, "reward_std": 1.0578515529632568, "rewards/rollout_reward_func/mean": 1.0841317176818848, "rewards/rollout_reward_func/std": 1.0578515529632568, "sampling/importance_sampling_ratio/max": 1.32606041431427, "sampling/importance_sampling_ratio/mean": 0.6840971112251282, "sampling/importance_sampling_ratio/min": 1.8271543012815528e-05, "sampling/sampling_logp_difference/max": 1.926069736480713, "sampling/sampling_logp_difference/mean": 0.3081067204475403, "step": 273, "step_time": 11.38104962502257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.889025717973709, "epoch": 0.00137, "grad_norm": 0.19467991590499878, "kl": 0.26718878746032715, "learning_rate": 7.99999475795416e-06, "loss": -0.0601, "step": 274, "step_time": 5.816642904988839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.052934616804123, "epoch": 0.001375, "frac_reward_zero_std": 0.0, "grad_norm": 0.19586637616157532, "kl": 0.49614859744906425, "learning_rate": 7.999994713810826e-06, "loss": -0.1316, "num_tokens": 3548574.0, "reward": 0.4168613851070404, "reward_std": 1.1648529767990112, "rewards/rollout_reward_func/mean": 0.4168613851070404, "rewards/rollout_reward_func/std": 1.1648529767990112, "sampling/importance_sampling_ratio/max": 1.6735233068466187, "sampling/importance_sampling_ratio/mean": 0.6375086307525635, "sampling/importance_sampling_ratio/min": 1.05380865988991e-06, "sampling/sampling_logp_difference/max": 1.672742247581482, "sampling/sampling_logp_difference/mean": 0.4057219326496124, "step": 275, "step_time": 11.338834553011111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.049794539809227, "epoch": 0.00138, "grad_norm": 0.18664216995239258, "kl": 0.5445880517363548, "learning_rate": 7.999994669482402e-06, "loss": -0.1319, "step": 276, "step_time": 5.613565680017928 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.125, "completions/mean_terminated_length": 5.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0137690901756287, "epoch": 0.001385, "frac_reward_zero_std": 0.0, "grad_norm": 0.14326854050159454, "kl": 0.21554972464218736, "learning_rate": 7.999994624968891e-06, "loss": -0.0928, "num_tokens": 3575427.0, "reward": -0.11267539113759995, "reward_std": 1.079444169998169, "rewards/rollout_reward_func/mean": -0.11267539113759995, "rewards/rollout_reward_func/std": 1.079444408416748, "sampling/importance_sampling_ratio/max": 1.6490052938461304, "sampling/importance_sampling_ratio/mean": 0.41216540336608887, "sampling/importance_sampling_ratio/min": 1.671003246883629e-07, "sampling/sampling_logp_difference/max": 2.5495574474334717, "sampling/sampling_logp_difference/mean": 0.49545973539352417, "step": 277, "step_time": 11.17731662800361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00657894741743803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 3.0042057633399963, "epoch": 0.00139, "grad_norm": 0.14034047722816467, "kl": 0.24021166190505028, "learning_rate": 7.999994580270293e-06, "loss": -0.0928, "step": 278, "step_time": 5.601504356003716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1146882772445679, "epoch": 0.001395, "frac_reward_zero_std": 0.0, "grad_norm": 0.21629108488559723, "kl": 0.34001918137073517, "learning_rate": 7.999994535386609e-06, "loss": -0.0675, "num_tokens": 3604104.0, "reward": 0.7018243670463562, "reward_std": 1.1516263484954834, "rewards/rollout_reward_func/mean": 0.7018243670463562, "rewards/rollout_reward_func/std": 1.151626467704773, "sampling/importance_sampling_ratio/max": 1.5478854179382324, "sampling/importance_sampling_ratio/mean": 0.9032884836196899, "sampling/importance_sampling_ratio/min": 0.0020561928395181894, "sampling/sampling_logp_difference/max": 1.6416089534759521, "sampling/sampling_logp_difference/mean": 0.24393077194690704, "step": 279, "step_time": 10.867938069990487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1053155809640884, "epoch": 0.0014, "grad_norm": 0.2264469414949417, "kl": 0.3374170884490013, "learning_rate": 7.999994490317837e-06, "loss": -0.0685, "step": 280, "step_time": 5.828664335000212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2167787700891495, "epoch": 0.001405, "frac_reward_zero_std": 0.0, "grad_norm": 0.20311513543128967, "kl": 0.49450377002358437, "learning_rate": 7.999994445063977e-06, "loss": -0.0808, "num_tokens": 3627721.0, "reward": 1.2695283889770508, "reward_std": 1.2080130577087402, "rewards/rollout_reward_func/mean": 1.2695283889770508, "rewards/rollout_reward_func/std": 1.2080131769180298, "sampling/importance_sampling_ratio/max": 1.3611929416656494, "sampling/importance_sampling_ratio/mean": 0.7240141034126282, "sampling/importance_sampling_ratio/min": 0.0011297499295324087, "sampling/sampling_logp_difference/max": 1.694256067276001, "sampling/sampling_logp_difference/mean": 0.24213850498199463, "step": 281, "step_time": 10.151260993996402 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.2020557448267937, "epoch": 0.00141, "grad_norm": 0.15922896564006805, "kl": 0.553026843816042, "learning_rate": 7.999994399625032e-06, "loss": -0.0816, "step": 282, "step_time": 5.047948904000805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9121569842100143, "epoch": 0.001415, "frac_reward_zero_std": 0.0, "grad_norm": 0.15851442515850067, "kl": 0.679245863109827, "learning_rate": 7.999994354001e-06, "loss": -0.0823, "num_tokens": 3656029.0, "reward": 0.2677704989910126, "reward_std": 0.9202988743782043, "rewards/rollout_reward_func/mean": 0.2677704989910126, "rewards/rollout_reward_func/std": 0.9202988743782043, "sampling/importance_sampling_ratio/max": 1.4737193584442139, "sampling/importance_sampling_ratio/mean": 0.6761351823806763, "sampling/importance_sampling_ratio/min": 7.565990056690453e-10, "sampling/sampling_logp_difference/max": 2.501542329788208, "sampling/sampling_logp_difference/mean": 0.47500696778297424, "step": 283, "step_time": 11.044970381990424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 1.9010446071624756, "epoch": 0.00142, "grad_norm": 0.16110359132289886, "kl": 0.7596149630844593, "learning_rate": 7.999994308191879e-06, "loss": -0.0826, "step": 284, "step_time": 5.774768172996119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.493580162525177, "epoch": 0.001425, "frac_reward_zero_std": 0.0, "grad_norm": 0.06930053979158401, "kl": 0.5896518677473068, "learning_rate": 7.999994262197671e-06, "loss": -0.1031, "num_tokens": 3684730.0, "reward": 1.1753828525543213, "reward_std": 1.1371759176254272, "rewards/rollout_reward_func/mean": 1.1753828525543213, "rewards/rollout_reward_func/std": 1.1371759176254272, "sampling/importance_sampling_ratio/max": 1.6033353805541992, "sampling/importance_sampling_ratio/mean": 0.7723684906959534, "sampling/importance_sampling_ratio/min": 4.9347847379976884e-05, "sampling/sampling_logp_difference/max": 2.1260459423065186, "sampling/sampling_logp_difference/mean": 0.3006690442562103, "step": 285, "step_time": 11.05291011200461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.4858326017856598, "epoch": 0.00143, "grad_norm": 0.07533411681652069, "kl": 0.6536826565861702, "learning_rate": 7.999994216018377e-06, "loss": -0.1031, "step": 286, "step_time": 5.600599189012428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.363636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6193946599960327, "epoch": 0.001435, "frac_reward_zero_std": 0.0, "grad_norm": 0.045481640845537186, "kl": 0.4085489399731159, "learning_rate": 7.999994169653994e-06, "loss": -0.112, "num_tokens": 3709372.0, "reward": 1.166193962097168, "reward_std": 1.127092957496643, "rewards/rollout_reward_func/mean": 1.166193962097168, "rewards/rollout_reward_func/std": 1.1270930767059326, "sampling/importance_sampling_ratio/max": 1.344879388809204, "sampling/importance_sampling_ratio/mean": 0.7373055219650269, "sampling/importance_sampling_ratio/min": 8.38385458337143e-05, "sampling/sampling_logp_difference/max": 1.8287317752838135, "sampling/sampling_logp_difference/mean": 0.2737196087837219, "step": 287, "step_time": 10.35916704198462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6139472126960754, "epoch": 0.00144, "grad_norm": 0.044782448559999466, "kl": 0.4105014428496361, "learning_rate": 7.999994123104526e-06, "loss": -0.1121, "step": 288, "step_time": 5.171071514007053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 3.545454740524292, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6422449573874474, "epoch": 0.001445, "frac_reward_zero_std": 0.0, "grad_norm": 0.1665170043706894, "kl": 0.32560987025499344, "learning_rate": 7.999994076369971e-06, "loss": -0.0749, "num_tokens": 3739360.0, "reward": 0.5221548080444336, "reward_std": 0.9803319573402405, "rewards/rollout_reward_func/mean": 0.5221548080444336, "rewards/rollout_reward_func/std": 0.98033207654953, "sampling/importance_sampling_ratio/max": 1.7347246408462524, "sampling/importance_sampling_ratio/mean": 0.7983744144439697, "sampling/importance_sampling_ratio/min": 4.250712208886398e-07, "sampling/sampling_logp_difference/max": 2.457787036895752, "sampling/sampling_logp_difference/mean": 0.39074814319610596, "step": 289, "step_time": 11.405403942000703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6354467645287514, "epoch": 0.00145, "grad_norm": 0.17089040577411652, "kl": 0.30498629342764616, "learning_rate": 7.999994029450328e-06, "loss": -0.0759, "step": 290, "step_time": 5.866442229002132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7329985350370407, "epoch": 0.001455, "frac_reward_zero_std": 0.0, "grad_norm": 0.12287451326847076, "kl": 0.5834894739091396, "learning_rate": 7.999993982345597e-06, "loss": -0.0605, "num_tokens": 3761404.0, "reward": 1.3241143226623535, "reward_std": 0.9750368595123291, "rewards/rollout_reward_func/mean": 1.3241143226623535, "rewards/rollout_reward_func/std": 0.9750368595123291, "sampling/importance_sampling_ratio/max": 1.596002459526062, "sampling/importance_sampling_ratio/mean": 0.9096636772155762, "sampling/importance_sampling_ratio/min": 0.00010315956751583144, "sampling/sampling_logp_difference/max": 1.8556137084960938, "sampling/sampling_logp_difference/mean": 0.21516281366348267, "step": 291, "step_time": 10.034046957007376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7258673906326294, "epoch": 0.00146, "grad_norm": 0.11953559517860413, "kl": 0.6303103268146515, "learning_rate": 7.99999393505578e-06, "loss": -0.0607, "step": 292, "step_time": 5.220318496008986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.239456232637167, "epoch": 0.001465, "frac_reward_zero_std": 0.0, "grad_norm": 0.39765840768814087, "kl": 0.30496514961123466, "learning_rate": 7.999993887580876e-06, "loss": -0.0747, "num_tokens": 3786126.0, "reward": 0.1788753867149353, "reward_std": 1.267731785774231, "rewards/rollout_reward_func/mean": 0.1788753867149353, "rewards/rollout_reward_func/std": 1.267731785774231, "sampling/importance_sampling_ratio/max": 1.2922160625457764, "sampling/importance_sampling_ratio/mean": 0.9132706522941589, "sampling/importance_sampling_ratio/min": 2.1170610864373884e-07, "sampling/sampling_logp_difference/max": 2.1452274322509766, "sampling/sampling_logp_difference/mean": 0.2555055320262909, "step": 293, "step_time": 10.865809014983824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2365637794137, "epoch": 0.00147, "grad_norm": 0.35344362258911133, "kl": 0.30059537291526794, "learning_rate": 7.999993839920885e-06, "loss": -0.0772, "step": 294, "step_time": 5.261573204988963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.1875, "completions/mean_terminated_length": 4.1875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3296564482152462, "epoch": 0.001475, "frac_reward_zero_std": 0.5, "grad_norm": 0.2702910602092743, "kl": 0.25437788665294647, "learning_rate": 7.999993792075807e-06, "loss": -0.0174, "num_tokens": 3809851.0, "reward": 1.3178603649139404, "reward_std": 0.6776701211929321, "rewards/rollout_reward_func/mean": 1.3178603649139404, "rewards/rollout_reward_func/std": 0.6776701807975769, "sampling/importance_sampling_ratio/max": 1.527146816253662, "sampling/importance_sampling_ratio/mean": 1.0840317010879517, "sampling/importance_sampling_ratio/min": 0.4563157260417938, "sampling/sampling_logp_difference/max": 0.8320221900939941, "sampling/sampling_logp_difference/mean": 0.07261839509010315, "step": 295, "step_time": 9.960294685981353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32805971428751945, "epoch": 0.00148, "grad_norm": 0.2436477392911911, "kl": 0.2567219063639641, "learning_rate": 7.99999374404564e-06, "loss": -0.0174, "step": 296, "step_time": 5.2917269180034054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6590928584337234, "epoch": 0.001485, "frac_reward_zero_std": 0.0, "grad_norm": 0.16893307864665985, "kl": 0.7914604432880878, "learning_rate": 7.999993695830389e-06, "loss": -0.1127, "num_tokens": 3832853.0, "reward": 0.7942524552345276, "reward_std": 1.1799368858337402, "rewards/rollout_reward_func/mean": 0.7942524552345276, "rewards/rollout_reward_func/std": 1.1799367666244507, "sampling/importance_sampling_ratio/max": 2.0470051765441895, "sampling/importance_sampling_ratio/mean": 0.7734338045120239, "sampling/importance_sampling_ratio/min": 0.0001525227853562683, "sampling/sampling_logp_difference/max": 1.8630123138427734, "sampling/sampling_logp_difference/mean": 0.3076530694961548, "step": 297, "step_time": 10.379400742021971 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 1.6640010923147202, "epoch": 0.00149, "grad_norm": 0.15150466561317444, "kl": 0.6651390269398689, "learning_rate": 7.999993647430049e-06, "loss": -0.1133, "step": 298, "step_time": 5.061414890005835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.443115495145321, "epoch": 0.001495, "frac_reward_zero_std": 0.0, "grad_norm": 0.31683626770973206, "kl": 0.6786776632070541, "learning_rate": 7.99999359884462e-06, "loss": -0.0642, "num_tokens": 3855774.0, "reward": 1.0089319944381714, "reward_std": 1.2598366737365723, "rewards/rollout_reward_func/mean": 1.0089319944381714, "rewards/rollout_reward_func/std": 1.2598367929458618, "sampling/importance_sampling_ratio/max": 1.323489785194397, "sampling/importance_sampling_ratio/mean": 0.6195496320724487, "sampling/importance_sampling_ratio/min": 0.00012499677541200072, "sampling/sampling_logp_difference/max": 1.832579493522644, "sampling/sampling_logp_difference/mean": 0.31141847372055054, "step": 299, "step_time": 10.02916430401092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 1.4460420571267605, "epoch": 0.0015, "grad_norm": 0.15327921509742737, "kl": 0.6596890315413475, "learning_rate": 7.999993550074108e-06, "loss": -0.066, "step": 300, "step_time": 5.2099731439957395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3775924146175385, "epoch": 0.001505, "frac_reward_zero_std": 0.0, "grad_norm": 0.25351664423942566, "kl": 0.19102150574326515, "learning_rate": 7.999993501118506e-06, "loss": -0.0022, "num_tokens": 3885040.0, "reward": -0.041426822543144226, "reward_std": 0.8331003189086914, "rewards/rollout_reward_func/mean": -0.041426822543144226, "rewards/rollout_reward_func/std": 0.8331003189086914, "sampling/importance_sampling_ratio/max": 1.6650416851043701, "sampling/importance_sampling_ratio/mean": 0.8976505398750305, "sampling/importance_sampling_ratio/min": 8.785316458670422e-05, "sampling/sampling_logp_difference/max": 1.412672758102417, "sampling/sampling_logp_difference/mean": 0.2536909282207489, "step": 301, "step_time": 10.797688921011286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3846818059682846, "epoch": 0.00151, "grad_norm": 0.2546313405036926, "kl": 0.1893215961754322, "learning_rate": 7.999993451977818e-06, "loss": -0.0026, "step": 302, "step_time": 5.342691951998859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 6.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7070536315441132, "epoch": 0.001515, "frac_reward_zero_std": 0.0, "grad_norm": 0.07106637954711914, "kl": 0.47749723494052887, "learning_rate": 7.999993402652043e-06, "loss": -0.0997, "num_tokens": 3914087.0, "reward": 0.8769879341125488, "reward_std": 1.2493925094604492, "rewards/rollout_reward_func/mean": 0.8769879341125488, "rewards/rollout_reward_func/std": 1.2493926286697388, "sampling/importance_sampling_ratio/max": 1.5601454973220825, "sampling/importance_sampling_ratio/mean": 0.6710275411605835, "sampling/importance_sampling_ratio/min": 0.00034622589009813964, "sampling/sampling_logp_difference/max": 1.8249282836914062, "sampling/sampling_logp_difference/mean": 0.3481978178024292, "step": 303, "step_time": 11.460587882014806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7061038464307785, "epoch": 0.00152, "grad_norm": 0.07418623566627502, "kl": 0.4748520776629448, "learning_rate": 7.99999335314118e-06, "loss": -0.0997, "step": 304, "step_time": 5.628745534006157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2340523898601532, "epoch": 0.001525, "frac_reward_zero_std": 0.0, "grad_norm": 0.05881958082318306, "kl": 0.4867863319814205, "learning_rate": 7.99999330344523e-06, "loss": -0.0837, "num_tokens": 3933145.0, "reward": 0.33572840690612793, "reward_std": 1.506378173828125, "rewards/rollout_reward_func/mean": 0.33572840690612793, "rewards/rollout_reward_func/std": 1.506378173828125, "sampling/importance_sampling_ratio/max": 1.273199200630188, "sampling/importance_sampling_ratio/mean": 0.8568713068962097, "sampling/importance_sampling_ratio/min": 0.00020430778386071324, "sampling/sampling_logp_difference/max": 2.008737325668335, "sampling/sampling_logp_difference/mean": 0.2454812228679657, "step": 305, "step_time": 7.512424111002474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2358276098966599, "epoch": 0.00153, "grad_norm": 0.05739687383174896, "kl": 0.48642781376838684, "learning_rate": 7.999993253564196e-06, "loss": -0.0836, "step": 306, "step_time": 3.822396067014779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.4375, "completions/mean_terminated_length": 6.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.215204179286957, "epoch": 0.001535, "frac_reward_zero_std": 0.0, "grad_norm": 0.22436434030532837, "kl": 0.6444643065333366, "learning_rate": 7.999993203498072e-06, "loss": -0.0716, "num_tokens": 3956304.0, "reward": 0.7856954336166382, "reward_std": 1.1941570043563843, "rewards/rollout_reward_func/mean": 0.7856954336166382, "rewards/rollout_reward_func/std": 1.1941570043563843, "sampling/importance_sampling_ratio/max": 1.272106647491455, "sampling/importance_sampling_ratio/mean": 0.41422367095947266, "sampling/importance_sampling_ratio/min": 3.258704764363962e-11, "sampling/sampling_logp_difference/max": 2.35339093208313, "sampling/sampling_logp_difference/mean": 0.46209585666656494, "step": 307, "step_time": 10.372905251992051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.217239797115326, "epoch": 0.00154, "grad_norm": 0.21416424214839935, "kl": 0.589503176510334, "learning_rate": 7.999993153246862e-06, "loss": -0.0722, "step": 308, "step_time": 4.9589858499966795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4027960803359747, "epoch": 0.001545, "frac_reward_zero_std": 0.0, "grad_norm": 0.13154219090938568, "kl": 0.29289376735687256, "learning_rate": 7.999993102810564e-06, "loss": -0.0319, "num_tokens": 3984053.0, "reward": 1.763615608215332, "reward_std": 0.5260440111160278, "rewards/rollout_reward_func/mean": 1.763615608215332, "rewards/rollout_reward_func/std": 0.5260440707206726, "sampling/importance_sampling_ratio/max": 1.4045442342758179, "sampling/importance_sampling_ratio/mean": 0.9972589015960693, "sampling/importance_sampling_ratio/min": 0.03068407066166401, "sampling/sampling_logp_difference/max": 0.8129880428314209, "sampling/sampling_logp_difference/mean": 0.09541689604520798, "step": 309, "step_time": 9.998282687010942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4141158014535904, "epoch": 0.00155, "grad_norm": 0.13779166340827942, "kl": 0.2880864031612873, "learning_rate": 7.99999305218918e-06, "loss": -0.0322, "step": 310, "step_time": 5.205519128998276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 5.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.524816542863846, "epoch": 0.001555, "frac_reward_zero_std": 0.0, "grad_norm": 0.15154413878917694, "kl": 0.6693032812327147, "learning_rate": 7.999993001382707e-06, "loss": -0.1038, "num_tokens": 4008235.0, "reward": 0.493430495262146, "reward_std": 1.2095248699188232, "rewards/rollout_reward_func/mean": 0.493430495262146, "rewards/rollout_reward_func/std": 1.2095248699188232, "sampling/importance_sampling_ratio/max": 1.1363351345062256, "sampling/importance_sampling_ratio/mean": 0.5471174716949463, "sampling/importance_sampling_ratio/min": 1.1829406503238715e-05, "sampling/sampling_logp_difference/max": 1.79635751247406, "sampling/sampling_logp_difference/mean": 0.3823045492172241, "step": 311, "step_time": 10.662493253999855 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 2.530482918024063, "epoch": 0.00156, "grad_norm": 0.14034277200698853, "kl": 0.5489210225641727, "learning_rate": 7.99999295039115e-06, "loss": -0.1042, "step": 312, "step_time": 5.1306200429971796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2363987565040588, "epoch": 0.001565, "frac_reward_zero_std": 0.0, "grad_norm": 0.26213961839675903, "kl": 0.2515251301229, "learning_rate": 7.999992899214505e-06, "loss": -0.0446, "num_tokens": 4035874.0, "reward": 0.8190179467201233, "reward_std": 1.2323977947235107, "rewards/rollout_reward_func/mean": 0.8190179467201233, "rewards/rollout_reward_func/std": 1.2323977947235107, "sampling/importance_sampling_ratio/max": 1.3796846866607666, "sampling/importance_sampling_ratio/mean": 0.7208008766174316, "sampling/importance_sampling_ratio/min": 0.00024323495745193213, "sampling/sampling_logp_difference/max": 1.5472874641418457, "sampling/sampling_logp_difference/mean": 0.2397332489490509, "step": 313, "step_time": 11.026083649994689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2447217851877213, "epoch": 0.00157, "grad_norm": 0.24889935553073883, "kl": 0.2420251164585352, "learning_rate": 7.999992847852771e-06, "loss": -0.0456, "step": 314, "step_time": 5.256578070009709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 6.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.417757034301758, "epoch": 0.001575, "frac_reward_zero_std": 0.0, "grad_norm": 0.18963660299777985, "kl": 0.4196908436715603, "learning_rate": 7.999992796305951e-06, "loss": -0.0456, "num_tokens": 4063136.0, "reward": 0.7053830027580261, "reward_std": 1.2176971435546875, "rewards/rollout_reward_func/mean": 0.7053830027580261, "rewards/rollout_reward_func/std": 1.2176971435546875, "sampling/importance_sampling_ratio/max": 1.5798054933547974, "sampling/importance_sampling_ratio/mean": 0.6187005043029785, "sampling/importance_sampling_ratio/min": 4.8039605644589756e-06, "sampling/sampling_logp_difference/max": 2.511014938354492, "sampling/sampling_logp_difference/mean": 0.4515858292579651, "step": 315, "step_time": 10.456009105022531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4221612215042114, "epoch": 0.00158, "grad_norm": 0.19361773133277893, "kl": 0.41391944885253906, "learning_rate": 7.999992744574046e-06, "loss": -0.0465, "step": 316, "step_time": 5.198620935014333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 5.769230842590332, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.950627088546753, "epoch": 0.001585, "frac_reward_zero_std": 0.0, "grad_norm": 0.36025843024253845, "kl": 1.80654763802886, "learning_rate": 7.999992692657052e-06, "loss": -0.0801, "num_tokens": 4093664.0, "reward": 0.5661508440971375, "reward_std": 0.9711536169052124, "rewards/rollout_reward_func/mean": 0.5661508440971375, "rewards/rollout_reward_func/std": 0.9711536765098572, "sampling/importance_sampling_ratio/max": 1.5033648014068604, "sampling/importance_sampling_ratio/mean": 0.6039842367172241, "sampling/importance_sampling_ratio/min": 9.593381400918588e-05, "sampling/sampling_logp_difference/max": 2.1264700889587402, "sampling/sampling_logp_difference/mean": 0.39101892709732056, "step": 317, "step_time": 11.614841665999847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9529908895492554, "epoch": 0.00159, "grad_norm": 0.29773181676864624, "kl": 1.461775429546833, "learning_rate": 7.999992640554971e-06, "loss": -0.0818, "step": 318, "step_time": 5.806180575993494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 6.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.481856107711792, "epoch": 0.001595, "frac_reward_zero_std": 0.0, "grad_norm": 0.31463316082954407, "kl": 0.2577453963458538, "learning_rate": 7.999992588267803e-06, "loss": -0.1089, "num_tokens": 4121299.0, "reward": 0.6240830421447754, "reward_std": 0.9771212935447693, "rewards/rollout_reward_func/mean": 0.6240830421447754, "rewards/rollout_reward_func/std": 0.9771213531494141, "sampling/importance_sampling_ratio/max": 1.9752897024154663, "sampling/importance_sampling_ratio/mean": 0.5510113835334778, "sampling/importance_sampling_ratio/min": 2.6734126024052784e-09, "sampling/sampling_logp_difference/max": 2.604898452758789, "sampling/sampling_logp_difference/mean": 0.44210314750671387, "step": 319, "step_time": 11.070161927011213 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 2.4852645993232727, "epoch": 0.0016, "grad_norm": 0.2631101608276367, "kl": 0.23126854002475739, "learning_rate": 7.999992535795547e-06, "loss": -0.11, "step": 320, "step_time": 5.316308082998148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.5625, "completions/mean_terminated_length": 5.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4255905151367188, "epoch": 0.001605, "frac_reward_zero_std": 0.0, "grad_norm": 0.18059511482715607, "kl": 0.11861879797652364, "learning_rate": 7.999992483138206e-06, "loss": -0.0661, "num_tokens": 4142183.0, "reward": 0.47850775718688965, "reward_std": 1.4306713342666626, "rewards/rollout_reward_func/mean": 0.47850775718688965, "rewards/rollout_reward_func/std": 1.4306714534759521, "sampling/importance_sampling_ratio/max": 1.152642846107483, "sampling/importance_sampling_ratio/mean": 0.3807982802391052, "sampling/importance_sampling_ratio/min": 6.670932748420455e-07, "sampling/sampling_logp_difference/max": 1.8178184032440186, "sampling/sampling_logp_difference/mean": 0.3763525187969208, "step": 321, "step_time": 10.431080780006596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.427551507949829, "epoch": 0.00161, "grad_norm": 0.1765649914741516, "kl": 0.12011566432192922, "learning_rate": 7.999992430295777e-06, "loss": -0.0669, "step": 322, "step_time": 5.173342852998758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 7.083333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.440555155277252, "epoch": 0.001615, "frac_reward_zero_std": 0.0, "grad_norm": 0.20276769995689392, "kl": 0.1629136987030506, "learning_rate": 7.99999237726826e-06, "loss": -0.0357, "num_tokens": 4170642.0, "reward": 0.11873501539230347, "reward_std": 0.8706129789352417, "rewards/rollout_reward_func/mean": 0.11873501539230347, "rewards/rollout_reward_func/std": 0.8706130385398865, "sampling/importance_sampling_ratio/max": 1.4192155599594116, "sampling/importance_sampling_ratio/mean": 0.4956302344799042, "sampling/importance_sampling_ratio/min": 2.762419535429217e-05, "sampling/sampling_logp_difference/max": 1.5911316871643066, "sampling/sampling_logp_difference/mean": 0.3628150224685669, "step": 323, "step_time": 11.816350182023598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.441673368215561, "epoch": 0.00162, "grad_norm": 0.19956858456134796, "kl": 0.1631072387099266, "learning_rate": 7.999992324055659e-06, "loss": -0.0354, "step": 324, "step_time": 5.592155433012522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0144113898277283, "epoch": 0.001625, "frac_reward_zero_std": 0.0, "grad_norm": 0.266996830701828, "kl": 0.8775773216038942, "learning_rate": 7.99999227065797e-06, "loss": -0.1004, "num_tokens": 4198904.0, "reward": 0.4826161563396454, "reward_std": 1.2275795936584473, "rewards/rollout_reward_func/mean": 0.4826161563396454, "rewards/rollout_reward_func/std": 1.2275795936584473, "sampling/importance_sampling_ratio/max": 1.6066986322402954, "sampling/importance_sampling_ratio/mean": 0.5725682377815247, "sampling/importance_sampling_ratio/min": 3.2031532555265585e-06, "sampling/sampling_logp_difference/max": 2.5441129207611084, "sampling/sampling_logp_difference/mean": 0.37846624851226807, "step": 325, "step_time": 10.421091608004645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.007244974374771, "epoch": 0.00163, "grad_norm": 0.25915366411209106, "kl": 0.8497587200254202, "learning_rate": 7.999992217075192e-06, "loss": -0.1011, "step": 326, "step_time": 5.320792777987663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 6.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.002562999725342, "epoch": 0.001635, "frac_reward_zero_std": 0.0, "grad_norm": 0.22528153657913208, "kl": 0.34581098705530167, "learning_rate": 7.999992163307328e-06, "loss": -0.1051, "num_tokens": 4220809.0, "reward": 1.2698885202407837, "reward_std": 1.083410382270813, "rewards/rollout_reward_func/mean": 1.2698885202407837, "rewards/rollout_reward_func/std": 1.083410382270813, "sampling/importance_sampling_ratio/max": 1.2472790479660034, "sampling/importance_sampling_ratio/mean": 0.6095151305198669, "sampling/importance_sampling_ratio/min": 4.229924854826095e-08, "sampling/sampling_logp_difference/max": 2.4392175674438477, "sampling/sampling_logp_difference/mean": 0.4164733290672302, "step": 327, "step_time": 9.564879680008744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9954736530780792, "epoch": 0.00164, "grad_norm": 0.2272084355354309, "kl": 0.3571777790784836, "learning_rate": 7.999992109354377e-06, "loss": -0.1055, "step": 328, "step_time": 4.779329370998312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 5.18181848526001, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.066596806049347, "epoch": 0.001645, "frac_reward_zero_std": 0.0, "grad_norm": 0.22890011966228485, "kl": 0.2293665111064911, "learning_rate": 7.999992055216339e-06, "loss": -0.0764, "num_tokens": 4244126.0, "reward": 0.15708857774734497, "reward_std": 1.3295716047286987, "rewards/rollout_reward_func/mean": 0.15708857774734497, "rewards/rollout_reward_func/std": 1.3295716047286987, "sampling/importance_sampling_ratio/max": 1.548280954360962, "sampling/importance_sampling_ratio/mean": 0.6633567810058594, "sampling/importance_sampling_ratio/min": 3.3843190294646774e-07, "sampling/sampling_logp_difference/max": 2.193275213241577, "sampling/sampling_logp_difference/mean": 0.374092161655426, "step": 329, "step_time": 9.875889012007974 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 2.048087328672409, "epoch": 0.00165, "grad_norm": 0.10482878237962723, "kl": 0.2619082536548376, "learning_rate": 7.999992000893214e-06, "loss": -0.0774, "step": 330, "step_time": 5.082317302003503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.1875, "completions/mean_terminated_length": 5.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1333145797252655, "epoch": 0.001655, "frac_reward_zero_std": 0.0, "grad_norm": 0.22076468169689178, "kl": 0.21196862310171127, "learning_rate": 7.999991946385003e-06, "loss": -0.0901, "num_tokens": 4271522.0, "reward": 0.29623574018478394, "reward_std": 1.158704161643982, "rewards/rollout_reward_func/mean": 0.29623574018478394, "rewards/rollout_reward_func/std": 1.158704161643982, "sampling/importance_sampling_ratio/max": 1.2099838256835938, "sampling/importance_sampling_ratio/mean": 0.5287919044494629, "sampling/importance_sampling_ratio/min": 5.421325113275088e-07, "sampling/sampling_logp_difference/max": 2.539682626724243, "sampling/sampling_logp_difference/mean": 0.34069573879241943, "step": 331, "step_time": 10.781700404986623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1324748396873474, "epoch": 0.00166, "grad_norm": 0.22497282922267914, "kl": 0.2267313487827778, "learning_rate": 7.999991891691704e-06, "loss": -0.0901, "step": 332, "step_time": 5.654638086009072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 5.727272987365723, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7544403374195099, "epoch": 0.001665, "frac_reward_zero_std": 0.0, "grad_norm": 0.2395891398191452, "kl": 0.15163567289710045, "learning_rate": 7.999991836813319e-06, "loss": -0.0529, "num_tokens": 4298814.0, "reward": -0.2158125787973404, "reward_std": 0.9640118479728699, "rewards/rollout_reward_func/mean": -0.2158125787973404, "rewards/rollout_reward_func/std": 0.9640119075775146, "sampling/importance_sampling_ratio/max": 1.1659616231918335, "sampling/importance_sampling_ratio/mean": 0.536327600479126, "sampling/importance_sampling_ratio/min": 3.361984113325889e-07, "sampling/sampling_logp_difference/max": 2.633484363555908, "sampling/sampling_logp_difference/mean": 0.2817060947418213, "step": 333, "step_time": 11.423557026981143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7533987164497375, "epoch": 0.00167, "grad_norm": 0.2452945113182068, "kl": 0.1524546556174755, "learning_rate": 7.999991781749846e-06, "loss": -0.0535, "step": 334, "step_time": 6.052493302006042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.902677297592163, "epoch": 0.001675, "frac_reward_zero_std": 0.0, "grad_norm": 0.20273201167583466, "kl": 0.46772362291812897, "learning_rate": 7.999991726501287e-06, "loss": -0.0819, "num_tokens": 4324993.0, "reward": 0.41093963384628296, "reward_std": 1.2954283952713013, "rewards/rollout_reward_func/mean": 0.41093963384628296, "rewards/rollout_reward_func/std": 1.2954283952713013, "sampling/importance_sampling_ratio/max": 1.4185307025909424, "sampling/importance_sampling_ratio/mean": 0.5219278931617737, "sampling/importance_sampling_ratio/min": 8.017752406885847e-06, "sampling/sampling_logp_difference/max": 2.11301326751709, "sampling/sampling_logp_difference/mean": 0.3166663348674774, "step": 335, "step_time": 10.507528040005127 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.9078361988067627, "epoch": 0.00168, "grad_norm": 0.18847158551216125, "kl": 0.41155554354190826, "learning_rate": 7.99999167106764e-06, "loss": -0.0827, "step": 336, "step_time": 5.218142581987195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 5.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2719236612319946, "epoch": 0.001685, "frac_reward_zero_std": 0.0, "grad_norm": 0.39961081743240356, "kl": 0.21359545271843672, "learning_rate": 7.999991615448907e-06, "loss": -0.0644, "num_tokens": 4348935.0, "reward": 0.13928931951522827, "reward_std": 1.3423514366149902, "rewards/rollout_reward_func/mean": 0.13928931951522827, "rewards/rollout_reward_func/std": 1.3423514366149902, "sampling/importance_sampling_ratio/max": 1.5739431381225586, "sampling/importance_sampling_ratio/mean": 0.779413640499115, "sampling/importance_sampling_ratio/min": 0.00025378353893756866, "sampling/sampling_logp_difference/max": 1.7070225477218628, "sampling/sampling_logp_difference/mean": 0.20307427644729614, "step": 337, "step_time": 10.387435479002306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2641088366508484, "epoch": 0.00169, "grad_norm": 0.398499995470047, "kl": 0.21619694121181965, "learning_rate": 7.999991559645087e-06, "loss": -0.065, "step": 338, "step_time": 5.22030552700744 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 5.214285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0164798498153687, "epoch": 0.001695, "frac_reward_zero_std": 0.0, "grad_norm": 0.13512268662452698, "kl": 0.2547510489821434, "learning_rate": 7.999991503656178e-06, "loss": -0.0908, "num_tokens": 4377534.0, "reward": 0.7797658443450928, "reward_std": 1.138999342918396, "rewards/rollout_reward_func/mean": 0.7797658443450928, "rewards/rollout_reward_func/std": 1.1389994621276855, "sampling/importance_sampling_ratio/max": 2.2085776329040527, "sampling/importance_sampling_ratio/mean": 0.8923128843307495, "sampling/importance_sampling_ratio/min": 0.007864442653954029, "sampling/sampling_logp_difference/max": 0.820809006690979, "sampling/sampling_logp_difference/mean": 0.16035863757133484, "step": 339, "step_time": 11.218272127996897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0110243111848831, "epoch": 0.0017, "grad_norm": 0.17574593424797058, "kl": 0.258464690297842, "learning_rate": 7.999991447482183e-06, "loss": -0.0908, "step": 340, "step_time": 5.612075166005525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 5.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0343466103076935, "epoch": 0.001705, "frac_reward_zero_std": 0.0, "grad_norm": 0.2168373316526413, "kl": 0.24559292197227478, "learning_rate": 7.999991391123103e-06, "loss": -0.0985, "num_tokens": 4400448.0, "reward": 1.3849372863769531, "reward_std": 1.0386550426483154, "rewards/rollout_reward_func/mean": 1.3849372863769531, "rewards/rollout_reward_func/std": 1.0386550426483154, "sampling/importance_sampling_ratio/max": 1.205522060394287, "sampling/importance_sampling_ratio/mean": 0.87197345495224, "sampling/importance_sampling_ratio/min": 0.001419078791514039, "sampling/sampling_logp_difference/max": 1.6431982517242432, "sampling/sampling_logp_difference/mean": 0.18690291047096252, "step": 341, "step_time": 9.49050287398859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0242765098810196, "epoch": 0.00171, "grad_norm": 0.2130175530910492, "kl": 0.24668651819229126, "learning_rate": 7.999991334578934e-06, "loss": -0.0992, "step": 342, "step_time": 5.273740130010992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 6.615385055541992, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4451000690460205, "epoch": 0.001715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2511591613292694, "kl": 0.1767534427344799, "learning_rate": 7.99999127784968e-06, "loss": -0.108, "num_tokens": 4422548.0, "reward": 0.8545815944671631, "reward_std": 1.2843021154403687, "rewards/rollout_reward_func/mean": 0.8545815944671631, "rewards/rollout_reward_func/std": 1.2843023538589478, "sampling/importance_sampling_ratio/max": 1.3218212127685547, "sampling/importance_sampling_ratio/mean": 0.7121111154556274, "sampling/importance_sampling_ratio/min": 0.0016658416716381907, "sampling/sampling_logp_difference/max": 1.6258691549301147, "sampling/sampling_logp_difference/mean": 0.2034301459789276, "step": 343, "step_time": 10.525551818005624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4369467943906784, "epoch": 0.00172, "grad_norm": 0.24975480139255524, "kl": 0.17807364277541637, "learning_rate": 7.999991220935337e-06, "loss": -0.1082, "step": 344, "step_time": 5.348967354992055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 4.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4393704012036324, "epoch": 0.001725, "frac_reward_zero_std": 0.0, "grad_norm": 0.11665785312652588, "kl": 0.4212101511657238, "learning_rate": 7.999991163835908e-06, "loss": -0.0969, "num_tokens": 4445704.0, "reward": 0.951106607913971, "reward_std": 1.1758053302764893, "rewards/rollout_reward_func/mean": 0.951106607913971, "rewards/rollout_reward_func/std": 1.1758053302764893, "sampling/importance_sampling_ratio/max": 1.3074767589569092, "sampling/importance_sampling_ratio/mean": 0.7351016998291016, "sampling/importance_sampling_ratio/min": 2.3065024379320676e-06, "sampling/sampling_logp_difference/max": 2.271068572998047, "sampling/sampling_logp_difference/mean": 0.26049721240997314, "step": 345, "step_time": 10.463785118001397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.43735883384943, "epoch": 0.00173, "grad_norm": 0.11280106008052826, "kl": 0.39112373627722263, "learning_rate": 7.999991106551393e-06, "loss": -0.0973, "step": 346, "step_time": 5.224425127991708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 6.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.986250638961792, "epoch": 0.001735, "frac_reward_zero_std": 0.0, "grad_norm": 0.23927682638168335, "kl": 0.371383068151772, "learning_rate": 7.99999104908179e-06, "loss": -0.0755, "num_tokens": 4471486.0, "reward": 0.007465869188308716, "reward_std": 1.1314008235931396, "rewards/rollout_reward_func/mean": 0.007465869188308716, "rewards/rollout_reward_func/std": 1.1314008235931396, "sampling/importance_sampling_ratio/max": 1.208875298500061, "sampling/importance_sampling_ratio/mean": 0.4956190586090088, "sampling/importance_sampling_ratio/min": 4.0343773434869945e-05, "sampling/sampling_logp_difference/max": 1.7924470901489258, "sampling/sampling_logp_difference/mean": 0.3370683789253235, "step": 347, "step_time": 10.561872895006672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9805769324302673, "epoch": 0.00174, "grad_norm": 0.2305927276611328, "kl": 0.3890927289612591, "learning_rate": 7.9999909914271e-06, "loss": -0.0761, "step": 348, "step_time": 5.260940111984382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8266974166035652, "epoch": 0.001745, "frac_reward_zero_std": 0.0, "grad_norm": 0.3162243068218231, "kl": 0.24099339917302132, "learning_rate": 7.999990933587323e-06, "loss": -0.0401, "num_tokens": 4494147.0, "reward": 1.149849772453308, "reward_std": 1.043046236038208, "rewards/rollout_reward_func/mean": 1.149849772453308, "rewards/rollout_reward_func/std": 1.043046236038208, "sampling/importance_sampling_ratio/max": 1.4216437339782715, "sampling/importance_sampling_ratio/mean": 0.8822938203811646, "sampling/importance_sampling_ratio/min": 0.00118041574023664, "sampling/sampling_logp_difference/max": 1.2657122611999512, "sampling/sampling_logp_difference/mean": 0.13786299526691437, "step": 349, "step_time": 10.64904220099561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8066444396972656, "epoch": 0.00175, "grad_norm": 0.2715013921260834, "kl": 0.26387903094291687, "learning_rate": 7.99999087556246e-06, "loss": -0.0422, "step": 350, "step_time": 5.318110030988464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 5.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.496074452996254, "epoch": 0.001755, "frac_reward_zero_std": 0.0, "grad_norm": 0.16971267759799957, "kl": 1.5899510979652405, "learning_rate": 7.999990817352509e-06, "loss": -0.0885, "num_tokens": 4518101.0, "reward": 0.9587571024894714, "reward_std": 1.0046733617782593, "rewards/rollout_reward_func/mean": 0.9587571024894714, "rewards/rollout_reward_func/std": 1.0046733617782593, "sampling/importance_sampling_ratio/max": 1.280389428138733, "sampling/importance_sampling_ratio/mean": 0.6815527677536011, "sampling/importance_sampling_ratio/min": 8.665996915624419e-07, "sampling/sampling_logp_difference/max": 1.7371025085449219, "sampling/sampling_logp_difference/mean": 0.3684096336364746, "step": 351, "step_time": 10.227328424007283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4915431141853333, "epoch": 0.00176, "grad_norm": 0.16393442451953888, "kl": 1.5950785279273987, "learning_rate": 7.999990758957471e-06, "loss": -0.0891, "step": 352, "step_time": 5.297187956995913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.818181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4953170120716095, "epoch": 0.001765, "frac_reward_zero_std": 0.0, "grad_norm": 0.21122857928276062, "kl": 0.16993142664432526, "learning_rate": 7.999990700377347e-06, "loss": -0.0837, "num_tokens": 4541668.0, "reward": 0.36514928936958313, "reward_std": 1.2534840106964111, "rewards/rollout_reward_func/mean": 0.36514928936958313, "rewards/rollout_reward_func/std": 1.2534840106964111, "sampling/importance_sampling_ratio/max": 1.4008777141571045, "sampling/importance_sampling_ratio/mean": 0.6981677412986755, "sampling/importance_sampling_ratio/min": 0.005691978614777327, "sampling/sampling_logp_difference/max": 1.2937524318695068, "sampling/sampling_logp_difference/mean": 0.20886284112930298, "step": 353, "step_time": 10.368871406011749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4884890913963318, "epoch": 0.00177, "grad_norm": 0.19502738118171692, "kl": 0.16470134258270264, "learning_rate": 7.999990641612135e-06, "loss": -0.0847, "step": 354, "step_time": 5.616825270029949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 5.214285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1137199252843857, "epoch": 0.001775, "frac_reward_zero_std": 0.0, "grad_norm": 0.11568685621023178, "kl": 0.6001000478863716, "learning_rate": 7.999990582661837e-06, "loss": -0.0601, "num_tokens": 4570946.0, "reward": 1.0932537317276, "reward_std": 0.9997373819351196, "rewards/rollout_reward_func/mean": 1.0932537317276, "rewards/rollout_reward_func/std": 0.9997374415397644, "sampling/importance_sampling_ratio/max": 1.4543968439102173, "sampling/importance_sampling_ratio/mean": 0.723755955696106, "sampling/importance_sampling_ratio/min": 1.653850995353423e-05, "sampling/sampling_logp_difference/max": 2.5347418785095215, "sampling/sampling_logp_difference/mean": 0.3299894332885742, "step": 355, "step_time": 10.312050356995314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.114170290529728, "epoch": 0.00178, "grad_norm": 0.11920608580112457, "kl": 0.6577122956514359, "learning_rate": 7.999990523526452e-06, "loss": -0.0594, "step": 356, "step_time": 5.3199708750034915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 4.18181848526001, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7388002276420593, "epoch": 0.001785, "frac_reward_zero_std": 0.0, "grad_norm": 0.16788478195667267, "kl": 0.2758379206061363, "learning_rate": 7.99999046420598e-06, "loss": -0.0741, "num_tokens": 4594117.0, "reward": 0.6918206214904785, "reward_std": 1.3027609586715698, "rewards/rollout_reward_func/mean": 0.6918206214904785, "rewards/rollout_reward_func/std": 1.3027609586715698, "sampling/importance_sampling_ratio/max": 1.3234970569610596, "sampling/importance_sampling_ratio/mean": 0.7312365770339966, "sampling/importance_sampling_ratio/min": 0.0003316473448649049, "sampling/sampling_logp_difference/max": 1.5038354396820068, "sampling/sampling_logp_difference/mean": 0.26556235551834106, "step": 357, "step_time": 10.590649454999948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7380758225917816, "epoch": 0.00179, "grad_norm": 0.17084082961082458, "kl": 0.2773459516465664, "learning_rate": 7.999990404700422e-06, "loss": -0.0749, "step": 358, "step_time": 5.09680293200654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6802732348442078, "epoch": 0.001795, "frac_reward_zero_std": 0.0, "grad_norm": 0.602737307548523, "kl": 0.1958588846027851, "learning_rate": 7.999990345009776e-06, "loss": -0.0913, "num_tokens": 4622765.0, "reward": 0.8228509426116943, "reward_std": 1.2077953815460205, "rewards/rollout_reward_func/mean": 0.8228509426116943, "rewards/rollout_reward_func/std": 1.2077953815460205, "sampling/importance_sampling_ratio/max": 1.4314991235733032, "sampling/importance_sampling_ratio/mean": 0.7914246320724487, "sampling/importance_sampling_ratio/min": 0.00020118649990763515, "sampling/sampling_logp_difference/max": 1.597353458404541, "sampling/sampling_logp_difference/mean": 0.2590228319168091, "step": 359, "step_time": 10.524195133999456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "entropy": 1.678587168455124, "epoch": 0.0018, "grad_norm": 0.1289140284061432, "kl": 0.19291549921035767, "learning_rate": 7.999990285134044e-06, "loss": -0.0931, "step": 360, "step_time": 5.311535895976704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2106887213885784, "epoch": 0.001805, "frac_reward_zero_std": 0.0, "grad_norm": 0.03323969617486, "kl": 0.23034491762518883, "learning_rate": 7.999990225073224e-06, "loss": -0.1079, "num_tokens": 4645681.0, "reward": 0.40043288469314575, "reward_std": 1.2555336952209473, "rewards/rollout_reward_func/mean": 0.40043288469314575, "rewards/rollout_reward_func/std": 1.2555336952209473, "sampling/importance_sampling_ratio/max": 1.426573395729065, "sampling/importance_sampling_ratio/mean": 0.844785213470459, "sampling/importance_sampling_ratio/min": 0.0021476661786437035, "sampling/sampling_logp_difference/max": 1.4209601879119873, "sampling/sampling_logp_difference/mean": 0.2044116109609604, "step": 361, "step_time": 10.467358139998396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2059707567095757, "epoch": 0.00181, "grad_norm": 0.031799640506505966, "kl": 0.2309376783668995, "learning_rate": 7.999990164827318e-06, "loss": -0.108, "step": 362, "step_time": 5.247643014008645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 6.4375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9850169979035854, "epoch": 0.001815, "frac_reward_zero_std": 0.0, "grad_norm": 0.03148331865668297, "kl": 0.3670741394162178, "learning_rate": 7.999990104396325e-06, "loss": -0.1065, "num_tokens": 4668009.0, "reward": 1.7314858436584473, "reward_std": 0.6223039627075195, "rewards/rollout_reward_func/mean": 1.7314858436584473, "rewards/rollout_reward_func/std": 0.6223039627075195, "sampling/importance_sampling_ratio/max": 1.259169578552246, "sampling/importance_sampling_ratio/mean": 0.8268052339553833, "sampling/importance_sampling_ratio/min": 5.8902900491375476e-05, "sampling/sampling_logp_difference/max": 1.959756851196289, "sampling/sampling_logp_difference/mean": 0.2857620120048523, "step": 363, "step_time": 9.95006003198796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9861429845914245, "epoch": 0.00182, "grad_norm": 0.027950625866651535, "kl": 0.35802973061800003, "learning_rate": 7.999990043780244e-06, "loss": -0.1067, "step": 364, "step_time": 5.69474562999676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3649369776248932, "epoch": 0.001825, "frac_reward_zero_std": 0.0, "grad_norm": 0.018203893676400185, "kl": 0.17058642953634262, "learning_rate": 7.999989982979077e-06, "loss": -0.0525, "num_tokens": 4691829.0, "reward": 0.7152018547058105, "reward_std": 1.2729134559631348, "rewards/rollout_reward_func/mean": 0.7152018547058105, "rewards/rollout_reward_func/std": 1.2729134559631348, "sampling/importance_sampling_ratio/max": 1.1399929523468018, "sampling/importance_sampling_ratio/mean": 0.7548270225524902, "sampling/importance_sampling_ratio/min": 6.889875658089295e-05, "sampling/sampling_logp_difference/max": 1.8734405040740967, "sampling/sampling_logp_difference/mean": 0.19254246354103088, "step": 365, "step_time": 10.637012349019642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3637812435626984, "epoch": 0.00183, "grad_norm": 0.016799114644527435, "kl": 0.17078327760100365, "learning_rate": 7.999989921992825e-06, "loss": -0.0526, "step": 366, "step_time": 5.152828338992549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 3.769230842590332, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0735637582838535, "epoch": 0.001835, "frac_reward_zero_std": 0.0, "grad_norm": 0.2878216803073883, "kl": 1.364368624985218, "learning_rate": 7.999989860821483e-06, "loss": -0.0757, "num_tokens": 4720494.0, "reward": 0.1501825898885727, "reward_std": 0.963499903678894, "rewards/rollout_reward_func/mean": 0.1501825898885727, "rewards/rollout_reward_func/std": 0.9634999632835388, "sampling/importance_sampling_ratio/max": 1.4181710481643677, "sampling/importance_sampling_ratio/mean": 0.8643820285797119, "sampling/importance_sampling_ratio/min": 8.260355389211327e-05, "sampling/sampling_logp_difference/max": 2.2804505825042725, "sampling/sampling_logp_difference/mean": 0.23537002503871918, "step": 367, "step_time": 11.149611477987492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0642878897488117, "epoch": 0.00184, "grad_norm": 0.24139045178890228, "kl": 1.1635746136307716, "learning_rate": 7.999989799465057e-06, "loss": -0.0771, "step": 368, "step_time": 5.753695975974551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 3.92307710647583, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1143478751182556, "epoch": 0.001845, "frac_reward_zero_std": 0.0, "grad_norm": 0.2330222874879837, "kl": 0.21127596870064735, "learning_rate": 7.999989737923541e-06, "loss": -0.0598, "num_tokens": 4744829.0, "reward": -0.5228592157363892, "reward_std": 0.5427514910697937, "rewards/rollout_reward_func/mean": -0.5228592157363892, "rewards/rollout_reward_func/std": 0.5427514910697937, "sampling/importance_sampling_ratio/max": 1.259883999824524, "sampling/importance_sampling_ratio/mean": 0.8683611154556274, "sampling/importance_sampling_ratio/min": 1.984124310183688e-06, "sampling/sampling_logp_difference/max": 1.6568723917007446, "sampling/sampling_logp_difference/mean": 0.22826625406742096, "step": 369, "step_time": 10.749359131979872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1157994791865349, "epoch": 0.00185, "grad_norm": 0.2305590659379959, "kl": 0.2114604376256466, "learning_rate": 7.999989676196942e-06, "loss": -0.0603, "step": 370, "step_time": 5.610737100010738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3357193693518639, "epoch": 0.001855, "frac_reward_zero_std": 0.0, "grad_norm": 0.136695995926857, "kl": 0.3221541028469801, "learning_rate": 7.999989614285252e-06, "loss": -0.0672, "num_tokens": 4769480.0, "reward": 0.9970309734344482, "reward_std": 1.1039071083068848, "rewards/rollout_reward_func/mean": 0.9970309734344482, "rewards/rollout_reward_func/std": 1.1039071083068848, "sampling/importance_sampling_ratio/max": 1.4838643074035645, "sampling/importance_sampling_ratio/mean": 0.8853051662445068, "sampling/importance_sampling_ratio/min": 7.006069063208997e-05, "sampling/sampling_logp_difference/max": 2.076009511947632, "sampling/sampling_logp_difference/mean": 0.24266518652439117, "step": 371, "step_time": 10.919766096005333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3381499350070953, "epoch": 0.00186, "grad_norm": 0.13647685945034027, "kl": 0.3287909757345915, "learning_rate": 7.999989552188477e-06, "loss": -0.0674, "step": 372, "step_time": 5.617796557009569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.509484052658081, "epoch": 0.001865, "frac_reward_zero_std": 0.0, "grad_norm": 0.24253836274147034, "kl": 1.1459577046334743, "learning_rate": 7.999989489906616e-06, "loss": -0.0701, "num_tokens": 4796363.0, "reward": 0.6985896825790405, "reward_std": 1.303996205329895, "rewards/rollout_reward_func/mean": 0.6985896825790405, "rewards/rollout_reward_func/std": 1.303996205329895, "sampling/importance_sampling_ratio/max": 1.1824437379837036, "sampling/importance_sampling_ratio/mean": 0.7735380530357361, "sampling/importance_sampling_ratio/min": 3.733930134330876e-05, "sampling/sampling_logp_difference/max": 2.4277777671813965, "sampling/sampling_logp_difference/mean": 0.2982974648475647, "step": 373, "step_time": 10.193510431985487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5068039298057556, "epoch": 0.00187, "grad_norm": 0.2422718107700348, "kl": 1.1488596759736538, "learning_rate": 7.999989427439667e-06, "loss": -0.07, "step": 374, "step_time": 5.7840630220016465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 3.92307710647583, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0996451750397682, "epoch": 0.001875, "frac_reward_zero_std": 0.0, "grad_norm": 0.07777892053127289, "kl": 0.42483582347631454, "learning_rate": 7.999989364787634e-06, "loss": -0.0922, "num_tokens": 4823308.0, "reward": 1.3291317224502563, "reward_std": 1.06264066696167, "rewards/rollout_reward_func/mean": 1.3291317224502563, "rewards/rollout_reward_func/std": 1.06264066696167, "sampling/importance_sampling_ratio/max": 1.3436148166656494, "sampling/importance_sampling_ratio/mean": 0.8822535276412964, "sampling/importance_sampling_ratio/min": 7.858492608647794e-06, "sampling/sampling_logp_difference/max": 1.5694434642791748, "sampling/sampling_logp_difference/mean": 0.2388511598110199, "step": 375, "step_time": 10.750131473017973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.100001834332943, "epoch": 0.00188, "grad_norm": 0.0765051618218422, "kl": 0.43909310922026634, "learning_rate": 7.999989301950511e-06, "loss": -0.092, "step": 376, "step_time": 5.747890461003408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8376703597605228, "epoch": 0.001885, "frac_reward_zero_std": 0.0, "grad_norm": 0.04779822379350662, "kl": 0.23844246938824654, "learning_rate": 7.999989238928302e-06, "loss": -0.084, "num_tokens": 4842102.0, "reward": 1.346588134765625, "reward_std": 1.2371342182159424, "rewards/rollout_reward_func/mean": 1.346588134765625, "rewards/rollout_reward_func/std": 1.2371342182159424, "sampling/importance_sampling_ratio/max": 1.2092676162719727, "sampling/importance_sampling_ratio/mean": 0.7504057288169861, "sampling/importance_sampling_ratio/min": 3.12358281462366e-08, "sampling/sampling_logp_difference/max": 2.3893795013427734, "sampling/sampling_logp_difference/mean": 0.3827604353427887, "step": 377, "step_time": 7.537548706997768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8324353136122227, "epoch": 0.00189, "grad_norm": 0.04496929049491882, "kl": 0.24116137623786926, "learning_rate": 7.999989175721006e-06, "loss": -0.0841, "step": 378, "step_time": 3.845448321008007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5096580982208252, "epoch": 0.001895, "frac_reward_zero_std": 0.0, "grad_norm": 0.12066873162984848, "kl": 0.6814188845455647, "learning_rate": 7.999989112328623e-06, "loss": -0.098, "num_tokens": 4864229.0, "reward": 1.039881944656372, "reward_std": 1.3876692056655884, "rewards/rollout_reward_func/mean": 1.039881944656372, "rewards/rollout_reward_func/std": 1.3876692056655884, "sampling/importance_sampling_ratio/max": 1.2924045324325562, "sampling/importance_sampling_ratio/mean": 0.7335419654846191, "sampling/importance_sampling_ratio/min": 2.3102921886675176e-07, "sampling/sampling_logp_difference/max": 2.777423620223999, "sampling/sampling_logp_difference/mean": 0.3002992868423462, "step": 379, "step_time": 10.08959888901154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.512382984161377, "epoch": 0.0019, "grad_norm": 0.12679092586040497, "kl": 0.6476633138954639, "learning_rate": 7.999989048751154e-06, "loss": -0.0978, "step": 380, "step_time": 5.128264407991082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5971319563686848, "epoch": 0.001905, "frac_reward_zero_std": 0.0, "grad_norm": 0.11882641166448593, "kl": 0.604166965931654, "learning_rate": 7.999988984988599e-06, "loss": -0.0644, "num_tokens": 4888278.0, "reward": 1.13261878490448, "reward_std": 1.0327956676483154, "rewards/rollout_reward_func/mean": 1.13261878490448, "rewards/rollout_reward_func/std": 1.0327956676483154, "sampling/importance_sampling_ratio/max": 1.28909170627594, "sampling/importance_sampling_ratio/mean": 0.9118539094924927, "sampling/importance_sampling_ratio/min": 0.002881323918700218, "sampling/sampling_logp_difference/max": 1.5644278526306152, "sampling/sampling_logp_difference/mean": 0.13659097254276276, "step": 381, "step_time": 10.650587779993657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5992350950837135, "epoch": 0.00191, "grad_norm": 0.12401295453310013, "kl": 0.6177422143518925, "learning_rate": 7.999988921040955e-06, "loss": -0.0648, "step": 382, "step_time": 5.762373754987493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.0714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.427495151758194, "epoch": 0.001915, "frac_reward_zero_std": 0.0, "grad_norm": 0.3691357374191284, "kl": 0.4610893875360489, "learning_rate": 7.999988856908225e-06, "loss": -0.019, "num_tokens": 4914624.0, "reward": 0.04710268974304199, "reward_std": 1.0694756507873535, "rewards/rollout_reward_func/mean": 0.04710268974304199, "rewards/rollout_reward_func/std": 1.069475769996643, "sampling/importance_sampling_ratio/max": 1.4202603101730347, "sampling/importance_sampling_ratio/mean": 0.8843743205070496, "sampling/importance_sampling_ratio/min": 6.85265549691394e-05, "sampling/sampling_logp_difference/max": 1.859026551246643, "sampling/sampling_logp_difference/mean": 0.24663642048835754, "step": 383, "step_time": 10.915411576017505 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.4206380546092987, "epoch": 0.00192, "grad_norm": 0.09586647152900696, "kl": 0.4475311040878296, "learning_rate": 7.99998879259041e-06, "loss": -0.0205, "step": 384, "step_time": 5.596722571004648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7629721090197563, "epoch": 0.001925, "frac_reward_zero_std": 0.0, "grad_norm": 0.101819708943367, "kl": 0.6571244671940804, "learning_rate": 7.999988728087506e-06, "loss": -0.057, "num_tokens": 4936481.0, "reward": 0.6574656963348389, "reward_std": 1.1050117015838623, "rewards/rollout_reward_func/mean": 0.6574656963348389, "rewards/rollout_reward_func/std": 1.1050117015838623, "sampling/importance_sampling_ratio/max": 1.3322046995162964, "sampling/importance_sampling_ratio/mean": 0.8757823705673218, "sampling/importance_sampling_ratio/min": 0.0030043201986700296, "sampling/sampling_logp_difference/max": 1.3580269813537598, "sampling/sampling_logp_difference/mean": 0.1602553129196167, "step": 385, "step_time": 9.82964015599282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.766827404499054, "epoch": 0.00193, "grad_norm": 0.10111477971076965, "kl": 0.6550864577293396, "learning_rate": 7.999988663399516e-06, "loss": -0.0572, "step": 386, "step_time": 4.833826069996576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.38482957892119884, "epoch": 0.001935, "frac_reward_zero_std": 0.0, "grad_norm": 0.009885765612125397, "kl": 0.2940465062856674, "learning_rate": 7.999988598526439e-06, "loss": -0.037, "num_tokens": 4958732.0, "reward": 1.7674963474273682, "reward_std": 0.7135646939277649, "rewards/rollout_reward_func/mean": 1.7674963474273682, "rewards/rollout_reward_func/std": 0.7135648131370544, "sampling/importance_sampling_ratio/max": 1.2406245470046997, "sampling/importance_sampling_ratio/mean": 1.0291972160339355, "sampling/importance_sampling_ratio/min": 0.003218489931896329, "sampling/sampling_logp_difference/max": 1.2426880598068237, "sampling/sampling_logp_difference/mean": 0.08506172150373459, "step": 387, "step_time": 9.235281052009668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3857683278620243, "epoch": 0.00194, "grad_norm": 0.010324140079319477, "kl": 0.29380911216139793, "learning_rate": 7.999988533468276e-06, "loss": -0.037, "step": 388, "step_time": 5.012073020989192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 5.500000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2620170265436172, "epoch": 0.001945, "frac_reward_zero_std": 0.0, "grad_norm": 0.2192489355802536, "kl": 1.062687374651432, "learning_rate": 7.999988468225024e-06, "loss": -0.0807, "num_tokens": 4983904.0, "reward": 1.076349139213562, "reward_std": 1.2488224506378174, "rewards/rollout_reward_func/mean": 1.076349139213562, "rewards/rollout_reward_func/std": 1.2488224506378174, "sampling/importance_sampling_ratio/max": 1.4810786247253418, "sampling/importance_sampling_ratio/mean": 0.7397128343582153, "sampling/importance_sampling_ratio/min": 0.001060020294971764, "sampling/sampling_logp_difference/max": 2.0550594329833984, "sampling/sampling_logp_difference/mean": 0.22783982753753662, "step": 389, "step_time": 11.906528255000012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.26023019105196, "epoch": 0.00195, "grad_norm": 0.16411611437797546, "kl": 0.9201815873384476, "learning_rate": 7.999988402796688e-06, "loss": -0.081, "step": 390, "step_time": 6.023623871005839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3090041130781174, "epoch": 0.001955, "frac_reward_zero_std": 0.0, "grad_norm": 0.3187005817890167, "kl": 0.44969429075717926, "learning_rate": 7.999988337183264e-06, "loss": -0.0534, "num_tokens": 5015166.0, "reward": 0.6606802940368652, "reward_std": 1.283058762550354, "rewards/rollout_reward_func/mean": 0.6606802940368652, "rewards/rollout_reward_func/std": 1.2830586433410645, "sampling/importance_sampling_ratio/max": 1.568703532218933, "sampling/importance_sampling_ratio/mean": 0.8521636724472046, "sampling/importance_sampling_ratio/min": 0.00010019787441706285, "sampling/sampling_logp_difference/max": 1.8543270826339722, "sampling/sampling_logp_difference/mean": 0.25358283519744873, "step": 391, "step_time": 12.289289089021622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3145272508263588, "epoch": 0.00196, "grad_norm": 0.3227544128894806, "kl": 0.4286797530949116, "learning_rate": 7.999988271384754e-06, "loss": -0.0542, "step": 392, "step_time": 6.3368780039891135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.076923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5291369706392288, "epoch": 0.001965, "frac_reward_zero_std": 0.0, "grad_norm": 0.8950464129447937, "kl": 6.000868514180183, "learning_rate": 7.999988205401156e-06, "loss": -0.0481, "num_tokens": 5039731.0, "reward": 0.35827112197875977, "reward_std": 1.352571964263916, "rewards/rollout_reward_func/mean": 0.35827112197875977, "rewards/rollout_reward_func/std": 1.352571964263916, "sampling/importance_sampling_ratio/max": 1.4305652379989624, "sampling/importance_sampling_ratio/mean": 0.6684235334396362, "sampling/importance_sampling_ratio/min": 2.0841298464802094e-05, "sampling/sampling_logp_difference/max": 1.9944705963134766, "sampling/sampling_logp_difference/mean": 0.3007715344429016, "step": 393, "step_time": 12.725093371002004 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "entropy": 1.5390875041484833, "epoch": 0.00197, "grad_norm": 0.39616531133651733, "kl": 2.6721730642020702, "learning_rate": 7.999988139232472e-06, "loss": -0.057, "step": 394, "step_time": 6.479002101012156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.966424435377121, "epoch": 0.001975, "frac_reward_zero_std": 0.0, "grad_norm": 0.21645444631576538, "kl": 2.2879348881542683, "learning_rate": 7.999988072878701e-06, "loss": -0.0385, "num_tokens": 5061894.0, "reward": -0.5122629404067993, "reward_std": 0.5295305252075195, "rewards/rollout_reward_func/mean": -0.5122629404067993, "rewards/rollout_reward_func/std": 0.5295305252075195, "sampling/importance_sampling_ratio/max": 1.219046950340271, "sampling/importance_sampling_ratio/mean": 0.7155815958976746, "sampling/importance_sampling_ratio/min": 3.0534185498254374e-05, "sampling/sampling_logp_difference/max": 1.9197454452514648, "sampling/sampling_logp_difference/mean": 0.2432733029127121, "step": 395, "step_time": 9.630086228004075 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 0.9949267208576202, "epoch": 0.00198, "grad_norm": 0.1484791487455368, "kl": 2.184163924306631, "learning_rate": 7.999988006339844e-06, "loss": -0.0392, "step": 396, "step_time": 5.012366559996735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9189420342445374, "epoch": 0.001985, "frac_reward_zero_std": 0.0, "grad_norm": 0.15895426273345947, "kl": 0.8375239968299866, "learning_rate": 7.999987939615899e-06, "loss": -0.0662, "num_tokens": 5088441.0, "reward": 0.8883561491966248, "reward_std": 1.261866569519043, "rewards/rollout_reward_func/mean": 0.8883561491966248, "rewards/rollout_reward_func/std": 1.2618666887283325, "sampling/importance_sampling_ratio/max": 1.762088656425476, "sampling/importance_sampling_ratio/mean": 0.9231594800949097, "sampling/importance_sampling_ratio/min": 1.4711421499669086e-05, "sampling/sampling_logp_difference/max": 1.6536542177200317, "sampling/sampling_logp_difference/mean": 0.21253931522369385, "step": 397, "step_time": 12.331309160013916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9358097389340401, "epoch": 0.00199, "grad_norm": 0.16061335802078247, "kl": 0.8255690857768059, "learning_rate": 7.999987872706869e-06, "loss": -0.0664, "step": 398, "step_time": 6.499515127012273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.375, "completions/mean_terminated_length": 6.363636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6048619747161865, "epoch": 0.001995, "frac_reward_zero_std": 0.0, "grad_norm": 0.14453689754009247, "kl": 0.2236691452562809, "learning_rate": 7.99998780561275e-06, "loss": -0.0643, "num_tokens": 5117081.0, "reward": 0.6946181058883667, "reward_std": 1.3955459594726562, "rewards/rollout_reward_func/mean": 0.6946181058883667, "rewards/rollout_reward_func/std": 1.3955460786819458, "sampling/importance_sampling_ratio/max": 1.3926446437835693, "sampling/importance_sampling_ratio/mean": 0.5245368480682373, "sampling/importance_sampling_ratio/min": 0.000133177571115084, "sampling/sampling_logp_difference/max": 1.8343119621276855, "sampling/sampling_logp_difference/mean": 0.27688711881637573, "step": 399, "step_time": 12.261816387996078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6103662252426147, "epoch": 0.002, "grad_norm": 0.14727449417114258, "kl": 0.2063254788517952, "learning_rate": 7.999987738333547e-06, "loss": -0.0646, "step": 400, "step_time": 6.079266581989941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 5.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6980331242084503, "epoch": 0.002005, "frac_reward_zero_std": 0.0, "grad_norm": 0.12853454053401947, "kl": 0.2220988217741251, "learning_rate": 7.999987670869257e-06, "loss": -0.0864, "num_tokens": 5149662.0, "reward": 0.5920583605766296, "reward_std": 1.2340011596679688, "rewards/rollout_reward_func/mean": 0.5920583605766296, "rewards/rollout_reward_func/std": 1.2340011596679688, "sampling/importance_sampling_ratio/max": 1.2607829570770264, "sampling/importance_sampling_ratio/mean": 0.5698373317718506, "sampling/importance_sampling_ratio/min": 6.143857899587601e-05, "sampling/sampling_logp_difference/max": 1.69197416305542, "sampling/sampling_logp_difference/mean": 0.2620908319950104, "step": 401, "step_time": 14.121643465012312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7005712389945984, "epoch": 0.00201, "grad_norm": 0.13076579570770264, "kl": 0.22326273377984762, "learning_rate": 7.999987603219878e-06, "loss": -0.0861, "step": 402, "step_time": 7.000039869002649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5569478571414948, "epoch": 0.002015, "frac_reward_zero_std": 0.0, "grad_norm": 0.1495421677827835, "kl": 0.27546979766339064, "learning_rate": 7.999987535385413e-06, "loss": -0.0983, "num_tokens": 5178718.0, "reward": 0.6583528518676758, "reward_std": 1.2770541906356812, "rewards/rollout_reward_func/mean": 0.6583528518676758, "rewards/rollout_reward_func/std": 1.2770541906356812, "sampling/importance_sampling_ratio/max": 1.3417977094650269, "sampling/importance_sampling_ratio/mean": 0.7273508310317993, "sampling/importance_sampling_ratio/min": 0.0028331438079476357, "sampling/sampling_logp_difference/max": 1.7180547714233398, "sampling/sampling_logp_difference/mean": 0.21536590158939362, "step": 403, "step_time": 12.616426414024318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.559838593006134, "epoch": 0.00202, "grad_norm": 0.14340566098690033, "kl": 0.28756335005164146, "learning_rate": 7.99998746736586e-06, "loss": -0.0986, "step": 404, "step_time": 5.991053068006295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8477215468883514, "epoch": 0.002025, "frac_reward_zero_std": 0.0, "grad_norm": 0.1664692461490631, "kl": 2.7981413677334785, "learning_rate": 7.999987399161224e-06, "loss": -0.0999, "num_tokens": 5202564.0, "reward": 1.0476574897766113, "reward_std": 1.3157217502593994, "rewards/rollout_reward_func/mean": 1.0476574897766113, "rewards/rollout_reward_func/std": 1.315721869468689, "sampling/importance_sampling_ratio/max": 1.2907378673553467, "sampling/importance_sampling_ratio/mean": 0.6774812936782837, "sampling/importance_sampling_ratio/min": 0.00014228907821234316, "sampling/sampling_logp_difference/max": 1.977290391921997, "sampling/sampling_logp_difference/mean": 0.32076725363731384, "step": 405, "step_time": 10.233427095008665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8618475198745728, "epoch": 0.00203, "grad_norm": 0.15516255795955658, "kl": 2.6569803953170776, "learning_rate": 7.999987330771498e-06, "loss": -0.1004, "step": 406, "step_time": 5.5311586130119395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9903083145618439, "epoch": 0.002035, "frac_reward_zero_std": 0.0, "grad_norm": 0.18317359685897827, "kl": 0.2313507366925478, "learning_rate": 7.999987262196688e-06, "loss": -0.0829, "num_tokens": 5225787.0, "reward": 1.5385627746582031, "reward_std": 0.9159702062606812, "rewards/rollout_reward_func/mean": 1.5385627746582031, "rewards/rollout_reward_func/std": 0.9159702062606812, "sampling/importance_sampling_ratio/max": 1.388248324394226, "sampling/importance_sampling_ratio/mean": 0.8785833120346069, "sampling/importance_sampling_ratio/min": 0.0010256089735776186, "sampling/sampling_logp_difference/max": 1.608703851699829, "sampling/sampling_logp_difference/mean": 0.17944231629371643, "step": 407, "step_time": 11.061881513000117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9846137091517448, "epoch": 0.00204, "grad_norm": 0.170656219124794, "kl": 0.23672310449182987, "learning_rate": 7.99998719343679e-06, "loss": -0.0837, "step": 408, "step_time": 5.5652926030161325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.087178535759449, "epoch": 0.002045, "frac_reward_zero_std": 0.0, "grad_norm": 0.16905057430267334, "kl": 0.4514891356229782, "learning_rate": 7.999987124491804e-06, "loss": -0.1, "num_tokens": 5256201.0, "reward": 0.9170963764190674, "reward_std": 1.0741151571273804, "rewards/rollout_reward_func/mean": 0.9170963764190674, "rewards/rollout_reward_func/std": 1.0741151571273804, "sampling/importance_sampling_ratio/max": 2.4437670707702637, "sampling/importance_sampling_ratio/mean": 1.0707159042358398, "sampling/importance_sampling_ratio/min": 0.003085096599534154, "sampling/sampling_logp_difference/max": 1.6402695178985596, "sampling/sampling_logp_difference/mean": 0.18759359419345856, "step": 409, "step_time": 12.625917496989132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0749839842319489, "epoch": 0.00205, "grad_norm": 0.13952964544296265, "kl": 0.5115707293152809, "learning_rate": 7.999987055361734e-06, "loss": -0.1007, "step": 410, "step_time": 6.300768674002029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6787650398910046, "epoch": 0.002055, "frac_reward_zero_std": 0.0, "grad_norm": 0.35098254680633545, "kl": 0.5086080133914948, "learning_rate": 7.999986986046575e-06, "loss": -0.046, "num_tokens": 5279038.0, "reward": 1.4672672748565674, "reward_std": 1.0603457689285278, "rewards/rollout_reward_func/mean": 1.4672672748565674, "rewards/rollout_reward_func/std": 1.0603457689285278, "sampling/importance_sampling_ratio/max": 1.3081163167953491, "sampling/importance_sampling_ratio/mean": 0.9316145181655884, "sampling/importance_sampling_ratio/min": 0.0024144581984728575, "sampling/sampling_logp_difference/max": 1.2787935733795166, "sampling/sampling_logp_difference/mean": 0.11731661856174469, "step": 411, "step_time": 12.079176594997989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6678028181195259, "epoch": 0.00206, "grad_norm": 0.30974888801574707, "kl": 0.5127650015056133, "learning_rate": 7.99998691654633e-06, "loss": -0.0478, "step": 412, "step_time": 6.418537595993257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.692029319703579, "epoch": 0.002065, "frac_reward_zero_std": 0.0, "grad_norm": 0.1536330282688141, "kl": 0.28561894595623016, "learning_rate": 7.999986846861e-06, "loss": -0.0435, "num_tokens": 5318077.0, "reward": 0.08585097640752792, "reward_std": 0.9410378932952881, "rewards/rollout_reward_func/mean": 0.08585097640752792, "rewards/rollout_reward_func/std": 0.9410379528999329, "sampling/importance_sampling_ratio/max": 1.5904099941253662, "sampling/importance_sampling_ratio/mean": 1.0754302740097046, "sampling/importance_sampling_ratio/min": 0.01066703163087368, "sampling/sampling_logp_difference/max": 0.9085627794265747, "sampling/sampling_logp_difference/mean": 0.13224773108959198, "step": 413, "step_time": 14.31801533199905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6727906167507172, "epoch": 0.00207, "grad_norm": 0.15247145295143127, "kl": 0.2924337722361088, "learning_rate": 7.999986776990581e-06, "loss": -0.0441, "step": 414, "step_time": 7.285314167995239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5012946501374245, "epoch": 0.002075, "frac_reward_zero_std": 0.0, "grad_norm": 0.08419489860534668, "kl": 0.35052530746906996, "learning_rate": 7.999986706935076e-06, "loss": -0.0927, "num_tokens": 5352742.0, "reward": 1.0196183919906616, "reward_std": 1.1123415231704712, "rewards/rollout_reward_func/mean": 1.0196183919906616, "rewards/rollout_reward_func/std": 1.1123415231704712, "sampling/importance_sampling_ratio/max": 1.3070868253707886, "sampling/importance_sampling_ratio/mean": 0.8010417222976685, "sampling/importance_sampling_ratio/min": 5.9872436395380646e-05, "sampling/sampling_logp_difference/max": 1.5063852071762085, "sampling/sampling_logp_difference/mean": 0.2736932337284088, "step": 415, "step_time": 13.25256339200132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4888232983648777, "epoch": 0.00208, "grad_norm": 0.08286798745393753, "kl": 0.3736973190680146, "learning_rate": 7.999986636694485e-06, "loss": -0.0929, "step": 416, "step_time": 7.377008857001783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3217617645859718, "epoch": 0.002085, "frac_reward_zero_std": 0.0, "grad_norm": 0.1341053545475006, "kl": 0.3705991506576538, "learning_rate": 7.999986566268808e-06, "loss": -0.0253, "num_tokens": 5379094.0, "reward": 0.9152259230613708, "reward_std": 1.2272377014160156, "rewards/rollout_reward_func/mean": 0.9152259230613708, "rewards/rollout_reward_func/std": 1.227237582206726, "sampling/importance_sampling_ratio/max": 1.3024137020111084, "sampling/importance_sampling_ratio/mean": 0.8247548341751099, "sampling/importance_sampling_ratio/min": 4.1604537592832e-09, "sampling/sampling_logp_difference/max": 2.2259485721588135, "sampling/sampling_logp_difference/mean": 0.2726716995239258, "step": 417, "step_time": 13.959751583999605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0059523810632526875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "entropy": 1.3123666942119598, "epoch": 0.00209, "grad_norm": 0.1112261712551117, "kl": 0.3924168851226568, "learning_rate": 7.999986495658042e-06, "loss": -0.026, "step": 418, "step_time": 6.893153122990043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8026549220085144, "epoch": 0.002095, "frac_reward_zero_std": 0.0, "grad_norm": 0.09237297624349594, "kl": 0.35536882653832436, "learning_rate": 7.999986424862192e-06, "loss": -0.072, "num_tokens": 5408629.0, "reward": 0.6120055913925171, "reward_std": 1.1618564128875732, "rewards/rollout_reward_func/mean": 0.6120055913925171, "rewards/rollout_reward_func/std": 1.1618564128875732, "sampling/importance_sampling_ratio/max": 1.1625562906265259, "sampling/importance_sampling_ratio/mean": 0.9022448658943176, "sampling/importance_sampling_ratio/min": 0.0020194880198687315, "sampling/sampling_logp_difference/max": 1.3087067604064941, "sampling/sampling_logp_difference/mean": 0.16508513689041138, "step": 419, "step_time": 10.319594382017385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7914645113050938, "epoch": 0.0021, "grad_norm": 0.08577532321214676, "kl": 0.357616662979126, "learning_rate": 7.999986353881253e-06, "loss": -0.0721, "step": 420, "step_time": 5.350368063009228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5858156904578209, "epoch": 0.002105, "frac_reward_zero_std": 0.5, "grad_norm": 0.13921333849430084, "kl": 0.4025640897452831, "learning_rate": 7.999986282715228e-06, "loss": -0.0382, "num_tokens": 5427506.0, "reward": 1.3312697410583496, "reward_std": 1.317884922027588, "rewards/rollout_reward_func/mean": 1.3312697410583496, "rewards/rollout_reward_func/std": 1.317884922027588, "sampling/importance_sampling_ratio/max": 1.0925772190093994, "sampling/importance_sampling_ratio/mean": 0.8264043927192688, "sampling/importance_sampling_ratio/min": 0.00588312977924943, "sampling/sampling_logp_difference/max": 1.7382960319519043, "sampling/sampling_logp_difference/mean": 0.11537399888038635, "step": 421, "step_time": 8.373944703998859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5775466822087765, "epoch": 0.00211, "grad_norm": 0.1290176808834076, "kl": 0.42072687298059464, "learning_rate": 7.999986211364117e-06, "loss": -0.0384, "step": 422, "step_time": 3.9662484700093046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8006591871380806, "epoch": 0.002115, "frac_reward_zero_std": 0.0, "grad_norm": 0.05460485816001892, "kl": 0.8262946344912052, "learning_rate": 7.999986139827919e-06, "loss": -0.0905, "num_tokens": 5452486.0, "reward": 1.1993162631988525, "reward_std": 1.2481776475906372, "rewards/rollout_reward_func/mean": 1.1993162631988525, "rewards/rollout_reward_func/std": 1.2481778860092163, "sampling/importance_sampling_ratio/max": 1.1286448240280151, "sampling/importance_sampling_ratio/mean": 0.8034477829933167, "sampling/importance_sampling_ratio/min": 0.0015236554900184274, "sampling/sampling_logp_difference/max": 1.9042255878448486, "sampling/sampling_logp_difference/mean": 0.1524968147277832, "step": 423, "step_time": 10.486017930990783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.800329152494669, "epoch": 0.00212, "grad_norm": 0.059104301035404205, "kl": 0.9086621701717377, "learning_rate": 7.999986068106634e-06, "loss": -0.0905, "step": 424, "step_time": 5.014274649976869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 3.9285717010498047, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7884259968996048, "epoch": 0.002125, "frac_reward_zero_std": 0.0, "grad_norm": 0.17365598678588867, "kl": 0.21569580771028996, "learning_rate": 7.999985996200264e-06, "loss": -0.0528, "num_tokens": 5483484.0, "reward": 0.8243902921676636, "reward_std": 1.194877028465271, "rewards/rollout_reward_func/mean": 0.8243902921676636, "rewards/rollout_reward_func/std": 1.1948771476745605, "sampling/importance_sampling_ratio/max": 1.4812103509902954, "sampling/importance_sampling_ratio/mean": 0.9376837015151978, "sampling/importance_sampling_ratio/min": 0.002512532752007246, "sampling/sampling_logp_difference/max": 1.7889206409454346, "sampling/sampling_logp_difference/mean": 0.14681071043014526, "step": 425, "step_time": 13.003214383003069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7802898287773132, "epoch": 0.00213, "grad_norm": 0.17110346257686615, "kl": 0.2160678245127201, "learning_rate": 7.999985924108805e-06, "loss": -0.0532, "step": 426, "step_time": 7.387808999992558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5390550941228867, "epoch": 0.002135, "frac_reward_zero_std": 0.5, "grad_norm": 0.11232518404722214, "kl": 1.0664145313203335, "learning_rate": 7.999985851832262e-06, "loss": -0.0462, "num_tokens": 5500837.0, "reward": 1.581069827079773, "reward_std": 0.9897377490997314, "rewards/rollout_reward_func/mean": 1.581069827079773, "rewards/rollout_reward_func/std": 0.9897378087043762, "sampling/importance_sampling_ratio/max": 1.120303988456726, "sampling/importance_sampling_ratio/mean": 0.8988562822341919, "sampling/importance_sampling_ratio/min": 0.0034840756561607122, "sampling/sampling_logp_difference/max": 1.3655922412872314, "sampling/sampling_logp_difference/mean": 0.11986894905567169, "step": 427, "step_time": 7.143762257008348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5283152386546135, "epoch": 0.00214, "grad_norm": 0.10175580531358719, "kl": 1.0129971131682396, "learning_rate": 7.999985779370631e-06, "loss": -0.0465, "step": 428, "step_time": 3.792924719993607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.789367288351059, "epoch": 0.002145, "frac_reward_zero_std": 0.0, "grad_norm": 0.021824903786182404, "kl": 0.22623608633875847, "learning_rate": 7.999985706723913e-06, "loss": -0.0634, "num_tokens": 5524488.0, "reward": 0.9793949127197266, "reward_std": 1.341995120048523, "rewards/rollout_reward_func/mean": 0.9793949127197266, "rewards/rollout_reward_func/std": 1.3419952392578125, "sampling/importance_sampling_ratio/max": 1.1267549991607666, "sampling/importance_sampling_ratio/mean": 0.8747721314430237, "sampling/importance_sampling_ratio/min": 0.002439941046759486, "sampling/sampling_logp_difference/max": 1.1996009349822998, "sampling/sampling_logp_difference/mean": 0.1366189420223236, "step": 429, "step_time": 12.20243041800859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7859347090125084, "epoch": 0.00215, "grad_norm": 0.020763138309121132, "kl": 0.22660653851926327, "learning_rate": 7.999985633892109e-06, "loss": -0.0635, "step": 430, "step_time": 6.028614452981856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0368444919586182, "epoch": 0.002155, "frac_reward_zero_std": 0.0, "grad_norm": 0.07675840705633163, "kl": 0.5519007556140423, "learning_rate": 7.999985560875218e-06, "loss": -0.0561, "num_tokens": 5553787.0, "reward": -0.02509579062461853, "reward_std": 1.0528748035430908, "rewards/rollout_reward_func/mean": -0.02509579062461853, "rewards/rollout_reward_func/std": 1.0528749227523804, "sampling/importance_sampling_ratio/max": 1.4147603511810303, "sampling/importance_sampling_ratio/mean": 0.706780731678009, "sampling/importance_sampling_ratio/min": 0.000783633382525295, "sampling/sampling_logp_difference/max": 1.8665674924850464, "sampling/sampling_logp_difference/mean": 0.24103519320487976, "step": 431, "step_time": 12.925794278999092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0359977334737778, "epoch": 0.00216, "grad_norm": 0.08050304651260376, "kl": 0.5374142564833164, "learning_rate": 7.999985487673243e-06, "loss": -0.0561, "step": 432, "step_time": 6.803290862997528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.11363080143928528, "epoch": 0.002165, "frac_reward_zero_std": 0.5, "grad_norm": 0.024916650727391243, "kl": 0.304887980222702, "learning_rate": 7.999985414286176e-06, "loss": 0.0035, "num_tokens": 5577511.0, "reward": 1.7370622158050537, "reward_std": 0.6045935153961182, "rewards/rollout_reward_func/mean": 1.7370622158050537, "rewards/rollout_reward_func/std": 0.6045936346054077, "sampling/importance_sampling_ratio/max": 1.1590021848678589, "sampling/importance_sampling_ratio/mean": 1.0178693532943726, "sampling/importance_sampling_ratio/min": 0.7978665232658386, "sampling/sampling_logp_difference/max": 0.33639371395111084, "sampling/sampling_logp_difference/mean": 0.024338331073522568, "step": 433, "step_time": 9.672583613995812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11076458357274532, "epoch": 0.00217, "grad_norm": 0.02465805411338806, "kl": 0.30466779321432114, "learning_rate": 7.999985340714028e-06, "loss": 0.0034, "step": 434, "step_time": 5.359009623003658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 4.5625, "completions/mean_terminated_length": 4.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.4301276318728924, "epoch": 0.002175, "frac_reward_zero_std": 0.0, "grad_norm": 0.18479177355766296, "kl": 1.0126710273325443, "learning_rate": 7.99998526695679e-06, "loss": -0.0353, "num_tokens": 5612513.0, "reward": 0.7720308303833008, "reward_std": 1.078931212425232, "rewards/rollout_reward_func/mean": 0.7720308303833008, "rewards/rollout_reward_func/std": 1.0789310932159424, "sampling/importance_sampling_ratio/max": 1.5776149034500122, "sampling/importance_sampling_ratio/mean": 0.9489172101020813, "sampling/importance_sampling_ratio/min": 0.007825460284948349, "sampling/sampling_logp_difference/max": 2.0528578758239746, "sampling/sampling_logp_difference/mean": 0.1339920610189438, "step": 435, "step_time": 13.19029218198557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42830108664929867, "epoch": 0.00218, "grad_norm": 0.17507001757621765, "kl": 0.9911728389561176, "learning_rate": 7.999985193014467e-06, "loss": -0.0354, "step": 436, "step_time": 7.377333769021789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4610031694173813, "epoch": 0.002185, "frac_reward_zero_std": 0.0, "grad_norm": 0.29950371384620667, "kl": 0.34214499220252037, "learning_rate": 7.999985118887056e-06, "loss": -0.0968, "num_tokens": 5642158.0, "reward": 0.6637955904006958, "reward_std": 1.1256606578826904, "rewards/rollout_reward_func/mean": 0.6637955904006958, "rewards/rollout_reward_func/std": 1.1256606578826904, "sampling/importance_sampling_ratio/max": 1.3726619482040405, "sampling/importance_sampling_ratio/mean": 0.7126230001449585, "sampling/importance_sampling_ratio/min": 2.6603249352774583e-05, "sampling/sampling_logp_difference/max": 2.203644275665283, "sampling/sampling_logp_difference/mean": 0.28262412548065186, "step": 437, "step_time": 13.883348360992386 }, { "clip_ratio/high_max": 0.043750000186264515, "clip_ratio/high_mean": 0.021875000093132257, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000093132257, "entropy": 1.4588286653161049, "epoch": 0.00219, "grad_norm": 0.1721174716949463, "kl": 0.3238530922681093, "learning_rate": 7.99998504457456e-06, "loss": -0.0979, "step": 438, "step_time": 7.255685734984581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 3.92307710647583, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.882682915776968, "epoch": 0.002195, "frac_reward_zero_std": 0.0, "grad_norm": 0.03773719444870949, "kl": 0.36409060657024384, "learning_rate": 7.999984970076977e-06, "loss": -0.0735, "num_tokens": 5671934.0, "reward": 0.8413308262825012, "reward_std": 1.1265411376953125, "rewards/rollout_reward_func/mean": 0.8413308262825012, "rewards/rollout_reward_func/std": 1.1265411376953125, "sampling/importance_sampling_ratio/max": 1.1241543292999268, "sampling/importance_sampling_ratio/mean": 0.8186948299407959, "sampling/importance_sampling_ratio/min": 0.0021365326829254627, "sampling/sampling_logp_difference/max": 1.0520343780517578, "sampling/sampling_logp_difference/mean": 0.15146061778068542, "step": 439, "step_time": 13.587037488003261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8829769678413868, "epoch": 0.0022, "grad_norm": 0.03704822435975075, "kl": 0.3639100566506386, "learning_rate": 7.999984895394308e-06, "loss": -0.0735, "step": 440, "step_time": 7.086516403986025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2729654014110565, "epoch": 0.002205, "frac_reward_zero_std": 0.0, "grad_norm": 0.2060329169034958, "kl": 0.17700719460844994, "learning_rate": 7.999984820526551e-06, "loss": -0.0497, "num_tokens": 5703281.0, "reward": 0.6103469729423523, "reward_std": 1.1746468544006348, "rewards/rollout_reward_func/mean": 0.6103469729423523, "rewards/rollout_reward_func/std": 1.1746469736099243, "sampling/importance_sampling_ratio/max": 1.171785831451416, "sampling/importance_sampling_ratio/mean": 0.7934533357620239, "sampling/importance_sampling_ratio/min": 0.0009175788727588952, "sampling/sampling_logp_difference/max": 1.4927325248718262, "sampling/sampling_logp_difference/mean": 0.20535627007484436, "step": 441, "step_time": 13.55497656899388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2782820165157318, "epoch": 0.00221, "grad_norm": 0.1999286413192749, "kl": 0.17675883881747723, "learning_rate": 7.999984745473708e-06, "loss": -0.05, "step": 442, "step_time": 6.848561276987311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.6875, "completions/mean_terminated_length": 3.9333336353302, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.4501506183296442, "epoch": 0.002215, "frac_reward_zero_std": 0.5, "grad_norm": 0.03109028749167919, "kl": 0.340883731842041, "learning_rate": 7.99998467023578e-06, "loss": -0.0231, "num_tokens": 5733740.0, "reward": 1.142160177230835, "reward_std": 1.0870469808578491, "rewards/rollout_reward_func/mean": 1.142160177230835, "rewards/rollout_reward_func/std": 1.0870469808578491, "sampling/importance_sampling_ratio/max": 1.1233503818511963, "sampling/importance_sampling_ratio/mean": 0.9397611618041992, "sampling/importance_sampling_ratio/min": 0.0087612709030509, "sampling/sampling_logp_difference/max": 0.8175392150878906, "sampling/sampling_logp_difference/mean": 0.08439216762781143, "step": 443, "step_time": 13.209593319988926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4512583836913109, "epoch": 0.00222, "grad_norm": 0.031130658462643623, "kl": 0.3402293361723423, "learning_rate": 7.999984594812764e-06, "loss": -0.0231, "step": 444, "step_time": 6.910215731986682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5794810876250267, "epoch": 0.002225, "frac_reward_zero_std": 0.0, "grad_norm": 0.07344932854175568, "kl": 0.4269479662179947, "learning_rate": 7.999984519204662e-06, "loss": -0.0807, "num_tokens": 5753419.0, "reward": 0.37364912033081055, "reward_std": 1.4111742973327637, "rewards/rollout_reward_func/mean": 0.37364912033081055, "rewards/rollout_reward_func/std": 1.4111744165420532, "sampling/importance_sampling_ratio/max": 1.1023703813552856, "sampling/importance_sampling_ratio/mean": 0.8720027208328247, "sampling/importance_sampling_ratio/min": 0.0076099783182144165, "sampling/sampling_logp_difference/max": 0.9781570434570312, "sampling/sampling_logp_difference/mean": 0.12469613552093506, "step": 445, "step_time": 7.450737866980489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5772213470190763, "epoch": 0.00223, "grad_norm": 0.07154776155948639, "kl": 0.4372318536043167, "learning_rate": 7.999984443411473e-06, "loss": -0.0808, "step": 446, "step_time": 3.999360767978942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9927229806780815, "epoch": 0.002235, "frac_reward_zero_std": 0.0, "grad_norm": 0.40281322598457336, "kl": 2.933543086051941, "learning_rate": 7.999984367433198e-06, "loss": -0.0513, "num_tokens": 5775564.0, "reward": 0.40708398818969727, "reward_std": 1.2548130750656128, "rewards/rollout_reward_func/mean": 0.40708398818969727, "rewards/rollout_reward_func/std": 1.2548130750656128, "sampling/importance_sampling_ratio/max": 1.4316667318344116, "sampling/importance_sampling_ratio/mean": 0.7064942121505737, "sampling/importance_sampling_ratio/min": 0.00563614210113883, "sampling/sampling_logp_difference/max": 2.8831849098205566, "sampling/sampling_logp_difference/mean": 0.28350502252578735, "step": 447, "step_time": 9.86641086700547 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.007352941203862429, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "entropy": 0.9915322996675968, "epoch": 0.00224, "grad_norm": 0.26473337411880493, "kl": 2.2587107121944427, "learning_rate": 7.999984291269835e-06, "loss": -0.0534, "step": 448, "step_time": 5.254258054992533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6119433417916298, "epoch": 0.002245, "frac_reward_zero_std": 0.5, "grad_norm": 0.054154444485902786, "kl": 0.5029348023235798, "learning_rate": 7.999984214921387e-06, "loss": -0.0373, "num_tokens": 5802394.0, "reward": 1.266706943511963, "reward_std": 1.043810486793518, "rewards/rollout_reward_func/mean": 1.266706943511963, "rewards/rollout_reward_func/std": 1.043810486793518, "sampling/importance_sampling_ratio/max": 1.5417121648788452, "sampling/importance_sampling_ratio/mean": 0.9881430864334106, "sampling/importance_sampling_ratio/min": 0.00012595851148944348, "sampling/sampling_logp_difference/max": 1.598787784576416, "sampling/sampling_logp_difference/mean": 0.14378803968429565, "step": 449, "step_time": 12.607469952010433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6169676575809717, "epoch": 0.00225, "grad_norm": 0.05094533786177635, "kl": 0.49098121747374535, "learning_rate": 7.99998413838785e-06, "loss": -0.0374, "step": 450, "step_time": 6.50613042198529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4030707385390997, "epoch": 0.002255, "frac_reward_zero_std": 0.5, "grad_norm": 0.293405145406723, "kl": 0.26829126104712486, "learning_rate": 7.99998406166923e-06, "loss": -0.0206, "num_tokens": 5823101.0, "reward": -0.4725934863090515, "reward_std": 0.24059557914733887, "rewards/rollout_reward_func/mean": -0.4725934863090515, "rewards/rollout_reward_func/std": 0.24059559404850006, "sampling/importance_sampling_ratio/max": 1.0833669900894165, "sampling/importance_sampling_ratio/mean": 0.9569692611694336, "sampling/importance_sampling_ratio/min": 0.007543278858065605, "sampling/sampling_logp_difference/max": 0.9492950439453125, "sampling/sampling_logp_difference/mean": 0.06907369196414948, "step": 451, "step_time": 8.930638969992287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4089128850027919, "epoch": 0.00226, "grad_norm": 0.2860061526298523, "kl": 0.2672489434480667, "learning_rate": 7.999983984765522e-06, "loss": -0.022, "step": 452, "step_time": 4.924791352008469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.15916120633482933, "epoch": 0.002265, "frac_reward_zero_std": 0.5, "grad_norm": 0.04818718880414963, "kl": 0.5342292338609695, "learning_rate": 7.999983907676728e-06, "loss": -0.0278, "num_tokens": 5841895.0, "reward": 1.5910563468933105, "reward_std": 0.050000011920928955, "rewards/rollout_reward_func/mean": 1.5910563468933105, "rewards/rollout_reward_func/std": 0.050000011920928955, "sampling/importance_sampling_ratio/max": 1.0782015323638916, "sampling/importance_sampling_ratio/mean": 0.9725953340530396, "sampling/importance_sampling_ratio/min": 0.13846103847026825, "sampling/sampling_logp_difference/max": 1.277134656906128, "sampling/sampling_logp_difference/mean": 0.03622131049633026, "step": 453, "step_time": 6.195538624990149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16054041124880314, "epoch": 0.00227, "grad_norm": 0.044616375118494034, "kl": 0.5593908131122589, "learning_rate": 7.999983830402848e-06, "loss": -0.0279, "step": 454, "step_time": 3.813961231018766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.41762816347181797, "epoch": 0.002275, "frac_reward_zero_std": 0.0, "grad_norm": 0.0935421884059906, "kl": 0.5533692948520184, "learning_rate": 7.99998375294388e-06, "loss": -0.0681, "num_tokens": 5864778.0, "reward": 1.5075278282165527, "reward_std": 0.97694331407547, "rewards/rollout_reward_func/mean": 1.5075278282165527, "rewards/rollout_reward_func/std": 0.97694331407547, "sampling/importance_sampling_ratio/max": 1.1854170560836792, "sampling/importance_sampling_ratio/mean": 0.9433751106262207, "sampling/importance_sampling_ratio/min": 0.008790329098701477, "sampling/sampling_logp_difference/max": 0.9829587936401367, "sampling/sampling_logp_difference/mean": 0.08733346313238144, "step": 455, "step_time": 9.9889190299873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4177401661872864, "epoch": 0.00228, "grad_norm": 0.08619321137666702, "kl": 0.5721571184694767, "learning_rate": 7.999983675299827e-06, "loss": -0.0682, "step": 456, "step_time": 5.006127186992671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.8125, "completions/mean_terminated_length": 4.8125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.715128555893898, "epoch": 0.002285, "frac_reward_zero_std": 0.0, "grad_norm": 0.22806064784526825, "kl": 0.824800468981266, "learning_rate": 7.999983597470686e-06, "loss": -0.0684, "num_tokens": 5898459.0, "reward": 0.7848740816116333, "reward_std": 1.0888258218765259, "rewards/rollout_reward_func/mean": 0.7848740816116333, "rewards/rollout_reward_func/std": 1.0888258218765259, "sampling/importance_sampling_ratio/max": 1.6816290616989136, "sampling/importance_sampling_ratio/mean": 0.9008280038833618, "sampling/importance_sampling_ratio/min": 5.9007252275478095e-05, "sampling/sampling_logp_difference/max": 1.8644884824752808, "sampling/sampling_logp_difference/mean": 0.21812763810157776, "step": 457, "step_time": 13.624335189015255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7120818495750427, "epoch": 0.00229, "grad_norm": 0.23417535424232483, "kl": 0.846783310174942, "learning_rate": 7.99998351945646e-06, "loss": -0.069, "step": 458, "step_time": 7.180034896999132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0835980474948883, "epoch": 0.002295, "frac_reward_zero_std": 0.0, "grad_norm": 0.04895113408565521, "kl": 0.3758278861641884, "learning_rate": 7.999983441257147e-06, "loss": -0.0941, "num_tokens": 5927758.0, "reward": 0.8527010679244995, "reward_std": 1.3200280666351318, "rewards/rollout_reward_func/mean": 0.8527010679244995, "rewards/rollout_reward_func/std": 1.3200280666351318, "sampling/importance_sampling_ratio/max": 1.136947512626648, "sampling/importance_sampling_ratio/mean": 0.7503846883773804, "sampling/importance_sampling_ratio/min": 0.0012541265459731221, "sampling/sampling_logp_difference/max": 1.817187786102295, "sampling/sampling_logp_difference/mean": 0.21492063999176025, "step": 459, "step_time": 13.593937459998415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0849112942814827, "epoch": 0.0023, "grad_norm": 0.04781700670719147, "kl": 0.3755405619740486, "learning_rate": 7.999983362872746e-06, "loss": -0.0942, "step": 460, "step_time": 6.755363378993934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3430630099028349, "epoch": 0.002305, "frac_reward_zero_std": 0.0, "grad_norm": 0.005975942127406597, "kl": 0.3451468497514725, "learning_rate": 7.999983284303261e-06, "loss": -0.0368, "num_tokens": 5955473.0, "reward": 1.7507362365722656, "reward_std": 0.7097315788269043, "rewards/rollout_reward_func/mean": 1.7507362365722656, "rewards/rollout_reward_func/std": 0.7097315192222595, "sampling/importance_sampling_ratio/max": 1.1791476011276245, "sampling/importance_sampling_ratio/mean": 0.9826844334602356, "sampling/importance_sampling_ratio/min": 0.00410878099501133, "sampling/sampling_logp_difference/max": 1.0908243656158447, "sampling/sampling_logp_difference/mean": 0.0723208338022232, "step": 461, "step_time": 11.744055693008704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34450955502688885, "epoch": 0.00231, "grad_norm": 0.0057505168952047825, "kl": 0.34520598500967026, "learning_rate": 7.999983205548689e-06, "loss": -0.0368, "step": 462, "step_time": 6.25468178401934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.866666793823242, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7597738727927208, "epoch": 0.002315, "frac_reward_zero_std": 0.0, "grad_norm": 0.21509145200252533, "kl": 0.35708119720220566, "learning_rate": 7.99998312660903e-06, "loss": -0.0227, "num_tokens": 5980535.0, "reward": 0.018202438950538635, "reward_std": 1.2292191982269287, "rewards/rollout_reward_func/mean": 0.018202438950538635, "rewards/rollout_reward_func/std": 1.2292191982269287, "sampling/importance_sampling_ratio/max": 1.330505609512329, "sampling/importance_sampling_ratio/mean": 0.9212244749069214, "sampling/importance_sampling_ratio/min": 0.0012631850549951196, "sampling/sampling_logp_difference/max": 1.7830804586410522, "sampling/sampling_logp_difference/mean": 0.1732577383518219, "step": 463, "step_time": 9.97566444100812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7675352208316326, "epoch": 0.00232, "grad_norm": 0.25885650515556335, "kl": 0.3518580421805382, "learning_rate": 7.999983047484286e-06, "loss": -0.0236, "step": 464, "step_time": 5.174846927009639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.39007705077528954, "epoch": 0.002325, "frac_reward_zero_std": 0.5, "grad_norm": 0.03298141807317734, "kl": 0.33877017721533775, "learning_rate": 7.999982968174453e-06, "loss": -0.0105, "num_tokens": 5999725.0, "reward": 0.7236645221710205, "reward_std": 1.495073676109314, "rewards/rollout_reward_func/mean": 0.7236645221710205, "rewards/rollout_reward_func/std": 1.495073676109314, "sampling/importance_sampling_ratio/max": 1.191287875175476, "sampling/importance_sampling_ratio/mean": 1.0263088941574097, "sampling/importance_sampling_ratio/min": 0.04407363757491112, "sampling/sampling_logp_difference/max": 0.6900825500488281, "sampling/sampling_logp_difference/mean": 0.05405854806303978, "step": 465, "step_time": 9.460210682023899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39311633259058, "epoch": 0.00233, "grad_norm": 0.037215083837509155, "kl": 0.3377370052039623, "learning_rate": 7.999982888679535e-06, "loss": -0.0107, "step": 466, "step_time": 4.702985443989746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.46348328702151775, "epoch": 0.002335, "frac_reward_zero_std": 0.5, "grad_norm": 0.01223289780318737, "kl": 0.291065938770771, "learning_rate": 7.99998280899953e-06, "loss": -0.0414, "num_tokens": 6022146.0, "reward": 1.767663598060608, "reward_std": 0.7401252388954163, "rewards/rollout_reward_func/mean": 1.767663598060608, "rewards/rollout_reward_func/std": 0.7401251792907715, "sampling/importance_sampling_ratio/max": 1.1894010305404663, "sampling/importance_sampling_ratio/mean": 1.0100889205932617, "sampling/importance_sampling_ratio/min": 0.002366645261645317, "sampling/sampling_logp_difference/max": 1.6760077476501465, "sampling/sampling_logp_difference/mean": 0.10066822916269302, "step": 467, "step_time": 10.220560265981476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4694256577640772, "epoch": 0.00234, "grad_norm": 0.012326722033321857, "kl": 0.2883693277835846, "learning_rate": 7.999982729134439e-06, "loss": -0.0414, "step": 468, "step_time": 5.2603194279945455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.076923370361328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1216932386159897, "epoch": 0.002345, "frac_reward_zero_std": 0.0, "grad_norm": 0.09851887077093124, "kl": 0.5052860714495182, "learning_rate": 7.99998264908426e-06, "loss": -0.0523, "num_tokens": 6051674.0, "reward": -0.019131958484649658, "reward_std": 1.0970169305801392, "rewards/rollout_reward_func/mean": -0.019131958484649658, "rewards/rollout_reward_func/std": 1.0970169305801392, "sampling/importance_sampling_ratio/max": 1.5130300521850586, "sampling/importance_sampling_ratio/mean": 0.7348606586456299, "sampling/importance_sampling_ratio/min": 8.927239832701162e-05, "sampling/sampling_logp_difference/max": 1.7064074277877808, "sampling/sampling_logp_difference/mean": 0.2434157431125641, "step": 469, "step_time": 13.749693267018301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.129982203245163, "epoch": 0.00235, "grad_norm": 0.10080380737781525, "kl": 0.5051640085875988, "learning_rate": 7.999982568848996e-06, "loss": -0.0525, "step": 470, "step_time": 7.17628287502157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4611295908689499, "epoch": 0.002355, "frac_reward_zero_std": 0.5, "grad_norm": 0.06904445588588715, "kl": 0.2811216115951538, "learning_rate": 7.999982488428647e-06, "loss": -0.0069, "num_tokens": 6071264.0, "reward": 0.7755776643753052, "reward_std": 1.415205955505371, "rewards/rollout_reward_func/mean": 0.7755776643753052, "rewards/rollout_reward_func/std": 1.4152060747146606, "sampling/importance_sampling_ratio/max": 1.2854775190353394, "sampling/importance_sampling_ratio/mean": 0.9998332858085632, "sampling/importance_sampling_ratio/min": 0.016116701066493988, "sampling/sampling_logp_difference/max": 1.8068562746047974, "sampling/sampling_logp_difference/mean": 0.08598697185516357, "step": 471, "step_time": 8.516352188002202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.46887677907943726, "epoch": 0.00236, "grad_norm": 0.07144328951835632, "kl": 0.27828918397426605, "learning_rate": 7.99998240782321e-06, "loss": -0.0072, "step": 472, "step_time": 4.658049918987672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9872406013309956, "epoch": 0.002365, "frac_reward_zero_std": 0.5, "grad_norm": 0.08084413409233093, "kl": 0.38199208676815033, "learning_rate": 7.999982327032687e-06, "loss": -0.0525, "num_tokens": 6088696.0, "reward": 1.9702601432800293, "reward_std": 0.06661711633205414, "rewards/rollout_reward_func/mean": 1.9702601432800293, "rewards/rollout_reward_func/std": 0.06661713123321533, "sampling/importance_sampling_ratio/max": 1.1580827236175537, "sampling/importance_sampling_ratio/mean": 0.9491156339645386, "sampling/importance_sampling_ratio/min": 1.9894000047315785e-07, "sampling/sampling_logp_difference/max": 2.1211001873016357, "sampling/sampling_logp_difference/mean": 0.21905648708343506, "step": 473, "step_time": 6.322487591998652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.994726549834013, "epoch": 0.00237, "grad_norm": 0.0906461551785469, "kl": 0.3664122596383095, "learning_rate": 7.999982246057078e-06, "loss": -0.0522, "step": 474, "step_time": 3.299170865007909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9435404743999243, "epoch": 0.002375, "frac_reward_zero_std": 0.0, "grad_norm": 0.12628567218780518, "kl": 1.1940841674804688, "learning_rate": 7.999982164896383e-06, "loss": -0.0362, "num_tokens": 6118646.0, "reward": 0.5067487955093384, "reward_std": 1.1865557432174683, "rewards/rollout_reward_func/mean": 0.5067487955093384, "rewards/rollout_reward_func/std": 1.1865557432174683, "sampling/importance_sampling_ratio/max": 1.2698591947555542, "sampling/importance_sampling_ratio/mean": 0.7908790111541748, "sampling/importance_sampling_ratio/min": 0.00020503037376329303, "sampling/sampling_logp_difference/max": 1.7997595071792603, "sampling/sampling_logp_difference/mean": 0.2302858531475067, "step": 475, "step_time": 11.421590081008617 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.9502515699714422, "epoch": 0.00238, "grad_norm": 0.12081054598093033, "kl": 1.2476342841982841, "learning_rate": 7.9999820835506e-06, "loss": -0.0365, "step": 476, "step_time": 5.646234564992483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.65511155128479, "epoch": 0.002385, "frac_reward_zero_std": 0.0, "grad_norm": 0.12720178067684174, "kl": 0.42600259743630886, "learning_rate": 7.999982002019732e-06, "loss": -0.0993, "num_tokens": 6149634.0, "reward": 0.5255920886993408, "reward_std": 1.3287718296051025, "rewards/rollout_reward_func/mean": 0.5255920886993408, "rewards/rollout_reward_func/std": 1.3287718296051025, "sampling/importance_sampling_ratio/max": 1.2185980081558228, "sampling/importance_sampling_ratio/mean": 0.6607292890548706, "sampling/importance_sampling_ratio/min": 9.997061715694144e-05, "sampling/sampling_logp_difference/max": 1.4387917518615723, "sampling/sampling_logp_difference/mean": 0.2485845386981964, "step": 477, "step_time": 14.413630585986539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6596583724021912, "epoch": 0.00239, "grad_norm": 0.11696924269199371, "kl": 0.4265151619911194, "learning_rate": 7.999981920303777e-06, "loss": -0.0996, "step": 478, "step_time": 6.739366124980734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.417397577315569, "epoch": 0.002395, "frac_reward_zero_std": 0.0, "grad_norm": 0.10178445279598236, "kl": 0.8408323377370834, "learning_rate": 7.999981838402736e-06, "loss": -0.0665, "num_tokens": 6172382.0, "reward": 1.1429723501205444, "reward_std": 1.2867255210876465, "rewards/rollout_reward_func/mean": 1.1429723501205444, "rewards/rollout_reward_func/std": 1.2867255210876465, "sampling/importance_sampling_ratio/max": 1.2271212339401245, "sampling/importance_sampling_ratio/mean": 0.7857974767684937, "sampling/importance_sampling_ratio/min": 3.425140312174335e-05, "sampling/sampling_logp_difference/max": 1.8663164377212524, "sampling/sampling_logp_difference/mean": 0.2817327380180359, "step": 479, "step_time": 12.316335372030153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4172007478773594, "epoch": 0.0024, "grad_norm": 0.09888622164726257, "kl": 0.8090554177761078, "learning_rate": 7.999981756316607e-06, "loss": -0.0667, "step": 480, "step_time": 6.36847256400506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6111741364002228, "epoch": 0.002405, "frac_reward_zero_std": 0.0, "grad_norm": 0.02089306153357029, "kl": 0.17295720055699348, "learning_rate": 7.999981674045395e-06, "loss": -0.0874, "num_tokens": 6204754.0, "reward": 0.6900660395622253, "reward_std": 1.327754020690918, "rewards/rollout_reward_func/mean": 0.6900660395622253, "rewards/rollout_reward_func/std": 1.327754020690918, "sampling/importance_sampling_ratio/max": 1.1057953834533691, "sampling/importance_sampling_ratio/mean": 0.6156153678894043, "sampling/importance_sampling_ratio/min": 0.0005402083043009043, "sampling/sampling_logp_difference/max": 1.5274577140808105, "sampling/sampling_logp_difference/mean": 0.22229865193367004, "step": 481, "step_time": 14.25726439699065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6128948032855988, "epoch": 0.00241, "grad_norm": 0.021434616297483444, "kl": 0.17189907003194094, "learning_rate": 7.999981591589094e-06, "loss": -0.0874, "step": 482, "step_time": 6.800939545006258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.561001181602478, "epoch": 0.002415, "frac_reward_zero_std": 0.0, "grad_norm": 0.2969132363796234, "kl": 0.28602539002895355, "learning_rate": 7.999981508947708e-06, "loss": -0.0457, "num_tokens": 6232572.0, "reward": 0.10196764022111893, "reward_std": 0.865949809551239, "rewards/rollout_reward_func/mean": 0.10196764022111893, "rewards/rollout_reward_func/std": 0.865949809551239, "sampling/importance_sampling_ratio/max": 1.506679654121399, "sampling/importance_sampling_ratio/mean": 0.8298150300979614, "sampling/importance_sampling_ratio/min": 1.4119045772531535e-05, "sampling/sampling_logp_difference/max": 2.6183533668518066, "sampling/sampling_logp_difference/mean": 0.2891724407672882, "step": 483, "step_time": 13.024738776992308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5641088783740997, "epoch": 0.00242, "grad_norm": 0.2884169816970825, "kl": 0.2858818955719471, "learning_rate": 7.999981426121235e-06, "loss": -0.0465, "step": 484, "step_time": 6.397677962988382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 5.0625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5673259273171425, "epoch": 0.002425, "frac_reward_zero_std": 0.0, "grad_norm": 0.12408023327589035, "kl": 1.146671336144209, "learning_rate": 7.999981343109675e-06, "loss": -0.0568, "num_tokens": 6250634.0, "reward": 0.5684711337089539, "reward_std": 1.403631567955017, "rewards/rollout_reward_func/mean": 0.5684711337089539, "rewards/rollout_reward_func/std": 1.403631567955017, "sampling/importance_sampling_ratio/max": 1.2136898040771484, "sampling/importance_sampling_ratio/mean": 0.8316695690155029, "sampling/importance_sampling_ratio/min": 0.0009474823018535972, "sampling/sampling_logp_difference/max": 1.9933087825775146, "sampling/sampling_logp_difference/mean": 0.14459878206253052, "step": 485, "step_time": 6.6850123569893185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5589953437447548, "epoch": 0.00243, "grad_norm": 0.09558843821287155, "kl": 1.2500575557351112, "learning_rate": 7.99998125991303e-06, "loss": -0.0571, "step": 486, "step_time": 3.391215897005168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.21830543503165245, "epoch": 0.002435, "frac_reward_zero_std": 0.5, "grad_norm": 0.15393854677677155, "kl": 0.3496830016374588, "learning_rate": 7.999981176531298e-06, "loss": 0.0014, "num_tokens": 6272569.0, "reward": 1.4158978462219238, "reward_std": 0.2748876214027405, "rewards/rollout_reward_func/mean": 1.4158978462219238, "rewards/rollout_reward_func/std": 0.2748876214027405, "sampling/importance_sampling_ratio/max": 1.1272974014282227, "sampling/importance_sampling_ratio/mean": 1.071879506111145, "sampling/importance_sampling_ratio/min": 1.028262734413147, "sampling/sampling_logp_difference/max": 0.1146317720413208, "sampling/sampling_logp_difference/mean": 0.02290782332420349, "step": 487, "step_time": 8.917304816990509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21220051124691963, "epoch": 0.00244, "grad_norm": 0.1438068449497223, "kl": 0.3518364354968071, "learning_rate": 7.999981092964481e-06, "loss": 0.002, "step": 488, "step_time": 4.747693480007001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5237013846635818, "epoch": 0.002445, "frac_reward_zero_std": 0.0, "grad_norm": 0.0691908448934555, "kl": 0.3944927603006363, "learning_rate": 7.999981009212575e-06, "loss": -0.0355, "num_tokens": 6300291.0, "reward": 1.2767573595046997, "reward_std": 0.9660421013832092, "rewards/rollout_reward_func/mean": 1.2767573595046997, "rewards/rollout_reward_func/std": 0.9660421013832092, "sampling/importance_sampling_ratio/max": 1.2768981456756592, "sampling/importance_sampling_ratio/mean": 0.976170539855957, "sampling/importance_sampling_ratio/min": 0.002394515322521329, "sampling/sampling_logp_difference/max": 1.8231196403503418, "sampling/sampling_logp_difference/mean": 0.12026410549879074, "step": 489, "step_time": 13.622599484981038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5275178551673889, "epoch": 0.00245, "grad_norm": 0.07328467071056366, "kl": 0.41203343868255615, "learning_rate": 7.999980925275585e-06, "loss": -0.0356, "step": 490, "step_time": 7.184914809011389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.40362266078591347, "epoch": 0.002455, "frac_reward_zero_std": 0.5, "grad_norm": 0.020410405471920967, "kl": 0.3289261981844902, "learning_rate": 7.999980841153509e-06, "loss": -0.0375, "num_tokens": 6318765.0, "reward": 1.7747455835342407, "reward_std": 0.7158984541893005, "rewards/rollout_reward_func/mean": 1.7747455835342407, "rewards/rollout_reward_func/std": 0.7158984541893005, "sampling/importance_sampling_ratio/max": 1.1248624324798584, "sampling/importance_sampling_ratio/mean": 0.9927445650100708, "sampling/importance_sampling_ratio/min": 0.014152035117149353, "sampling/sampling_logp_difference/max": 0.8894553184509277, "sampling/sampling_logp_difference/mean": 0.0620061531662941, "step": 491, "step_time": 7.200246202002745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4037201441824436, "epoch": 0.00246, "grad_norm": 0.019315233454108238, "kl": 0.3292369954288006, "learning_rate": 7.999980756846344e-06, "loss": -0.0376, "step": 492, "step_time": 3.7744137169938767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8190887682139874, "epoch": 0.002465, "frac_reward_zero_std": 0.0, "grad_norm": 0.12754972279071808, "kl": 0.20410453900694847, "learning_rate": 7.999980672354094e-06, "loss": -0.0843, "num_tokens": 6342374.0, "reward": 0.24114251136779785, "reward_std": 1.3214725255966187, "rewards/rollout_reward_func/mean": 0.24114251136779785, "rewards/rollout_reward_func/std": 1.3214725255966187, "sampling/importance_sampling_ratio/max": 1.1809450387954712, "sampling/importance_sampling_ratio/mean": 0.8479908108711243, "sampling/importance_sampling_ratio/min": 0.017436599358916283, "sampling/sampling_logp_difference/max": 0.8484154939651489, "sampling/sampling_logp_difference/mean": 0.12572802603244781, "step": 493, "step_time": 10.429189219983527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8162967376410961, "epoch": 0.00247, "grad_norm": 0.1365584135055542, "kl": 0.20355375297367573, "learning_rate": 7.99998058767676e-06, "loss": -0.0845, "step": 494, "step_time": 4.9933796629920835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5424856618046761, "epoch": 0.002475, "frac_reward_zero_std": 0.0, "grad_norm": 0.1336805671453476, "kl": 0.5673642121255398, "learning_rate": 7.999980502814336e-06, "loss": -0.0501, "num_tokens": 6366186.0, "reward": 1.4772419929504395, "reward_std": 1.011094093322754, "rewards/rollout_reward_func/mean": 1.4772419929504395, "rewards/rollout_reward_func/std": 1.011094093322754, "sampling/importance_sampling_ratio/max": 1.2206122875213623, "sampling/importance_sampling_ratio/mean": 0.9550117254257202, "sampling/importance_sampling_ratio/min": 0.003142749657854438, "sampling/sampling_logp_difference/max": 1.1099162101745605, "sampling/sampling_logp_difference/mean": 0.10790657997131348, "step": 495, "step_time": 12.241791116030072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5403816103935242, "epoch": 0.00248, "grad_norm": 0.13119180500507355, "kl": 0.5674606896936893, "learning_rate": 7.999980417766827e-06, "loss": -0.0501, "step": 496, "step_time": 6.386017968005035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0988854765892029, "epoch": 0.002485, "frac_reward_zero_std": 0.0, "grad_norm": 0.22090013325214386, "kl": 1.1752337934449315, "learning_rate": 7.999980332534234e-06, "loss": -0.0738, "num_tokens": 6394310.0, "reward": 0.907441258430481, "reward_std": 1.148722767829895, "rewards/rollout_reward_func/mean": 0.907441258430481, "rewards/rollout_reward_func/std": 1.148722767829895, "sampling/importance_sampling_ratio/max": 1.2457622289657593, "sampling/importance_sampling_ratio/mean": 0.7999842166900635, "sampling/importance_sampling_ratio/min": 0.00027404740103520453, "sampling/sampling_logp_difference/max": 1.6804604530334473, "sampling/sampling_logp_difference/mean": 0.18931955099105835, "step": 497, "step_time": 13.773632060008822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0996078103780746, "epoch": 0.00249, "grad_norm": 0.1885000765323639, "kl": 1.0009379666298628, "learning_rate": 7.999980247116551e-06, "loss": -0.0745, "step": 498, "step_time": 6.768159675993957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2684195190668106, "epoch": 0.002495, "frac_reward_zero_std": 0.0, "grad_norm": 0.08863916248083115, "kl": 0.5890300050377846, "learning_rate": 7.999980161513784e-06, "loss": -0.0511, "num_tokens": 6419741.0, "reward": 0.746935248374939, "reward_std": 1.3621582984924316, "rewards/rollout_reward_func/mean": 0.746935248374939, "rewards/rollout_reward_func/std": 1.3621582984924316, "sampling/importance_sampling_ratio/max": 1.2299422025680542, "sampling/importance_sampling_ratio/mean": 0.7683126926422119, "sampling/importance_sampling_ratio/min": 0.0018141199834644794, "sampling/sampling_logp_difference/max": 1.8927087783813477, "sampling/sampling_logp_difference/mean": 0.20738078653812408, "step": 499, "step_time": 12.499815189963556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2680089101195335, "epoch": 0.0025, "grad_norm": 0.0880972146987915, "kl": 0.5756551586091518, "learning_rate": 7.999980075725931e-06, "loss": -0.051, "step": 500, "step_time": 5.9796981399995275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.733333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.251592829823494, "epoch": 0.002505, "frac_reward_zero_std": 0.0, "grad_norm": 0.07434789836406708, "kl": 1.0317162349820137, "learning_rate": 7.999979989752992e-06, "loss": -0.0709, "num_tokens": 6442887.0, "reward": 0.37177810072898865, "reward_std": 1.2283217906951904, "rewards/rollout_reward_func/mean": 0.37177810072898865, "rewards/rollout_reward_func/std": 1.2283217906951904, "sampling/importance_sampling_ratio/max": 1.1280419826507568, "sampling/importance_sampling_ratio/mean": 0.7394840717315674, "sampling/importance_sampling_ratio/min": 0.00011076540249632671, "sampling/sampling_logp_difference/max": 2.102816581726074, "sampling/sampling_logp_difference/mean": 0.3031332492828369, "step": 501, "step_time": 9.964381512007094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2523694783449173, "epoch": 0.00251, "grad_norm": 0.07146969437599182, "kl": 1.0137510150671005, "learning_rate": 7.999979903594967e-06, "loss": -0.0709, "step": 502, "step_time": 5.07566265501373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.733333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0255364328622818, "epoch": 0.002515, "frac_reward_zero_std": 0.0, "grad_norm": 0.18727369606494904, "kl": 0.6188373640179634, "learning_rate": 7.999979817251854e-06, "loss": -0.0224, "num_tokens": 6477305.0, "reward": 0.5436493754386902, "reward_std": 1.2236436605453491, "rewards/rollout_reward_func/mean": 0.5436493754386902, "rewards/rollout_reward_func/std": 1.2236436605453491, "sampling/importance_sampling_ratio/max": 1.3630492687225342, "sampling/importance_sampling_ratio/mean": 0.8213285207748413, "sampling/importance_sampling_ratio/min": 2.2192016331246123e-06, "sampling/sampling_logp_difference/max": 2.716128349304199, "sampling/sampling_logp_difference/mean": 0.2515709102153778, "step": 503, "step_time": 13.397927051002625 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.037135623395443, "epoch": 0.00252, "grad_norm": 0.18405041098594666, "kl": 0.5664294213056564, "learning_rate": 7.999979730723655e-06, "loss": -0.023, "step": 504, "step_time": 6.97538136599178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0857472121715546, "epoch": 0.002525, "frac_reward_zero_std": 0.0, "grad_norm": 0.12168185412883759, "kl": 0.3658214285969734, "learning_rate": 7.99997964401037e-06, "loss": -0.0663, "num_tokens": 6497692.0, "reward": 0.8613289594650269, "reward_std": 1.3914527893066406, "rewards/rollout_reward_func/mean": 0.8613289594650269, "rewards/rollout_reward_func/std": 1.3914527893066406, "sampling/importance_sampling_ratio/max": 1.1394929885864258, "sampling/importance_sampling_ratio/mean": 0.8228534460067749, "sampling/importance_sampling_ratio/min": 5.36809311597608e-05, "sampling/sampling_logp_difference/max": 1.9341065883636475, "sampling/sampling_logp_difference/mean": 0.21785804629325867, "step": 505, "step_time": 9.333752619000734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0944692566990852, "epoch": 0.00253, "grad_norm": 0.12354294210672379, "kl": 0.36399875581264496, "learning_rate": 7.999979557112e-06, "loss": -0.0668, "step": 506, "step_time": 4.924225028982619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4582938067615032, "epoch": 0.002535, "frac_reward_zero_std": 0.0, "grad_norm": 0.03272097930312157, "kl": 0.4189250022172928, "learning_rate": 7.999979470028542e-06, "loss": -0.0485, "num_tokens": 6525077.0, "reward": 1.4299259185791016, "reward_std": 0.9375359416007996, "rewards/rollout_reward_func/mean": 1.4299259185791016, "rewards/rollout_reward_func/std": 0.9375359416007996, "sampling/importance_sampling_ratio/max": 1.1364295482635498, "sampling/importance_sampling_ratio/mean": 0.948112964630127, "sampling/importance_sampling_ratio/min": 0.000385700142942369, "sampling/sampling_logp_difference/max": 1.931056261062622, "sampling/sampling_logp_difference/mean": 0.12179645895957947, "step": 507, "step_time": 13.411584832996596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.46182854659855366, "epoch": 0.00254, "grad_norm": 0.04102373495697975, "kl": 0.39868904650211334, "learning_rate": 7.99997938276e-06, "loss": -0.0484, "step": 508, "step_time": 7.00611600697448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.600000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8895803410559893, "epoch": 0.002545, "frac_reward_zero_std": 0.0, "grad_norm": 0.03878353536128998, "kl": 0.5473796352744102, "learning_rate": 7.999979295306369e-06, "loss": -0.0796, "num_tokens": 6552323.0, "reward": 1.1005828380584717, "reward_std": 1.2486586570739746, "rewards/rollout_reward_func/mean": 1.1005828380584717, "rewards/rollout_reward_func/std": 1.2486587762832642, "sampling/importance_sampling_ratio/max": 1.2756950855255127, "sampling/importance_sampling_ratio/mean": 0.8662576675415039, "sampling/importance_sampling_ratio/min": 0.006246987264603376, "sampling/sampling_logp_difference/max": 1.0695586204528809, "sampling/sampling_logp_difference/mean": 0.16718845069408417, "step": 509, "step_time": 10.25968659199134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8913229405879974, "epoch": 0.00255, "grad_norm": 0.04198608919978142, "kl": 0.5669384151697159, "learning_rate": 7.999979207667654e-06, "loss": -0.0796, "step": 510, "step_time": 5.27675842201279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2865487933158875, "epoch": 0.002555, "frac_reward_zero_std": 0.0, "grad_norm": 0.3097732961177826, "kl": 2.08723895996809, "learning_rate": 7.999979119843853e-06, "loss": -0.0428, "num_tokens": 6590306.0, "reward": 0.33544689416885376, "reward_std": 0.9254790544509888, "rewards/rollout_reward_func/mean": 0.33544689416885376, "rewards/rollout_reward_func/std": 0.9254790544509888, "sampling/importance_sampling_ratio/max": 1.4095392227172852, "sampling/importance_sampling_ratio/mean": 0.7836941480636597, "sampling/importance_sampling_ratio/min": 0.00013884284999221563, "sampling/sampling_logp_difference/max": 1.5991500616073608, "sampling/sampling_logp_difference/mean": 0.2575099766254425, "step": 511, "step_time": 14.070530892000534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.2942427396774292, "epoch": 0.00256, "grad_norm": 0.24447816610336304, "kl": 1.6471002474427223, "learning_rate": 7.999979031834965e-06, "loss": -0.0442, "step": 512, "step_time": 7.050692931996309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 4.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.598793774843216, "epoch": 0.002565, "frac_reward_zero_std": 0.0, "grad_norm": 0.0376322902739048, "kl": 0.14140948466956615, "learning_rate": 7.99997894364099e-06, "loss": -0.0801, "num_tokens": 6618800.0, "reward": 0.34568846225738525, "reward_std": 1.1680344343185425, "rewards/rollout_reward_func/mean": 0.34568846225738525, "rewards/rollout_reward_func/std": 1.168034553527832, "sampling/importance_sampling_ratio/max": 1.3081456422805786, "sampling/importance_sampling_ratio/mean": 0.7000802755355835, "sampling/importance_sampling_ratio/min": 0.0022847915533930063, "sampling/sampling_logp_difference/max": 1.174647569656372, "sampling/sampling_logp_difference/mean": 0.22068078815937042, "step": 513, "step_time": 11.2932045369962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6046181619167328, "epoch": 0.00257, "grad_norm": 0.040399570018053055, "kl": 0.14097189530730247, "learning_rate": 7.99997885526193e-06, "loss": -0.08, "step": 514, "step_time": 5.27028716399218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6521321833133698, "epoch": 0.002575, "frac_reward_zero_std": 0.0, "grad_norm": 0.1310352385044098, "kl": 0.18617416452616453, "learning_rate": 7.999978766697782e-06, "loss": -0.0525, "num_tokens": 6649174.0, "reward": -0.18262097239494324, "reward_std": 0.8079031109809875, "rewards/rollout_reward_func/mean": -0.18262097239494324, "rewards/rollout_reward_func/std": 0.8079031705856323, "sampling/importance_sampling_ratio/max": 1.2260057926177979, "sampling/importance_sampling_ratio/mean": 0.7114049196243286, "sampling/importance_sampling_ratio/min": 3.292866676929407e-05, "sampling/sampling_logp_difference/max": 1.6062489748001099, "sampling/sampling_logp_difference/mean": 0.2493038773536682, "step": 515, "step_time": 13.269191853003576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6532468795776367, "epoch": 0.00258, "grad_norm": 0.13068822026252747, "kl": 0.18302083481103182, "learning_rate": 7.99997867794855e-06, "loss": -0.0525, "step": 516, "step_time": 6.330207495004288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0544071793556213, "epoch": 0.002585, "frac_reward_zero_std": 0.0, "grad_norm": 0.25056636333465576, "kl": 0.2849878668785095, "learning_rate": 7.99997858901423e-06, "loss": -0.0871, "num_tokens": 6678933.0, "reward": 0.5825240015983582, "reward_std": 0.9604540467262268, "rewards/rollout_reward_func/mean": 0.5825240015983582, "rewards/rollout_reward_func/std": 0.9604541063308716, "sampling/importance_sampling_ratio/max": 1.2588343620300293, "sampling/importance_sampling_ratio/mean": 0.6074012517929077, "sampling/importance_sampling_ratio/min": 4.471932697924785e-06, "sampling/sampling_logp_difference/max": 2.1211941242218018, "sampling/sampling_logp_difference/mean": 0.3143550455570221, "step": 517, "step_time": 13.412180308019742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0586979687213898, "epoch": 0.00259, "grad_norm": 0.23506049811840057, "kl": 0.3002568185329437, "learning_rate": 7.999978499894826e-06, "loss": -0.0876, "step": 518, "step_time": 6.32322589000978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5005562417209148, "epoch": 0.002595, "frac_reward_zero_std": 0.5, "grad_norm": 0.025177177041769028, "kl": 0.28337180987000465, "learning_rate": 7.999978410590335e-06, "loss": -0.039, "num_tokens": 6697771.0, "reward": 1.4468756914138794, "reward_std": 0.6267223954200745, "rewards/rollout_reward_func/mean": 1.4468756914138794, "rewards/rollout_reward_func/std": 0.6267223954200745, "sampling/importance_sampling_ratio/max": 1.0703877210617065, "sampling/importance_sampling_ratio/mean": 0.9752093553543091, "sampling/importance_sampling_ratio/min": 0.003431200049817562, "sampling/sampling_logp_difference/max": 1.0647664070129395, "sampling/sampling_logp_difference/mean": 0.07879706472158432, "step": 519, "step_time": 7.670617723008036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49639787152409554, "epoch": 0.0026, "grad_norm": 0.023837115615606308, "kl": 0.28395402804017067, "learning_rate": 7.999978321100757e-06, "loss": -0.039, "step": 520, "step_time": 3.9980501149839256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7905773296952248, "epoch": 0.002605, "frac_reward_zero_std": 0.0, "grad_norm": 0.024086618795990944, "kl": 0.31568699330091476, "learning_rate": 7.999978231426094e-06, "loss": -0.0767, "num_tokens": 6715714.0, "reward": 1.8018944263458252, "reward_std": 0.7230508923530579, "rewards/rollout_reward_func/mean": 1.8018944263458252, "rewards/rollout_reward_func/std": 0.7230509519577026, "sampling/importance_sampling_ratio/max": 1.0836944580078125, "sampling/importance_sampling_ratio/mean": 0.9044151306152344, "sampling/importance_sampling_ratio/min": 2.0039249648107216e-05, "sampling/sampling_logp_difference/max": 1.7730042934417725, "sampling/sampling_logp_difference/mean": 0.18070559203624725, "step": 521, "step_time": 7.303897204983514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7871853690594435, "epoch": 0.00261, "grad_norm": 0.02352083846926689, "kl": 0.32468292117118835, "learning_rate": 7.999978141566344e-06, "loss": -0.0768, "step": 522, "step_time": 3.8486077089910395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5782481655478477, "epoch": 0.002615, "frac_reward_zero_std": 0.0, "grad_norm": 0.06897708773612976, "kl": 0.46095839887857437, "learning_rate": 7.999978051521509e-06, "loss": -0.0603, "num_tokens": 6736784.0, "reward": 1.6070077419281006, "reward_std": 0.7290248274803162, "rewards/rollout_reward_func/mean": 1.6070077419281006, "rewards/rollout_reward_func/std": 0.7290248870849609, "sampling/importance_sampling_ratio/max": 1.068511962890625, "sampling/importance_sampling_ratio/mean": 0.9126459360122681, "sampling/importance_sampling_ratio/min": 0.0017516309162601829, "sampling/sampling_logp_difference/max": 1.4533495903015137, "sampling/sampling_logp_difference/mean": 0.11044710874557495, "step": 523, "step_time": 9.273861312991357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5731345303356647, "epoch": 0.00262, "grad_norm": 0.06878531724214554, "kl": 0.4621454104781151, "learning_rate": 7.999977961291588e-06, "loss": -0.0603, "step": 524, "step_time": 4.869353818998206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 5.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2384706139564514, "epoch": 0.002625, "frac_reward_zero_std": 0.0, "grad_norm": 0.04691416770219803, "kl": 0.2521306099370122, "learning_rate": 7.99997787087658e-06, "loss": -0.0903, "num_tokens": 6759914.0, "reward": -0.25448504090309143, "reward_std": 1.0253801345825195, "rewards/rollout_reward_func/mean": -0.25448504090309143, "rewards/rollout_reward_func/std": 1.0253801345825195, "sampling/importance_sampling_ratio/max": 1.1432262659072876, "sampling/importance_sampling_ratio/mean": 0.588655948638916, "sampling/importance_sampling_ratio/min": 1.6821596773297642e-06, "sampling/sampling_logp_difference/max": 2.0847649574279785, "sampling/sampling_logp_difference/mean": 0.3593083620071411, "step": 525, "step_time": 10.353705607005395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.234217047691345, "epoch": 0.00263, "grad_norm": 0.04492419585585594, "kl": 0.24451035261154175, "learning_rate": 7.999977780276485e-06, "loss": -0.0905, "step": 526, "step_time": 4.8125794119841885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 3.7857143878936768, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8231585919857025, "epoch": 0.002635, "frac_reward_zero_std": 0.0, "grad_norm": 0.17402522265911102, "kl": 0.24453449994325638, "learning_rate": 7.999977689491306e-06, "loss": -0.0515, "num_tokens": 6790338.0, "reward": 0.48393404483795166, "reward_std": 1.1698081493377686, "rewards/rollout_reward_func/mean": 0.48393404483795166, "rewards/rollout_reward_func/std": 1.169808268547058, "sampling/importance_sampling_ratio/max": 1.2208305597305298, "sampling/importance_sampling_ratio/mean": 0.9338998794555664, "sampling/importance_sampling_ratio/min": 0.013158729299902916, "sampling/sampling_logp_difference/max": 0.8634169101715088, "sampling/sampling_logp_difference/mean": 0.12871788442134857, "step": 527, "step_time": 13.583561778024887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8192494064569473, "epoch": 0.00264, "grad_norm": 0.1729569435119629, "kl": 0.24892208725214005, "learning_rate": 7.99997759852104e-06, "loss": -0.0522, "step": 528, "step_time": 6.92410544998711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7500503659248352, "epoch": 0.002645, "frac_reward_zero_std": 0.0, "grad_norm": 0.18368075788021088, "kl": 0.1868999134749174, "learning_rate": 7.999977507365686e-06, "loss": -0.0046, "num_tokens": 6817647.0, "reward": -0.6176737546920776, "reward_std": 0.4851016700267792, "rewards/rollout_reward_func/mean": -0.6176737546920776, "rewards/rollout_reward_func/std": 0.4851016700267792, "sampling/importance_sampling_ratio/max": 1.0552030801773071, "sampling/importance_sampling_ratio/mean": 0.5883631706237793, "sampling/importance_sampling_ratio/min": 0.0004260851419530809, "sampling/sampling_logp_difference/max": 1.106400489807129, "sampling/sampling_logp_difference/mean": 0.2593179941177368, "step": 529, "step_time": 13.398483514029067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7579559087753296, "epoch": 0.00265, "grad_norm": 0.18665601313114166, "kl": 0.19256324134767056, "learning_rate": 7.999977416025248e-06, "loss": -0.0051, "step": 530, "step_time": 6.645680824993178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9745768383145332, "epoch": 0.002655, "frac_reward_zero_std": 0.5, "grad_norm": 0.0946330726146698, "kl": 1.7002102695405483, "learning_rate": 7.999977324499723e-06, "loss": -0.0542, "num_tokens": 6841507.0, "reward": 1.4922842979431152, "reward_std": 0.8858503699302673, "rewards/rollout_reward_func/mean": 1.4922842979431152, "rewards/rollout_reward_func/std": 0.8858503699302673, "sampling/importance_sampling_ratio/max": 1.1645128726959229, "sampling/importance_sampling_ratio/mean": 0.8657066226005554, "sampling/importance_sampling_ratio/min": 5.736904995501391e-07, "sampling/sampling_logp_difference/max": 2.40616512298584, "sampling/sampling_logp_difference/mean": 0.21544793248176575, "step": 531, "step_time": 11.76412367298326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9683554396033287, "epoch": 0.00266, "grad_norm": 0.0862419530749321, "kl": 1.6171714179217815, "learning_rate": 7.999977232789113e-06, "loss": -0.0543, "step": 532, "step_time": 6.259639578012866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 4.888888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9494810700416565, "epoch": 0.002665, "frac_reward_zero_std": 0.0, "grad_norm": 0.07358116656541824, "kl": 0.41880615055561066, "learning_rate": 7.999977140893417e-06, "loss": -0.0433, "num_tokens": 6865638.0, "reward": 0.27770107984542847, "reward_std": 1.1867843866348267, "rewards/rollout_reward_func/mean": 0.27770107984542847, "rewards/rollout_reward_func/std": 1.1867843866348267, "sampling/importance_sampling_ratio/max": 1.1242294311523438, "sampling/importance_sampling_ratio/mean": 0.4152607023715973, "sampling/importance_sampling_ratio/min": 0.00019949641136918217, "sampling/sampling_logp_difference/max": 1.8333444595336914, "sampling/sampling_logp_difference/mean": 0.3338293433189392, "step": 533, "step_time": 13.047631844005082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9501871466636658, "epoch": 0.00267, "grad_norm": 0.07746844738721848, "kl": 0.4086383357644081, "learning_rate": 7.999977048812635e-06, "loss": -0.0436, "step": 534, "step_time": 5.878467037997325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.612856512889266, "epoch": 0.002675, "frac_reward_zero_std": 0.5, "grad_norm": 0.03320863097906113, "kl": 0.250710878521204, "learning_rate": 7.999976956546766e-06, "loss": -0.0361, "num_tokens": 6888509.0, "reward": 0.34414228796958923, "reward_std": 1.3430521488189697, "rewards/rollout_reward_func/mean": 0.34414228796958923, "rewards/rollout_reward_func/std": 1.3430521488189697, "sampling/importance_sampling_ratio/max": 1.1471534967422485, "sampling/importance_sampling_ratio/mean": 0.9830490350723267, "sampling/importance_sampling_ratio/min": 1.6307059922837652e-05, "sampling/sampling_logp_difference/max": 1.6492571830749512, "sampling/sampling_logp_difference/mean": 0.13808532059192657, "step": 535, "step_time": 12.083631584013347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6157355979084969, "epoch": 0.00268, "grad_norm": 0.03246904909610748, "kl": 0.2504003308713436, "learning_rate": 7.99997686409581e-06, "loss": -0.0361, "step": 536, "step_time": 6.6448454199999105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7440853659063578, "epoch": 0.002685, "frac_reward_zero_std": 0.0, "grad_norm": 0.1273319125175476, "kl": 1.270368404686451, "learning_rate": 7.99997677145977e-06, "loss": -0.0566, "num_tokens": 6911265.0, "reward": 1.2024462223052979, "reward_std": 1.1307733058929443, "rewards/rollout_reward_func/mean": 1.2024462223052979, "rewards/rollout_reward_func/std": 1.1307734251022339, "sampling/importance_sampling_ratio/max": 1.7708251476287842, "sampling/importance_sampling_ratio/mean": 0.8805896639823914, "sampling/importance_sampling_ratio/min": 0.003864174010232091, "sampling/sampling_logp_difference/max": 1.6176624298095703, "sampling/sampling_logp_difference/mean": 0.1386660635471344, "step": 537, "step_time": 10.38291095099703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7456217557191849, "epoch": 0.00269, "grad_norm": 0.11716163903474808, "kl": 1.2062959149479866, "learning_rate": 7.999976678638642e-06, "loss": -0.0566, "step": 538, "step_time": 5.787487556983251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.20178353786468506, "epoch": 0.002695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007676857057958841, "kl": 0.241325531154871, "learning_rate": 7.99997658563243e-06, "loss": 0.0006, "num_tokens": 6928739.0, "reward": 1.9452074766159058, "reward_std": 0.0467989444732666, "rewards/rollout_reward_func/mean": 1.9452074766159058, "rewards/rollout_reward_func/std": 0.0467989444732666, "sampling/importance_sampling_ratio/max": 1.0851924419403076, "sampling/importance_sampling_ratio/mean": 1.0471030473709106, "sampling/importance_sampling_ratio/min": 1.0176457166671753, "sampling/sampling_logp_difference/max": 0.07860223948955536, "sampling/sampling_logp_difference/mean": 0.01615098863840103, "step": 539, "step_time": 6.007852098991862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20589907467365265, "epoch": 0.0027, "grad_norm": 0.0007801689789630473, "kl": 0.2409198135137558, "learning_rate": 7.999976492441131e-06, "loss": 0.0006, "step": 540, "step_time": 3.2816557869809913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.11318102478981, "epoch": 0.002705, "frac_reward_zero_std": 0.0, "grad_norm": 0.09958460181951523, "kl": 0.24184574466198683, "learning_rate": 7.999976399064746e-06, "loss": -0.0874, "num_tokens": 6953345.0, "reward": 0.065614715218544, "reward_std": 1.1313292980194092, "rewards/rollout_reward_func/mean": 0.065614715218544, "rewards/rollout_reward_func/std": 1.1313292980194092, "sampling/importance_sampling_ratio/max": 1.080618143081665, "sampling/importance_sampling_ratio/mean": 0.5795111656188965, "sampling/importance_sampling_ratio/min": 9.33749251998961e-05, "sampling/sampling_logp_difference/max": 1.842895746231079, "sampling/sampling_logp_difference/mean": 0.34388232231140137, "step": 541, "step_time": 12.552704178000567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1139875650405884, "epoch": 0.00271, "grad_norm": 0.1004953533411026, "kl": 0.2249519983306527, "learning_rate": 7.999976305503275e-06, "loss": -0.0875, "step": 542, "step_time": 6.2149195579841034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 6.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.921658143401146, "epoch": 0.002715, "frac_reward_zero_std": 0.0, "grad_norm": 0.09859180450439453, "kl": 0.3564767502248287, "learning_rate": 7.999976211756718e-06, "loss": -0.0889, "num_tokens": 6985789.0, "reward": 0.6643190383911133, "reward_std": 1.200286626815796, "rewards/rollout_reward_func/mean": 0.6643190383911133, "rewards/rollout_reward_func/std": 1.200286626815796, "sampling/importance_sampling_ratio/max": 1.099629282951355, "sampling/importance_sampling_ratio/mean": 0.5398083925247192, "sampling/importance_sampling_ratio/min": 0.00035465043038129807, "sampling/sampling_logp_difference/max": 1.8581064939498901, "sampling/sampling_logp_difference/mean": 0.3220486044883728, "step": 543, "step_time": 13.662358886998845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.929307222366333, "epoch": 0.00272, "grad_norm": 0.09410881251096725, "kl": 0.35559053905308247, "learning_rate": 7.999976117825075e-06, "loss": -0.0887, "step": 544, "step_time": 6.733783321004012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2059092335402966, "epoch": 0.002725, "frac_reward_zero_std": 0.0, "grad_norm": 0.04200951009988785, "kl": 0.20696384459733963, "learning_rate": 7.999976023708346e-06, "loss": -0.0853, "num_tokens": 7009393.0, "reward": 1.385894536972046, "reward_std": 1.0595591068267822, "rewards/rollout_reward_func/mean": 1.385894536972046, "rewards/rollout_reward_func/std": 1.0595592260360718, "sampling/importance_sampling_ratio/max": 1.1139559745788574, "sampling/importance_sampling_ratio/mean": 0.8033786416053772, "sampling/importance_sampling_ratio/min": 5.145874820300378e-05, "sampling/sampling_logp_difference/max": 2.0510199069976807, "sampling/sampling_logp_difference/mean": 0.26207077503204346, "step": 545, "step_time": 10.62696827801119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2026693597435951, "epoch": 0.00273, "grad_norm": 0.04319683089852333, "kl": 0.20581671595573425, "learning_rate": 7.999975929406532e-06, "loss": -0.0853, "step": 546, "step_time": 5.577903023004183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 6.357142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8576257079839706, "epoch": 0.002735, "frac_reward_zero_std": 0.0, "grad_norm": 0.07923827320337296, "kl": 0.18170195631682873, "learning_rate": 7.99997583491963e-06, "loss": -0.0967, "num_tokens": 7034943.0, "reward": 0.3995704650878906, "reward_std": 1.3251234292984009, "rewards/rollout_reward_func/mean": 0.3995704650878906, "rewards/rollout_reward_func/std": 1.3251234292984009, "sampling/importance_sampling_ratio/max": 1.4391162395477295, "sampling/importance_sampling_ratio/mean": 0.6977283358573914, "sampling/importance_sampling_ratio/min": 2.5967610781663097e-05, "sampling/sampling_logp_difference/max": 1.9947060346603394, "sampling/sampling_logp_difference/mean": 0.2968503534793854, "step": 547, "step_time": 11.575844785984373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.859519898891449, "epoch": 0.00274, "grad_norm": 0.08187327533960342, "kl": 0.18231757916510105, "learning_rate": 7.999975740247644e-06, "loss": -0.0969, "step": 548, "step_time": 6.493669203991885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9427269250154495, "epoch": 0.002745, "frac_reward_zero_std": 0.5, "grad_norm": 0.020228777080774307, "kl": 0.2723778188228607, "learning_rate": 7.99997564539057e-06, "loss": 0.0107, "num_tokens": 7059416.0, "reward": 0.7727701663970947, "reward_std": 1.2582567930221558, "rewards/rollout_reward_func/mean": 0.7727701663970947, "rewards/rollout_reward_func/std": 1.2582567930221558, "sampling/importance_sampling_ratio/max": 1.2194184064865112, "sampling/importance_sampling_ratio/mean": 0.8736177682876587, "sampling/importance_sampling_ratio/min": 0.00015105005877558142, "sampling/sampling_logp_difference/max": 2.3397274017333984, "sampling/sampling_logp_difference/mean": 0.19345982372760773, "step": 549, "step_time": 10.774916508016759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9363944828510284, "epoch": 0.00275, "grad_norm": 0.02044812962412834, "kl": 0.27313268184661865, "learning_rate": 7.999975550348412e-06, "loss": 0.0107, "step": 550, "step_time": 5.312095030007185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.897483617067337, "epoch": 0.002755, "frac_reward_zero_std": 0.0, "grad_norm": 0.09015368670225143, "kl": 0.47092315554618835, "learning_rate": 7.999975455121166e-06, "loss": -0.0697, "num_tokens": 7083985.0, "reward": 1.1456127166748047, "reward_std": 1.1458780765533447, "rewards/rollout_reward_func/mean": 1.1456127166748047, "rewards/rollout_reward_func/std": 1.1458780765533447, "sampling/importance_sampling_ratio/max": 1.2421619892120361, "sampling/importance_sampling_ratio/mean": 0.7908700704574585, "sampling/importance_sampling_ratio/min": 6.351005140459165e-05, "sampling/sampling_logp_difference/max": 1.8431094884872437, "sampling/sampling_logp_difference/mean": 0.23921705782413483, "step": 551, "step_time": 12.41734276599891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8995065428316593, "epoch": 0.00276, "grad_norm": 0.08284864574670792, "kl": 0.4879816137254238, "learning_rate": 7.999975359708836e-06, "loss": -0.0698, "step": 552, "step_time": 6.378164115012623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 5.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5961823165416718, "epoch": 0.002765, "frac_reward_zero_std": 0.0, "grad_norm": 0.04229619354009628, "kl": 0.4444999396800995, "learning_rate": 7.99997526411142e-06, "loss": -0.0484, "num_tokens": 7108770.0, "reward": 0.17026004195213318, "reward_std": 1.343947172164917, "rewards/rollout_reward_func/mean": 0.17026004195213318, "rewards/rollout_reward_func/std": 1.3439472913742065, "sampling/importance_sampling_ratio/max": 1.0977575778961182, "sampling/importance_sampling_ratio/mean": 0.7467529773712158, "sampling/importance_sampling_ratio/min": 1.860768634287524e-06, "sampling/sampling_logp_difference/max": 1.7489298582077026, "sampling/sampling_logp_difference/mean": 0.30339670181274414, "step": 553, "step_time": 12.70330739399651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5946721136569977, "epoch": 0.00277, "grad_norm": 0.04112646356225014, "kl": 0.451421070843935, "learning_rate": 7.999975168328916e-06, "loss": -0.0484, "step": 554, "step_time": 7.017887363006594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3152501285076141, "epoch": 0.002775, "frac_reward_zero_std": 0.0, "grad_norm": 0.13560250401496887, "kl": 0.26600614562630653, "learning_rate": 7.999975072361328e-06, "loss": -0.0956, "num_tokens": 7138346.0, "reward": 0.671912670135498, "reward_std": 1.275866985321045, "rewards/rollout_reward_func/mean": 0.671912670135498, "rewards/rollout_reward_func/std": 1.275866985321045, "sampling/importance_sampling_ratio/max": 1.20643949508667, "sampling/importance_sampling_ratio/mean": 0.6379274129867554, "sampling/importance_sampling_ratio/min": 0.002966269152238965, "sampling/sampling_logp_difference/max": 1.449935793876648, "sampling/sampling_logp_difference/mean": 0.20839455723762512, "step": 555, "step_time": 12.944666922980105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3129764795303345, "epoch": 0.00278, "grad_norm": 0.12852096557617188, "kl": 0.27514483965933323, "learning_rate": 7.999974976208653e-06, "loss": -0.0963, "step": 556, "step_time": 6.676254944017273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6538712084293365, "epoch": 0.002785, "frac_reward_zero_std": 0.0, "grad_norm": 0.16535523533821106, "kl": 0.8638176508247852, "learning_rate": 7.999974879870894e-06, "loss": -0.049, "num_tokens": 7169027.0, "reward": 0.35946446657180786, "reward_std": 1.0150833129882812, "rewards/rollout_reward_func/mean": 0.35946446657180786, "rewards/rollout_reward_func/std": 1.0150833129882812, "sampling/importance_sampling_ratio/max": 1.2878841161727905, "sampling/importance_sampling_ratio/mean": 0.6235902309417725, "sampling/importance_sampling_ratio/min": 4.613341388903791e-06, "sampling/sampling_logp_difference/max": 1.9293179512023926, "sampling/sampling_logp_difference/mean": 0.3470953702926636, "step": 557, "step_time": 13.20068631398317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6512252688407898, "epoch": 0.00279, "grad_norm": 0.1573704183101654, "kl": 0.849974013864994, "learning_rate": 7.999974783348047e-06, "loss": -0.0492, "step": 558, "step_time": 6.9114774290064815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2285005822777748, "epoch": 0.002795, "frac_reward_zero_std": 0.0, "grad_norm": 0.07741666585206985, "kl": 0.2274778075516224, "learning_rate": 7.999974686640113e-06, "loss": -0.0732, "num_tokens": 7198740.0, "reward": 0.7217791676521301, "reward_std": 1.1803439855575562, "rewards/rollout_reward_func/mean": 0.7217791676521301, "rewards/rollout_reward_func/std": 1.1803441047668457, "sampling/importance_sampling_ratio/max": 1.217275619506836, "sampling/importance_sampling_ratio/mean": 0.7853485345840454, "sampling/importance_sampling_ratio/min": 0.0007779997540637851, "sampling/sampling_logp_difference/max": 1.240645170211792, "sampling/sampling_logp_difference/mean": 0.1917411834001541, "step": 559, "step_time": 14.222165137980483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2262263596057892, "epoch": 0.0028, "grad_norm": 0.0764053612947464, "kl": 0.22793479822576046, "learning_rate": 7.999974589747096e-06, "loss": -0.0734, "step": 560, "step_time": 6.881403418985428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0428338125348091, "epoch": 0.002805, "frac_reward_zero_std": 0.0, "grad_norm": 0.09462765604257584, "kl": 0.34611038863658905, "learning_rate": 7.99997449266899e-06, "loss": -0.0611, "num_tokens": 7226164.0, "reward": 0.5892612338066101, "reward_std": 1.3251943588256836, "rewards/rollout_reward_func/mean": 0.5892612338066101, "rewards/rollout_reward_func/std": 1.3251943588256836, "sampling/importance_sampling_ratio/max": 1.1215256452560425, "sampling/importance_sampling_ratio/mean": 0.7860763669013977, "sampling/importance_sampling_ratio/min": 7.329881555051543e-06, "sampling/sampling_logp_difference/max": 1.9418307542800903, "sampling/sampling_logp_difference/mean": 0.2450357973575592, "step": 561, "step_time": 10.673292818988557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0373164676129818, "epoch": 0.00281, "grad_norm": 0.0980432853102684, "kl": 0.34648851677775383, "learning_rate": 7.999974395405802e-06, "loss": -0.0609, "step": 562, "step_time": 5.237234365995391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 3.909090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1593412458896637, "epoch": 0.002815, "frac_reward_zero_std": 0.0, "grad_norm": 0.372730016708374, "kl": 1.110349859111011, "learning_rate": 7.999974297957524e-06, "loss": -0.0831, "num_tokens": 7259222.0, "reward": 0.050683680921792984, "reward_std": 1.029821515083313, "rewards/rollout_reward_func/mean": 0.050683680921792984, "rewards/rollout_reward_func/std": 1.029821515083313, "sampling/importance_sampling_ratio/max": 1.0935826301574707, "sampling/importance_sampling_ratio/mean": 0.508792519569397, "sampling/importance_sampling_ratio/min": 7.400270760626881e-07, "sampling/sampling_logp_difference/max": 2.647996425628662, "sampling/sampling_logp_difference/mean": 0.35832464694976807, "step": 563, "step_time": 13.77854489001038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.15875506401062, "epoch": 0.00282, "grad_norm": 0.23578906059265137, "kl": 0.8241990217939019, "learning_rate": 7.999974200324165e-06, "loss": -0.0849, "step": 564, "step_time": 6.9739615240105195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 4.900000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0881139636039734, "epoch": 0.002825, "frac_reward_zero_std": 0.0, "grad_norm": 0.11780417710542679, "kl": 0.33933062944561243, "learning_rate": 7.999974102505715e-06, "loss": -0.0592, "num_tokens": 7295772.0, "reward": 0.38704168796539307, "reward_std": 1.0328975915908813, "rewards/rollout_reward_func/mean": 0.38704168796539307, "rewards/rollout_reward_func/std": 1.032897710800171, "sampling/importance_sampling_ratio/max": 1.555598258972168, "sampling/importance_sampling_ratio/mean": 0.48767220973968506, "sampling/importance_sampling_ratio/min": 9.681199298938736e-05, "sampling/sampling_logp_difference/max": 1.9534900188446045, "sampling/sampling_logp_difference/mean": 0.31665241718292236, "step": 565, "step_time": 14.53728941999725 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 2.0886641144752502, "epoch": 0.00283, "grad_norm": 0.10201948881149292, "kl": 0.31103052804246545, "learning_rate": 7.999974004502184e-06, "loss": -0.0595, "step": 566, "step_time": 7.646648396010278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.439632199704647, "epoch": 0.002835, "frac_reward_zero_std": 0.0, "grad_norm": 0.09101810306310654, "kl": 1.0938887745141983, "learning_rate": 7.999973906313564e-06, "loss": -0.0939, "num_tokens": 7327074.0, "reward": 1.0149104595184326, "reward_std": 1.1319679021835327, "rewards/rollout_reward_func/mean": 1.0149104595184326, "rewards/rollout_reward_func/std": 1.1319680213928223, "sampling/importance_sampling_ratio/max": 1.2555261850357056, "sampling/importance_sampling_ratio/mean": 0.7998930215835571, "sampling/importance_sampling_ratio/min": 1.0596613009283828e-07, "sampling/sampling_logp_difference/max": 1.7639845609664917, "sampling/sampling_logp_difference/mean": 0.34828072786331177, "step": 567, "step_time": 13.174558215003344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.440519779920578, "epoch": 0.00284, "grad_norm": 0.0861617848277092, "kl": 1.0893044918775558, "learning_rate": 7.999973807939857e-06, "loss": -0.094, "step": 568, "step_time": 6.9237173489818815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.066667079925537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1023857444524765, "epoch": 0.002845, "frac_reward_zero_std": 0.5, "grad_norm": 0.055852022022008896, "kl": 0.6444133222103119, "learning_rate": 7.999973709381066e-06, "loss": -0.0306, "num_tokens": 7345738.0, "reward": 0.5606469511985779, "reward_std": 1.1947791576385498, "rewards/rollout_reward_func/mean": 0.5606469511985779, "rewards/rollout_reward_func/std": 1.1947791576385498, "sampling/importance_sampling_ratio/max": 1.1258000135421753, "sampling/importance_sampling_ratio/mean": 0.8576456308364868, "sampling/importance_sampling_ratio/min": 0.000212277373066172, "sampling/sampling_logp_difference/max": 1.810342788696289, "sampling/sampling_logp_difference/mean": 0.1816839873790741, "step": 569, "step_time": 7.99256910099939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.099019531160593, "epoch": 0.00285, "grad_norm": 0.051866065710783005, "kl": 0.6222297959029675, "learning_rate": 7.99997361063719e-06, "loss": -0.0307, "step": 570, "step_time": 4.1151251240225974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.133333683013916, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6452251821756363, "epoch": 0.002855, "frac_reward_zero_std": 0.0, "grad_norm": 0.25557130575180054, "kl": 1.0774343684315681, "learning_rate": 7.999973511708227e-06, "loss": -0.0534, "num_tokens": 7377047.0, "reward": 0.9602388143539429, "reward_std": 1.1585040092468262, "rewards/rollout_reward_func/mean": 0.9602388143539429, "rewards/rollout_reward_func/std": 1.1585040092468262, "sampling/importance_sampling_ratio/max": 1.292405366897583, "sampling/importance_sampling_ratio/mean": 0.9333553314208984, "sampling/importance_sampling_ratio/min": 0.007994378916919231, "sampling/sampling_logp_difference/max": 1.0465881824493408, "sampling/sampling_logp_difference/mean": 0.11768674850463867, "step": 571, "step_time": 13.071678139982396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6406066864728928, "epoch": 0.00286, "grad_norm": 0.2477649599313736, "kl": 1.1289393529295921, "learning_rate": 7.999973412594178e-06, "loss": -0.0546, "step": 572, "step_time": 6.854776240026695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4197988472878933, "epoch": 0.002865, "frac_reward_zero_std": 0.0, "grad_norm": 0.046671297401189804, "kl": 0.6251085363328457, "learning_rate": 7.999973313295043e-06, "loss": -0.0377, "num_tokens": 7402611.0, "reward": 0.02319967746734619, "reward_std": 0.9126076698303223, "rewards/rollout_reward_func/mean": 0.02319967746734619, "rewards/rollout_reward_func/std": 0.912607729434967, "sampling/importance_sampling_ratio/max": 1.1496442556381226, "sampling/importance_sampling_ratio/mean": 0.9368122816085815, "sampling/importance_sampling_ratio/min": 0.007412992883473635, "sampling/sampling_logp_difference/max": 1.1753761768341064, "sampling/sampling_logp_difference/mean": 0.07949604839086533, "step": 573, "step_time": 12.37067415099591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41666240990161896, "epoch": 0.00287, "grad_norm": 0.045264121145009995, "kl": 0.6255751438438892, "learning_rate": 7.999973213810824e-06, "loss": -0.0378, "step": 574, "step_time": 6.884150551995845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.120919182896614, "epoch": 0.002875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21992789208889008, "kl": 0.25591352209448814, "learning_rate": 7.999973114141517e-06, "loss": -0.0656, "num_tokens": 7436451.0, "reward": 0.3962659239768982, "reward_std": 1.0648725032806396, "rewards/rollout_reward_func/mean": 0.3962659239768982, "rewards/rollout_reward_func/std": 1.0648725032806396, "sampling/importance_sampling_ratio/max": 1.1969655752182007, "sampling/importance_sampling_ratio/mean": 0.7634780406951904, "sampling/importance_sampling_ratio/min": 5.377612978918478e-05, "sampling/sampling_logp_difference/max": 1.3405537605285645, "sampling/sampling_logp_difference/mean": 0.23948943614959717, "step": 575, "step_time": 14.320199141002377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.123553603887558, "epoch": 0.00288, "grad_norm": 0.2270326465368271, "kl": 0.24986283853650093, "learning_rate": 7.999973014287126e-06, "loss": -0.0669, "step": 576, "step_time": 7.265593567994074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8027750104665756, "epoch": 0.002885, "frac_reward_zero_std": 0.0, "grad_norm": 0.3118148744106293, "kl": 1.9050067514181137, "learning_rate": 7.999972914247647e-06, "loss": -0.028, "num_tokens": 7468410.0, "reward": 1.056498408317566, "reward_std": 1.1610058546066284, "rewards/rollout_reward_func/mean": 1.056498408317566, "rewards/rollout_reward_func/std": 1.1610058546066284, "sampling/importance_sampling_ratio/max": 1.2091286182403564, "sampling/importance_sampling_ratio/mean": 0.9543558359146118, "sampling/importance_sampling_ratio/min": 0.0006148869870230556, "sampling/sampling_logp_difference/max": 1.5559868812561035, "sampling/sampling_logp_difference/mean": 0.16228672862052917, "step": 577, "step_time": 12.27163124200888 }, { "clip_ratio/high_max": 0.01923076994717121, "clip_ratio/high_mean": 0.009615384973585606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "entropy": 0.8104372844099998, "epoch": 0.00289, "grad_norm": 0.22818219661712646, "kl": 1.4574743211269379, "learning_rate": 7.999972814023084e-06, "loss": -0.0299, "step": 578, "step_time": 6.359149153009639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9450837895274162, "epoch": 0.002895, "frac_reward_zero_std": 0.0, "grad_norm": 0.11545483767986298, "kl": 1.4302897527813911, "learning_rate": 7.999972713613435e-06, "loss": -0.0421, "num_tokens": 7495947.0, "reward": -0.22770234942436218, "reward_std": 0.792783260345459, "rewards/rollout_reward_func/mean": -0.22770234942436218, "rewards/rollout_reward_func/std": 0.7927832007408142, "sampling/importance_sampling_ratio/max": 1.296026349067688, "sampling/importance_sampling_ratio/mean": 0.84917813539505, "sampling/importance_sampling_ratio/min": 0.0012279024813324213, "sampling/sampling_logp_difference/max": 1.7581334114074707, "sampling/sampling_logp_difference/mean": 0.18322531878948212, "step": 579, "step_time": 13.048535647016251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9574760049581528, "epoch": 0.0029, "grad_norm": 0.10922141373157501, "kl": 1.4389440268278122, "learning_rate": 7.9999726130187e-06, "loss": -0.0427, "step": 580, "step_time": 7.434975627998938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 4.083333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4527186304330826, "epoch": 0.002905, "frac_reward_zero_std": 0.0, "grad_norm": 0.07686668634414673, "kl": 0.6207643300294876, "learning_rate": 7.999972512238878e-06, "loss": -0.0957, "num_tokens": 7520509.0, "reward": 1.2147923707962036, "reward_std": 1.2051934003829956, "rewards/rollout_reward_func/mean": 1.2147923707962036, "rewards/rollout_reward_func/std": 1.2051934003829956, "sampling/importance_sampling_ratio/max": 1.1142287254333496, "sampling/importance_sampling_ratio/mean": 0.7502732872962952, "sampling/importance_sampling_ratio/min": 7.907964754849672e-05, "sampling/sampling_logp_difference/max": 1.713349461555481, "sampling/sampling_logp_difference/mean": 0.2336200475692749, "step": 581, "step_time": 10.723712047984009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4608769416809082, "epoch": 0.00291, "grad_norm": 0.07449781149625778, "kl": 0.5683820052072406, "learning_rate": 7.999972411273972e-06, "loss": -0.0958, "step": 582, "step_time": 5.3591256299987435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6430868804454803, "epoch": 0.002915, "frac_reward_zero_std": 0.0, "grad_norm": 0.041105519980192184, "kl": 0.3835562542080879, "learning_rate": 7.99997231012398e-06, "loss": -0.0575, "num_tokens": 7547367.0, "reward": 0.5720924735069275, "reward_std": 1.2036501169204712, "rewards/rollout_reward_func/mean": 0.5720924735069275, "rewards/rollout_reward_func/std": 1.2036501169204712, "sampling/importance_sampling_ratio/max": 1.2910032272338867, "sampling/importance_sampling_ratio/mean": 0.867390513420105, "sampling/importance_sampling_ratio/min": 0.010593855753540993, "sampling/sampling_logp_difference/max": 0.9707610607147217, "sampling/sampling_logp_difference/mean": 0.11983546614646912, "step": 583, "step_time": 11.048365877999458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6478417590260506, "epoch": 0.00292, "grad_norm": 0.042873796075582504, "kl": 0.3717106655240059, "learning_rate": 7.9999722087889e-06, "loss": -0.0574, "step": 584, "step_time": 5.8478994719916955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.774367693811655, "epoch": 0.002925, "frac_reward_zero_std": 0.5, "grad_norm": 0.08826848119497299, "kl": 1.599440447986126, "learning_rate": 7.999972107268737e-06, "loss": -0.0112, "num_tokens": 7570496.0, "reward": 1.6480036973953247, "reward_std": 0.8372569680213928, "rewards/rollout_reward_func/mean": 1.6480036973953247, "rewards/rollout_reward_func/std": 0.8372570276260376, "sampling/importance_sampling_ratio/max": 1.1079983711242676, "sampling/importance_sampling_ratio/mean": 0.9341685771942139, "sampling/importance_sampling_ratio/min": 2.6107485950888076e-07, "sampling/sampling_logp_difference/max": 2.516174793243408, "sampling/sampling_logp_difference/mean": 0.20692189037799835, "step": 585, "step_time": 9.622167662979336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7846456561237574, "epoch": 0.00293, "grad_norm": 0.08170156925916672, "kl": 1.577114075422287, "learning_rate": 7.999972005563487e-06, "loss": -0.0113, "step": 586, "step_time": 5.433561293990351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.928208976984024, "epoch": 0.002935, "frac_reward_zero_std": 0.0, "grad_norm": 0.033761583268642426, "kl": 0.28746863827109337, "learning_rate": 7.999971903673153e-06, "loss": -0.085, "num_tokens": 7597402.0, "reward": 1.2192575931549072, "reward_std": 1.065259575843811, "rewards/rollout_reward_func/mean": 1.2192575931549072, "rewards/rollout_reward_func/std": 1.065259575843811, "sampling/importance_sampling_ratio/max": 1.1692590713500977, "sampling/importance_sampling_ratio/mean": 0.8224615454673767, "sampling/importance_sampling_ratio/min": 0.0021419941913336515, "sampling/sampling_logp_difference/max": 1.0242772102355957, "sampling/sampling_logp_difference/mean": 0.18412987887859344, "step": 587, "step_time": 13.605416790014715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9342133924365044, "epoch": 0.00294, "grad_norm": 0.03423798456788063, "kl": 0.286456486210227, "learning_rate": 7.999971801597731e-06, "loss": -0.0849, "step": 588, "step_time": 6.669908685027622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.759412869811058, "epoch": 0.002945, "frac_reward_zero_std": 0.0, "grad_norm": 0.16661222279071808, "kl": 0.1344340918585658, "learning_rate": 7.999971699337224e-06, "loss": -0.0771, "num_tokens": 7623183.0, "reward": 0.48548704385757446, "reward_std": 1.2696549892425537, "rewards/rollout_reward_func/mean": 0.48548704385757446, "rewards/rollout_reward_func/std": 1.2696551084518433, "sampling/importance_sampling_ratio/max": 1.1690788269042969, "sampling/importance_sampling_ratio/mean": 0.7265276908874512, "sampling/importance_sampling_ratio/min": 3.6037195968674496e-05, "sampling/sampling_logp_difference/max": 1.6320674419403076, "sampling/sampling_logp_difference/mean": 0.26495033502578735, "step": 589, "step_time": 10.68624763302796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7634337916970253, "epoch": 0.00295, "grad_norm": 0.17497368156909943, "kl": 0.1344394087791443, "learning_rate": 7.99997159689163e-06, "loss": -0.0769, "step": 590, "step_time": 5.8162177380145295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1634885519742966, "epoch": 0.002955, "frac_reward_zero_std": 0.0, "grad_norm": 0.09644489735364914, "kl": 0.3279454819858074, "learning_rate": 7.999971494260952e-06, "loss": -0.0676, "num_tokens": 7648039.0, "reward": 1.1558268070220947, "reward_std": 1.2168500423431396, "rewards/rollout_reward_func/mean": 1.1558268070220947, "rewards/rollout_reward_func/std": 1.2168501615524292, "sampling/importance_sampling_ratio/max": 1.0834364891052246, "sampling/importance_sampling_ratio/mean": 0.8179597854614258, "sampling/importance_sampling_ratio/min": 6.148307147668675e-05, "sampling/sampling_logp_difference/max": 1.8616445064544678, "sampling/sampling_logp_difference/mean": 0.2154342532157898, "step": 591, "step_time": 11.930807073993492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1630998328328133, "epoch": 0.00296, "grad_norm": 0.10207051783800125, "kl": 0.3361594118177891, "learning_rate": 7.999971391445188e-06, "loss": -0.0676, "step": 592, "step_time": 5.9383123490115395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1288914307951927, "epoch": 0.002965, "frac_reward_zero_std": 0.0, "grad_norm": 0.07078593224287033, "kl": 0.3780316561460495, "learning_rate": 7.999971288444338e-06, "loss": -0.0978, "num_tokens": 7675757.0, "reward": 1.1787941455841064, "reward_std": 0.9751291871070862, "rewards/rollout_reward_func/mean": 1.1787941455841064, "rewards/rollout_reward_func/std": 0.975129246711731, "sampling/importance_sampling_ratio/max": 1.218291997909546, "sampling/importance_sampling_ratio/mean": 0.8084417581558228, "sampling/importance_sampling_ratio/min": 0.0011607223423197865, "sampling/sampling_logp_difference/max": 1.3869056701660156, "sampling/sampling_logp_difference/mean": 0.2117689847946167, "step": 593, "step_time": 12.055310902986093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1286385133862495, "epoch": 0.00297, "grad_norm": 0.07774083316326141, "kl": 0.37156781554222107, "learning_rate": 7.999971185258403e-06, "loss": -0.0978, "step": 594, "step_time": 6.085584882996045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0306786000728607, "epoch": 0.002975, "frac_reward_zero_std": 0.0, "grad_norm": 0.09768976271152496, "kl": 0.6270739734172821, "learning_rate": 7.999971081887381e-06, "loss": -0.0276, "num_tokens": 7708833.0, "reward": 0.4868485629558563, "reward_std": 1.1232578754425049, "rewards/rollout_reward_func/mean": 0.4868485629558563, "rewards/rollout_reward_func/std": 1.1232578754425049, "sampling/importance_sampling_ratio/max": 1.200019121170044, "sampling/importance_sampling_ratio/mean": 0.728137731552124, "sampling/importance_sampling_ratio/min": 0.0010255996603518724, "sampling/sampling_logp_difference/max": 1.892013669013977, "sampling/sampling_logp_difference/mean": 0.19889479875564575, "step": 595, "step_time": 12.947066198015818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0271630585193634, "epoch": 0.00298, "grad_norm": 0.09621676802635193, "kl": 0.5985138677060604, "learning_rate": 7.999970978331275e-06, "loss": -0.0278, "step": 596, "step_time": 7.48702955301269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6969862580299377, "epoch": 0.002985, "frac_reward_zero_std": 0.0, "grad_norm": 0.08829691261053085, "kl": 0.2890935391187668, "learning_rate": 7.999970874590083e-06, "loss": -0.044, "num_tokens": 7738915.0, "reward": 1.3020281791687012, "reward_std": 1.0792288780212402, "rewards/rollout_reward_func/mean": 1.3020281791687012, "rewards/rollout_reward_func/std": 1.0792288780212402, "sampling/importance_sampling_ratio/max": 1.181075096130371, "sampling/importance_sampling_ratio/mean": 0.9610468745231628, "sampling/importance_sampling_ratio/min": 0.004317207261919975, "sampling/sampling_logp_difference/max": 1.070202112197876, "sampling/sampling_logp_difference/mean": 0.10120295733213425, "step": 597, "step_time": 12.84756881701469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6974169760942459, "epoch": 0.00299, "grad_norm": 0.09025727957487106, "kl": 0.28879766166210175, "learning_rate": 7.999970770663804e-06, "loss": -0.0442, "step": 598, "step_time": 6.752201774987043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0567783936858177, "epoch": 0.002995, "frac_reward_zero_std": 0.0, "grad_norm": 0.14920176565647125, "kl": 0.43336155638098717, "learning_rate": 7.99997066655244e-06, "loss": -0.0623, "num_tokens": 7771637.0, "reward": 0.9931944608688354, "reward_std": 1.155545711517334, "rewards/rollout_reward_func/mean": 0.9931944608688354, "rewards/rollout_reward_func/std": 1.155545711517334, "sampling/importance_sampling_ratio/max": 1.2681330442428589, "sampling/importance_sampling_ratio/mean": 0.8060785531997681, "sampling/importance_sampling_ratio/min": 2.2956474765578605e-07, "sampling/sampling_logp_difference/max": 2.1234920024871826, "sampling/sampling_logp_difference/mean": 0.2483888864517212, "step": 599, "step_time": 13.27528361500299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0596920922398567, "epoch": 0.003, "grad_norm": 0.14197421073913574, "kl": 0.43085479736328125, "learning_rate": 7.99997056225599e-06, "loss": -0.0626, "step": 600, "step_time": 7.345339098988916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.1875, "completions/mean_terminated_length": 4.1875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.24279417842626572, "epoch": 0.003005, "frac_reward_zero_std": 0.5, "grad_norm": 0.11452368646860123, "kl": 0.34851638227701187, "learning_rate": 7.999970457774456e-06, "loss": -0.0116, "num_tokens": 7789744.0, "reward": 1.601231336593628, "reward_std": 0.3793242573738098, "rewards/rollout_reward_func/mean": 1.601231336593628, "rewards/rollout_reward_func/std": 0.3793242573738098, "sampling/importance_sampling_ratio/max": 1.0808528661727905, "sampling/importance_sampling_ratio/mean": 1.0023183822631836, "sampling/importance_sampling_ratio/min": 0.4114828109741211, "sampling/sampling_logp_difference/max": 0.6674091815948486, "sampling/sampling_logp_difference/mean": 0.02546972781419754, "step": 601, "step_time": 6.10752170199703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24350453913211823, "epoch": 0.00301, "grad_norm": 0.10906686633825302, "kl": 0.3528664782643318, "learning_rate": 7.999970353107835e-06, "loss": -0.012, "step": 602, "step_time": 3.3802360050176503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.940971627831459, "epoch": 0.003015, "frac_reward_zero_std": 0.0, "grad_norm": 0.10268208384513855, "kl": 0.5772782042622566, "learning_rate": 7.999970248256127e-06, "loss": -0.0708, "num_tokens": 7814344.0, "reward": 1.1277778148651123, "reward_std": 1.2234910726547241, "rewards/rollout_reward_func/mean": 1.1277778148651123, "rewards/rollout_reward_func/std": 1.2234910726547241, "sampling/importance_sampling_ratio/max": 1.165334701538086, "sampling/importance_sampling_ratio/mean": 0.8773216009140015, "sampling/importance_sampling_ratio/min": 2.772597781586228e-06, "sampling/sampling_logp_difference/max": 1.6387577056884766, "sampling/sampling_logp_difference/mean": 0.230695441365242, "step": 603, "step_time": 9.455377796999528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9380868896842003, "epoch": 0.00302, "grad_norm": 0.10480968654155731, "kl": 0.6020269580185413, "learning_rate": 7.999970143219335e-06, "loss": -0.0706, "step": 604, "step_time": 5.0368095509911655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.076923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3063066862523556, "epoch": 0.003025, "frac_reward_zero_std": 0.0, "grad_norm": 0.07003863900899887, "kl": 0.7926306929439306, "learning_rate": 7.999970037997458e-06, "loss": -0.0725, "num_tokens": 7843837.0, "reward": 1.6876345872879028, "reward_std": 0.5846492052078247, "rewards/rollout_reward_func/mean": 1.6876345872879028, "rewards/rollout_reward_func/std": 0.5846492648124695, "sampling/importance_sampling_ratio/max": 1.1251583099365234, "sampling/importance_sampling_ratio/mean": 0.8307024240493774, "sampling/importance_sampling_ratio/min": 1.7398625686837477e-06, "sampling/sampling_logp_difference/max": 2.4796361923217773, "sampling/sampling_logp_difference/mean": 0.32509711384773254, "step": 605, "step_time": 12.431214500000351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3034148011356592, "epoch": 0.00303, "grad_norm": 0.06759697943925858, "kl": 0.7593287341296673, "learning_rate": 7.999969932590495e-06, "loss": -0.0726, "step": 606, "step_time": 7.146108667002409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5209528654813766, "epoch": 0.003035, "frac_reward_zero_std": 0.0, "grad_norm": 0.15430819988250732, "kl": 0.6660077348351479, "learning_rate": 7.999969826998444e-06, "loss": -0.0808, "num_tokens": 7866314.0, "reward": 1.0919899940490723, "reward_std": 1.2348302602767944, "rewards/rollout_reward_func/mean": 1.0919899940490723, "rewards/rollout_reward_func/std": 1.234830379486084, "sampling/importance_sampling_ratio/max": 1.3204076290130615, "sampling/importance_sampling_ratio/mean": 0.6621419191360474, "sampling/importance_sampling_ratio/min": 7.706091054160424e-08, "sampling/sampling_logp_difference/max": 1.9857847690582275, "sampling/sampling_logp_difference/mean": 0.2715228796005249, "step": 607, "step_time": 11.022798973004683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5172456726431847, "epoch": 0.00304, "grad_norm": 0.15466316044330597, "kl": 0.6675233095884323, "learning_rate": 7.99996972122131e-06, "loss": -0.0812, "step": 608, "step_time": 5.246559166989755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 5.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.158256210386753, "epoch": 0.003045, "frac_reward_zero_std": 0.0, "grad_norm": 0.11563704162836075, "kl": 0.397519838064909, "learning_rate": 7.99996961525909e-06, "loss": -0.0301, "num_tokens": 7896628.0, "reward": 1.2369136810302734, "reward_std": 1.039209246635437, "rewards/rollout_reward_func/mean": 1.2369136810302734, "rewards/rollout_reward_func/std": 1.039209246635437, "sampling/importance_sampling_ratio/max": 1.177157998085022, "sampling/importance_sampling_ratio/mean": 0.7270805835723877, "sampling/importance_sampling_ratio/min": 0.00010856314474949613, "sampling/sampling_logp_difference/max": 1.6997401714324951, "sampling/sampling_logp_difference/mean": 0.2106868326663971, "step": 609, "step_time": 12.225288701985846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1546684876084328, "epoch": 0.00305, "grad_norm": 0.11353807896375656, "kl": 0.40095319226384163, "learning_rate": 7.999969509111784e-06, "loss": -0.0297, "step": 610, "step_time": 6.874960482004099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9650688767433167, "epoch": 0.003055, "frac_reward_zero_std": 0.0, "grad_norm": 0.16627389192581177, "kl": 0.3023144514299929, "learning_rate": 7.999969402779394e-06, "loss": -0.0618, "num_tokens": 7925919.0, "reward": 0.7064187526702881, "reward_std": 1.230589747428894, "rewards/rollout_reward_func/mean": 0.7064187526702881, "rewards/rollout_reward_func/std": 1.230589747428894, "sampling/importance_sampling_ratio/max": 1.165054440498352, "sampling/importance_sampling_ratio/mean": 0.5079221129417419, "sampling/importance_sampling_ratio/min": 0.00042701727943494916, "sampling/sampling_logp_difference/max": 1.7643487453460693, "sampling/sampling_logp_difference/mean": 0.277123361825943, "step": 611, "step_time": 11.665569579985458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9774118959903717, "epoch": 0.00306, "grad_norm": 0.1735275238752365, "kl": 0.2794221490621567, "learning_rate": 7.999969296261916e-06, "loss": -0.0621, "step": 612, "step_time": 5.620253686982323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.18782389909029, "epoch": 0.003065, "frac_reward_zero_std": 0.0, "grad_norm": 0.058505214750766754, "kl": 0.4057614281773567, "learning_rate": 7.999969189559353e-06, "loss": -0.0817, "num_tokens": 7942619.0, "reward": 1.3745437860488892, "reward_std": 1.1494848728179932, "rewards/rollout_reward_func/mean": 1.3745437860488892, "rewards/rollout_reward_func/std": 1.1494847536087036, "sampling/importance_sampling_ratio/max": 1.0784995555877686, "sampling/importance_sampling_ratio/mean": 0.6877830624580383, "sampling/importance_sampling_ratio/min": 2.2746893591829576e-05, "sampling/sampling_logp_difference/max": 1.8248234987258911, "sampling/sampling_logp_difference/mean": 0.26386386156082153, "step": 613, "step_time": 7.488155440005357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1857228018343449, "epoch": 0.00307, "grad_norm": 0.054744821041822433, "kl": 0.3853079676628113, "learning_rate": 7.999969082671705e-06, "loss": -0.0819, "step": 614, "step_time": 3.833400662013446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.232544906437397, "epoch": 0.003075, "frac_reward_zero_std": 0.0, "grad_norm": 0.08005765080451965, "kl": 0.20702853426337242, "learning_rate": 7.999968975598971e-06, "loss": -0.0888, "num_tokens": 7970644.0, "reward": 1.0488135814666748, "reward_std": 1.2104743719100952, "rewards/rollout_reward_func/mean": 1.0488135814666748, "rewards/rollout_reward_func/std": 1.2104744911193848, "sampling/importance_sampling_ratio/max": 1.223158836364746, "sampling/importance_sampling_ratio/mean": 0.827864408493042, "sampling/importance_sampling_ratio/min": 6.084679262130521e-05, "sampling/sampling_logp_difference/max": 1.5984220504760742, "sampling/sampling_logp_difference/mean": 0.2627548575401306, "step": 615, "step_time": 11.008415755015449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2340878136456013, "epoch": 0.00308, "grad_norm": 0.0853394940495491, "kl": 0.2069066558033228, "learning_rate": 7.99996886834115e-06, "loss": -0.0889, "step": 616, "step_time": 5.721250059999875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3001045808196068, "epoch": 0.003085, "frac_reward_zero_std": 0.0, "grad_norm": 0.07200760394334793, "kl": 0.4308344488963485, "learning_rate": 7.999968760898247e-06, "loss": -0.0798, "num_tokens": 7993928.0, "reward": 1.2120046615600586, "reward_std": 1.160820484161377, "rewards/rollout_reward_func/mean": 1.2120046615600586, "rewards/rollout_reward_func/std": 1.1608206033706665, "sampling/importance_sampling_ratio/max": 1.0744322538375854, "sampling/importance_sampling_ratio/mean": 0.7129640579223633, "sampling/importance_sampling_ratio/min": 3.825836392934434e-05, "sampling/sampling_logp_difference/max": 1.6640459299087524, "sampling/sampling_logp_difference/mean": 0.26804986596107483, "step": 617, "step_time": 11.961329019002733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2987676560878754, "epoch": 0.00309, "grad_norm": 0.07215535640716553, "kl": 0.4398791240528226, "learning_rate": 7.999968653270258e-06, "loss": -0.0797, "step": 618, "step_time": 6.019929997011786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9460041895508766, "epoch": 0.003095, "frac_reward_zero_std": 0.0, "grad_norm": 0.07650470733642578, "kl": 0.2616443298757076, "learning_rate": 7.99996854545718e-06, "loss": -0.082, "num_tokens": 8018435.0, "reward": 1.363525390625, "reward_std": 1.097888708114624, "rewards/rollout_reward_func/mean": 1.363525390625, "rewards/rollout_reward_func/std": 1.097888708114624, "sampling/importance_sampling_ratio/max": 1.1410449743270874, "sampling/importance_sampling_ratio/mean": 0.8405588865280151, "sampling/importance_sampling_ratio/min": 0.0022155384067445993, "sampling/sampling_logp_difference/max": 1.291940689086914, "sampling/sampling_logp_difference/mean": 0.16306884586811066, "step": 619, "step_time": 12.174032609982532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9453967064619064, "epoch": 0.0031, "grad_norm": 0.07480785995721817, "kl": 0.254947517067194, "learning_rate": 7.99996843745902e-06, "loss": -0.0823, "step": 620, "step_time": 6.110954411997227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8167528882622719, "epoch": 0.003105, "frac_reward_zero_std": 0.0, "grad_norm": 0.04107770696282387, "kl": 0.4492645785212517, "learning_rate": 7.999968329275773e-06, "loss": -0.0832, "num_tokens": 8041653.0, "reward": 1.2181602716445923, "reward_std": 1.2001397609710693, "rewards/rollout_reward_func/mean": 1.2181602716445923, "rewards/rollout_reward_func/std": 1.2001397609710693, "sampling/importance_sampling_ratio/max": 1.1347726583480835, "sampling/importance_sampling_ratio/mean": 0.8449840545654297, "sampling/importance_sampling_ratio/min": 0.0007025998784229159, "sampling/sampling_logp_difference/max": 1.3310179710388184, "sampling/sampling_logp_difference/mean": 0.16416898369789124, "step": 621, "step_time": 10.71594071900472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8154686987400055, "epoch": 0.00311, "grad_norm": 0.040784016251564026, "kl": 0.4578767828643322, "learning_rate": 7.99996822090744e-06, "loss": -0.0832, "step": 622, "step_time": 5.129631510004401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2046631425619125, "epoch": 0.003115, "frac_reward_zero_std": 0.5, "grad_norm": 0.0378090962767601, "kl": 0.46106475219130516, "learning_rate": 7.999968112354022e-06, "loss": -0.0579, "num_tokens": 8064893.0, "reward": -0.12030899524688721, "reward_std": 1.2227882146835327, "rewards/rollout_reward_func/mean": -0.12030899524688721, "rewards/rollout_reward_func/std": 1.2227882146835327, "sampling/importance_sampling_ratio/max": 1.1881309747695923, "sampling/importance_sampling_ratio/mean": 0.8050532341003418, "sampling/importance_sampling_ratio/min": 0.00013183329429011792, "sampling/sampling_logp_difference/max": 1.344462275505066, "sampling/sampling_logp_difference/mean": 0.19940002262592316, "step": 623, "step_time": 10.227961341995979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2008501589298248, "epoch": 0.00312, "grad_norm": 0.03783322498202324, "kl": 0.4610161744058132, "learning_rate": 7.999968003615517e-06, "loss": -0.0579, "step": 624, "step_time": 4.890624692998244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 3.857142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1188516840338707, "epoch": 0.003125, "frac_reward_zero_std": 0.0, "grad_norm": 0.10506846010684967, "kl": 0.6822447590529919, "learning_rate": 7.99996789469193e-06, "loss": -0.0506, "num_tokens": 8097134.0, "reward": 0.03031770884990692, "reward_std": 1.0129671096801758, "rewards/rollout_reward_func/mean": 0.03031770884990692, "rewards/rollout_reward_func/std": 1.0129671096801758, "sampling/importance_sampling_ratio/max": 1.3485076427459717, "sampling/importance_sampling_ratio/mean": 0.8004574179649353, "sampling/importance_sampling_ratio/min": 0.00788471195846796, "sampling/sampling_logp_difference/max": 1.6473352909088135, "sampling/sampling_logp_difference/mean": 0.18958118557929993, "step": 625, "step_time": 12.505302316014422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1205252930521965, "epoch": 0.00313, "grad_norm": 0.10557819902896881, "kl": 0.6754598096013069, "learning_rate": 7.999967785583254e-06, "loss": -0.0507, "step": 626, "step_time": 6.520474764984101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3173319697380066, "epoch": 0.003135, "frac_reward_zero_std": 0.0, "grad_norm": 0.08549331873655319, "kl": 1.5083216279745102, "learning_rate": 7.999967676289494e-06, "loss": -0.0521, "num_tokens": 8125626.0, "reward": 0.6267895102500916, "reward_std": 1.318513035774231, "rewards/rollout_reward_func/mean": 0.6267895102500916, "rewards/rollout_reward_func/std": 1.318513035774231, "sampling/importance_sampling_ratio/max": 1.1993497610092163, "sampling/importance_sampling_ratio/mean": 0.7391185760498047, "sampling/importance_sampling_ratio/min": 4.961427839589305e-05, "sampling/sampling_logp_difference/max": 1.7320544719696045, "sampling/sampling_logp_difference/mean": 0.2315838634967804, "step": 627, "step_time": 11.330523190001259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3195770904421806, "epoch": 0.00314, "grad_norm": 0.07163302600383759, "kl": 1.3513161316514015, "learning_rate": 7.99996756681065e-06, "loss": -0.0524, "step": 628, "step_time": 5.265857318998314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.293894499540329, "epoch": 0.003145, "frac_reward_zero_std": 0.0, "grad_norm": 0.15790081024169922, "kl": 0.255712665617466, "learning_rate": 7.999967457146718e-06, "loss": -0.0571, "num_tokens": 8152384.0, "reward": 0.5378983020782471, "reward_std": 1.3092552423477173, "rewards/rollout_reward_func/mean": 0.5378983020782471, "rewards/rollout_reward_func/std": 1.3092552423477173, "sampling/importance_sampling_ratio/max": 1.1891013383865356, "sampling/importance_sampling_ratio/mean": 0.7885023951530457, "sampling/importance_sampling_ratio/min": 0.0011245451169088483, "sampling/sampling_logp_difference/max": 1.1173988580703735, "sampling/sampling_logp_difference/mean": 0.19076694548130035, "step": 629, "step_time": 12.225040523990174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.297120988368988, "epoch": 0.00315, "grad_norm": 0.15182843804359436, "kl": 0.25024697184562683, "learning_rate": 7.9999673472977e-06, "loss": -0.0575, "step": 630, "step_time": 6.251432549979654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.2733522318303585, "epoch": 0.003155, "frac_reward_zero_std": 0.0, "grad_norm": 0.18169119954109192, "kl": 0.29508325457572937, "learning_rate": 7.999967237263601e-06, "loss": -0.0058, "num_tokens": 8181831.0, "reward": 1.5785561800003052, "reward_std": 0.8370643258094788, "rewards/rollout_reward_func/mean": 1.5785561800003052, "rewards/rollout_reward_func/std": 0.837064266204834, "sampling/importance_sampling_ratio/max": 1.2404589653015137, "sampling/importance_sampling_ratio/mean": 1.0608758926391602, "sampling/importance_sampling_ratio/min": 0.7972386479377747, "sampling/sampling_logp_difference/max": 0.17888976633548737, "sampling/sampling_logp_difference/mean": 0.02654936909675598, "step": 631, "step_time": 12.22760189899418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27892705611884594, "epoch": 0.00316, "grad_norm": 0.17463809251785278, "kl": 0.29380738735198975, "learning_rate": 7.999967127044413e-06, "loss": -0.0073, "step": 632, "step_time": 6.3141199840029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9710680097341537, "epoch": 0.003165, "frac_reward_zero_std": 0.0, "grad_norm": 0.2853078544139862, "kl": 0.4117146022617817, "learning_rate": 7.99996701664014e-06, "loss": -0.0855, "num_tokens": 8210739.0, "reward": 0.5415703058242798, "reward_std": 1.3529207706451416, "rewards/rollout_reward_func/mean": 0.5415703058242798, "rewards/rollout_reward_func/std": 1.3529208898544312, "sampling/importance_sampling_ratio/max": 1.1507370471954346, "sampling/importance_sampling_ratio/mean": 0.5871351957321167, "sampling/importance_sampling_ratio/min": 3.597649822495441e-08, "sampling/sampling_logp_difference/max": 2.1348421573638916, "sampling/sampling_logp_difference/mean": 0.40287071466445923, "step": 633, "step_time": 13.406540390002192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.973513439297676, "epoch": 0.00317, "grad_norm": 0.28136295080184937, "kl": 0.412059273570776, "learning_rate": 7.999966906050781e-06, "loss": -0.0856, "step": 634, "step_time": 6.825514730007853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.821609616279602, "epoch": 0.003175, "frac_reward_zero_std": 0.0, "grad_norm": 0.1429181545972824, "kl": 0.6356527879834175, "learning_rate": 7.999966795276339e-06, "loss": -0.0712, "num_tokens": 8239081.0, "reward": 0.6151169538497925, "reward_std": 1.3334541320800781, "rewards/rollout_reward_func/mean": 0.6151169538497925, "rewards/rollout_reward_func/std": 1.3334541320800781, "sampling/importance_sampling_ratio/max": 1.166922688484192, "sampling/importance_sampling_ratio/mean": 0.6668628454208374, "sampling/importance_sampling_ratio/min": 7.333251687668962e-06, "sampling/sampling_logp_difference/max": 2.154344320297241, "sampling/sampling_logp_difference/mean": 0.31440746784210205, "step": 635, "step_time": 11.833569801019621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8253802061080933, "epoch": 0.00318, "grad_norm": 0.1368231475353241, "kl": 0.6365387588739395, "learning_rate": 7.999966684316808e-06, "loss": -0.0714, "step": 636, "step_time": 6.08428171100968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.273575872182846, "epoch": 0.003185, "frac_reward_zero_std": 0.0, "grad_norm": 0.08393744379281998, "kl": 0.3443710133433342, "learning_rate": 7.999966573172194e-06, "loss": -0.0753, "num_tokens": 8263448.0, "reward": 0.1820867657661438, "reward_std": 1.115820050239563, "rewards/rollout_reward_func/mean": 0.1820867657661438, "rewards/rollout_reward_func/std": 1.1158201694488525, "sampling/importance_sampling_ratio/max": 1.1597768068313599, "sampling/importance_sampling_ratio/mean": 0.8453497290611267, "sampling/importance_sampling_ratio/min": 9.802476415643468e-05, "sampling/sampling_logp_difference/max": 1.7266476154327393, "sampling/sampling_logp_difference/mean": 0.23582890629768372, "step": 637, "step_time": 10.72083479000139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2666711956262589, "epoch": 0.00319, "grad_norm": 0.07899610698223114, "kl": 0.3448648862540722, "learning_rate": 7.999966461842494e-06, "loss": -0.0755, "step": 638, "step_time": 5.173304088020814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4020227864384651, "epoch": 0.003195, "frac_reward_zero_std": 0.0, "grad_norm": 0.07717075198888779, "kl": 0.36743219569325447, "learning_rate": 7.999966350327711e-06, "loss": -0.0639, "num_tokens": 8285919.0, "reward": 0.8703616261482239, "reward_std": 1.3249512910842896, "rewards/rollout_reward_func/mean": 0.8703616261482239, "rewards/rollout_reward_func/std": 1.324951410293579, "sampling/importance_sampling_ratio/max": 1.168449878692627, "sampling/importance_sampling_ratio/mean": 0.8183956146240234, "sampling/importance_sampling_ratio/min": 5.2377455261876094e-08, "sampling/sampling_logp_difference/max": 2.0439963340759277, "sampling/sampling_logp_difference/mean": 0.32177257537841797, "step": 639, "step_time": 10.452070371989976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3965782523155212, "epoch": 0.0032, "grad_norm": 0.07447472214698792, "kl": 0.3797490522265434, "learning_rate": 7.99996623862784e-06, "loss": -0.0636, "step": 640, "step_time": 5.146140056007425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3230933584272861, "epoch": 0.003205, "frac_reward_zero_std": 0.0, "grad_norm": 0.08683346956968307, "kl": 0.23546411469578743, "learning_rate": 7.999966126742882e-06, "loss": -0.0764, "num_tokens": 8318447.0, "reward": 0.6660128235816956, "reward_std": 1.2632406949996948, "rewards/rollout_reward_func/mean": 0.6660128235816956, "rewards/rollout_reward_func/std": 1.2632406949996948, "sampling/importance_sampling_ratio/max": 1.2103071212768555, "sampling/importance_sampling_ratio/mean": 0.6975275278091431, "sampling/importance_sampling_ratio/min": 0.0001649367477511987, "sampling/sampling_logp_difference/max": 1.4618260860443115, "sampling/sampling_logp_difference/mean": 0.25332406163215637, "step": 641, "step_time": 15.118439757978194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3170897215604782, "epoch": 0.00321, "grad_norm": 0.08667749166488647, "kl": 0.2392789125442505, "learning_rate": 7.999966014672842e-06, "loss": -0.0764, "step": 642, "step_time": 7.228969939009403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0946101285517216, "epoch": 0.003215, "frac_reward_zero_std": 0.0, "grad_norm": 0.18658386170864105, "kl": 0.24499188363552094, "learning_rate": 7.999965902417713e-06, "loss": -0.0745, "num_tokens": 8350456.0, "reward": 1.247424840927124, "reward_std": 1.1142089366912842, "rewards/rollout_reward_func/mean": 1.247424840927124, "rewards/rollout_reward_func/std": 1.1142090559005737, "sampling/importance_sampling_ratio/max": 1.1989458799362183, "sampling/importance_sampling_ratio/mean": 0.8457460403442383, "sampling/importance_sampling_ratio/min": 4.0850838558981195e-05, "sampling/sampling_logp_difference/max": 1.570314645767212, "sampling/sampling_logp_difference/mean": 0.19405294954776764, "step": 643, "step_time": 13.666959012989537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0953561179339886, "epoch": 0.00322, "grad_norm": 0.2041250467300415, "kl": 0.243397394195199, "learning_rate": 7.999965789977501e-06, "loss": -0.0749, "step": 644, "step_time": 6.895986922987504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8331782072782516, "epoch": 0.003225, "frac_reward_zero_std": 0.0, "grad_norm": 0.07835870236158371, "kl": 0.9027986079454422, "learning_rate": 7.999965677352206e-06, "loss": -0.0779, "num_tokens": 8374835.0, "reward": 1.5502252578735352, "reward_std": 0.885377049446106, "rewards/rollout_reward_func/mean": 1.5502252578735352, "rewards/rollout_reward_func/std": 0.8853771686553955, "sampling/importance_sampling_ratio/max": 1.1755272150039673, "sampling/importance_sampling_ratio/mean": 0.9206781387329102, "sampling/importance_sampling_ratio/min": 1.0905213457590435e-05, "sampling/sampling_logp_difference/max": 1.3931694030761719, "sampling/sampling_logp_difference/mean": 0.1957172453403473, "step": 645, "step_time": 9.817119375991751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8310690224170685, "epoch": 0.00323, "grad_norm": 0.07537659257650375, "kl": 0.8994704559445381, "learning_rate": 7.999965564541822e-06, "loss": -0.078, "step": 646, "step_time": 5.288223838011618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6873574536293745, "epoch": 0.003235, "frac_reward_zero_std": 0.0, "grad_norm": 0.06983915716409683, "kl": 0.49076715111732483, "learning_rate": 7.999965451546353e-06, "loss": -0.0955, "num_tokens": 8403275.0, "reward": 1.1755237579345703, "reward_std": 1.215641736984253, "rewards/rollout_reward_func/mean": 1.1755237579345703, "rewards/rollout_reward_func/std": 1.215641736984253, "sampling/importance_sampling_ratio/max": 1.232473373413086, "sampling/importance_sampling_ratio/mean": 0.7480033040046692, "sampling/importance_sampling_ratio/min": 1.0095229896478486e-07, "sampling/sampling_logp_difference/max": 2.204871892929077, "sampling/sampling_logp_difference/mean": 0.3934577703475952, "step": 647, "step_time": 13.321285975005594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.688714362680912, "epoch": 0.00324, "grad_norm": 0.05772915109992027, "kl": 0.43640412390232086, "learning_rate": 7.999965338365799e-06, "loss": -0.0959, "step": 648, "step_time": 6.594167707007728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7980947196483612, "epoch": 0.003245, "frac_reward_zero_std": 0.0, "grad_norm": 0.0654800683259964, "kl": 0.35675814375281334, "learning_rate": 7.999965225000161e-06, "loss": -0.0986, "num_tokens": 8435785.0, "reward": 0.38393670320510864, "reward_std": 1.4218471050262451, "rewards/rollout_reward_func/mean": 0.38393670320510864, "rewards/rollout_reward_func/std": 1.4218471050262451, "sampling/importance_sampling_ratio/max": 1.2549200057983398, "sampling/importance_sampling_ratio/mean": 0.5847409963607788, "sampling/importance_sampling_ratio/min": 0.0006293110782280564, "sampling/sampling_logp_difference/max": 1.3707821369171143, "sampling/sampling_logp_difference/mean": 0.2836979925632477, "step": 649, "step_time": 14.032537023012992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7957354485988617, "epoch": 0.00325, "grad_norm": 0.06605013459920883, "kl": 0.345407210290432, "learning_rate": 7.999965111449436e-06, "loss": -0.0988, "step": 650, "step_time": 6.471920630996465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1416266858577728, "epoch": 0.003255, "frac_reward_zero_std": 0.0, "grad_norm": 0.11996454745531082, "kl": 0.22612331807613373, "learning_rate": 7.999964997713627e-06, "loss": -0.0672, "num_tokens": 8461303.0, "reward": 0.7652609348297119, "reward_std": 1.2687889337539673, "rewards/rollout_reward_func/mean": 0.7652609348297119, "rewards/rollout_reward_func/std": 1.2687889337539673, "sampling/importance_sampling_ratio/max": 1.200913906097412, "sampling/importance_sampling_ratio/mean": 0.8335727453231812, "sampling/importance_sampling_ratio/min": 0.001768187852576375, "sampling/sampling_logp_difference/max": 1.7805460691452026, "sampling/sampling_logp_difference/mean": 0.17851218581199646, "step": 651, "step_time": 12.953493382010492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1538931876420975, "epoch": 0.00326, "grad_norm": 0.13139557838439941, "kl": 0.22414185106754303, "learning_rate": 7.999964883792732e-06, "loss": -0.0676, "step": 652, "step_time": 6.183343191994936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3054974265396595, "epoch": 0.003265, "frac_reward_zero_std": 0.5, "grad_norm": 0.16456176340579987, "kl": 0.31357699632644653, "learning_rate": 7.999964769686752e-06, "loss": -0.0346, "num_tokens": 8484751.0, "reward": 1.7231106758117676, "reward_std": 0.6114364266395569, "rewards/rollout_reward_func/mean": 1.7231106758117676, "rewards/rollout_reward_func/std": 0.6114364266395569, "sampling/importance_sampling_ratio/max": 1.232934594154358, "sampling/importance_sampling_ratio/mean": 1.0362919569015503, "sampling/importance_sampling_ratio/min": 0.21569964289665222, "sampling/sampling_logp_difference/max": 0.7064833641052246, "sampling/sampling_logp_difference/mean": 0.04680383950471878, "step": 653, "step_time": 9.739973199990345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30754299089312553, "epoch": 0.00327, "grad_norm": 0.13901858031749725, "kl": 0.32181230932474136, "learning_rate": 7.999964655395686e-06, "loss": -0.0349, "step": 654, "step_time": 4.95672412001295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8238051608204842, "epoch": 0.003275, "frac_reward_zero_std": 0.5, "grad_norm": 0.08336493372917175, "kl": 0.6461129523813725, "learning_rate": 7.999964540919536e-06, "loss": -0.0317, "num_tokens": 8504430.0, "reward": 1.5377589464187622, "reward_std": 0.938045084476471, "rewards/rollout_reward_func/mean": 1.5377589464187622, "rewards/rollout_reward_func/std": 0.938045084476471, "sampling/importance_sampling_ratio/max": 1.1060359477996826, "sampling/importance_sampling_ratio/mean": 0.812759280204773, "sampling/importance_sampling_ratio/min": 0.00010921696230070665, "sampling/sampling_logp_difference/max": 1.8263731002807617, "sampling/sampling_logp_difference/mean": 0.14391648769378662, "step": 655, "step_time": 10.109892590000527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8254932183772326, "epoch": 0.00328, "grad_norm": 0.09219855070114136, "kl": 0.6873171776533127, "learning_rate": 7.9999644262583e-06, "loss": -0.0315, "step": 656, "step_time": 4.8993844560027355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.248666264116764, "epoch": 0.003285, "frac_reward_zero_std": 0.0, "grad_norm": 0.08341837674379349, "kl": 0.266302939504385, "learning_rate": 7.99996431141198e-06, "loss": -0.037, "num_tokens": 8540944.0, "reward": 0.005710870027542114, "reward_std": 0.905836820602417, "rewards/rollout_reward_func/mean": 0.005710870027542114, "rewards/rollout_reward_func/std": 0.905836820602417, "sampling/importance_sampling_ratio/max": 1.2414579391479492, "sampling/importance_sampling_ratio/mean": 0.9015401601791382, "sampling/importance_sampling_ratio/min": 0.0005222981562837958, "sampling/sampling_logp_difference/max": 1.3396201133728027, "sampling/sampling_logp_difference/mean": 0.16967138648033142, "step": 657, "step_time": 15.608505662996322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2498038858175278, "epoch": 0.00329, "grad_norm": 0.08271997421979904, "kl": 0.26650353893637657, "learning_rate": 7.999964196380572e-06, "loss": -0.0372, "step": 658, "step_time": 7.060465110000223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9763163849711418, "epoch": 0.003295, "frac_reward_zero_std": 0.5, "grad_norm": 0.14080947637557983, "kl": 0.3760853223502636, "learning_rate": 7.99996408116408e-06, "loss": 0.0394, "num_tokens": 8565971.0, "reward": 0.7287441492080688, "reward_std": 1.3115098476409912, "rewards/rollout_reward_func/mean": 0.7287441492080688, "rewards/rollout_reward_func/std": 1.3115099668502808, "sampling/importance_sampling_ratio/max": 1.3163005113601685, "sampling/importance_sampling_ratio/mean": 0.9204069375991821, "sampling/importance_sampling_ratio/min": 1.9677892851177603e-05, "sampling/sampling_logp_difference/max": 1.6634492874145508, "sampling/sampling_logp_difference/mean": 0.17771145701408386, "step": 659, "step_time": 13.740592512011062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9782384112477303, "epoch": 0.0033, "grad_norm": 0.1438875049352646, "kl": 0.36733506992459297, "learning_rate": 7.999963965762504e-06, "loss": 0.0392, "step": 660, "step_time": 6.943563180000638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8621971607208252, "epoch": 0.003305, "frac_reward_zero_std": 0.5, "grad_norm": 0.07263505458831787, "kl": 0.23004139214754105, "learning_rate": 7.99996385017584e-06, "loss": -0.047, "num_tokens": 8590779.0, "reward": 0.6908890008926392, "reward_std": 1.2038356065750122, "rewards/rollout_reward_func/mean": 0.6908890008926392, "rewards/rollout_reward_func/std": 1.2038356065750122, "sampling/importance_sampling_ratio/max": 1.210442304611206, "sampling/importance_sampling_ratio/mean": 0.781091570854187, "sampling/importance_sampling_ratio/min": 0.006077005062252283, "sampling/sampling_logp_difference/max": 1.1168057918548584, "sampling/sampling_logp_difference/mean": 0.13498975336551666, "step": 661, "step_time": 13.368579108995618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8702861666679382, "epoch": 0.00331, "grad_norm": 0.07346092164516449, "kl": 0.23154975473880768, "learning_rate": 7.999963734404094e-06, "loss": -0.0468, "step": 662, "step_time": 6.068440500006545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1594949960708618, "epoch": 0.003315, "frac_reward_zero_std": 0.0, "grad_norm": 0.05956146866083145, "kl": 0.4568057507276535, "learning_rate": 7.999963618447261e-06, "loss": -0.087, "num_tokens": 8608614.0, "reward": 1.259489893913269, "reward_std": 1.2214140892028809, "rewards/rollout_reward_func/mean": 1.259489893913269, "rewards/rollout_reward_func/std": 1.2214140892028809, "sampling/importance_sampling_ratio/max": 1.0965297222137451, "sampling/importance_sampling_ratio/mean": 0.8609210252761841, "sampling/importance_sampling_ratio/min": 6.29975716037734e-08, "sampling/sampling_logp_difference/max": 2.202613115310669, "sampling/sampling_logp_difference/mean": 0.24391871690750122, "step": 663, "step_time": 8.294879925015266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.164942990988493, "epoch": 0.00332, "grad_norm": 0.06369593739509583, "kl": 0.441891398280859, "learning_rate": 7.999963502305342e-06, "loss": -0.0869, "step": 664, "step_time": 4.249837903000298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1343101486563683, "epoch": 0.003325, "frac_reward_zero_std": 0.0, "grad_norm": 0.098505899310112, "kl": 0.4381450302898884, "learning_rate": 7.999963385978338e-06, "loss": -0.0704, "num_tokens": 8630849.0, "reward": 0.8860252499580383, "reward_std": 1.4205724000930786, "rewards/rollout_reward_func/mean": 0.8860252499580383, "rewards/rollout_reward_func/std": 1.420572280883789, "sampling/importance_sampling_ratio/max": 1.1072036027908325, "sampling/importance_sampling_ratio/mean": 0.6658751368522644, "sampling/importance_sampling_ratio/min": 0.0011688696686178446, "sampling/sampling_logp_difference/max": 1.6730393171310425, "sampling/sampling_logp_difference/mean": 0.2293042689561844, "step": 665, "step_time": 11.853200861994992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1343601867556572, "epoch": 0.00333, "grad_norm": 0.1004658117890358, "kl": 0.41770949214696884, "learning_rate": 7.999963269466251e-06, "loss": -0.0706, "step": 666, "step_time": 5.602470924015506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.066667079925537, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4599767327308655, "epoch": 0.003335, "frac_reward_zero_std": 0.0, "grad_norm": 0.13034102320671082, "kl": 0.5240182802081108, "learning_rate": 7.999963152769077e-06, "loss": -0.062, "num_tokens": 8661459.0, "reward": 0.375628262758255, "reward_std": 1.270728349685669, "rewards/rollout_reward_func/mean": 0.375628262758255, "rewards/rollout_reward_func/std": 1.2707284688949585, "sampling/importance_sampling_ratio/max": 1.1188397407531738, "sampling/importance_sampling_ratio/mean": 0.6813089847564697, "sampling/importance_sampling_ratio/min": 0.0055952235125005245, "sampling/sampling_logp_difference/max": 1.621917724609375, "sampling/sampling_logp_difference/mean": 0.22969144582748413, "step": 667, "step_time": 13.495600346010178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4612377434968948, "epoch": 0.00334, "grad_norm": 0.12890508770942688, "kl": 0.5435045808553696, "learning_rate": 7.999963035886818e-06, "loss": -0.0621, "step": 668, "step_time": 7.835224284994183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1248205080628395, "epoch": 0.003345, "frac_reward_zero_std": 0.0, "grad_norm": 0.1198320984840393, "kl": 0.47468868643045425, "learning_rate": 7.999962918819473e-06, "loss": -0.0835, "num_tokens": 8693121.0, "reward": 0.7979624271392822, "reward_std": 1.2256546020507812, "rewards/rollout_reward_func/mean": 0.7979624271392822, "rewards/rollout_reward_func/std": 1.2256546020507812, "sampling/importance_sampling_ratio/max": 1.2952824831008911, "sampling/importance_sampling_ratio/mean": 0.68336421251297, "sampling/importance_sampling_ratio/min": 0.0005926156882196665, "sampling/sampling_logp_difference/max": 1.5977582931518555, "sampling/sampling_logp_difference/mean": 0.206785649061203, "step": 669, "step_time": 13.649713800026802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1264403611421585, "epoch": 0.00335, "grad_norm": 0.1213003396987915, "kl": 0.470620833337307, "learning_rate": 7.999962801567045e-06, "loss": -0.0841, "step": 670, "step_time": 6.867428051002207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6215224862098694, "epoch": 0.003355, "frac_reward_zero_std": 0.0, "grad_norm": 0.04923594743013382, "kl": 0.18628374487161636, "learning_rate": 7.99996268412953e-06, "loss": -0.0594, "num_tokens": 8726229.0, "reward": 0.5012231469154358, "reward_std": 1.339870810508728, "rewards/rollout_reward_func/mean": 0.5012231469154358, "rewards/rollout_reward_func/std": 1.339870810508728, "sampling/importance_sampling_ratio/max": 1.2015830278396606, "sampling/importance_sampling_ratio/mean": 0.6484518051147461, "sampling/importance_sampling_ratio/min": 7.227286550914869e-05, "sampling/sampling_logp_difference/max": 1.6429895162582397, "sampling/sampling_logp_difference/mean": 0.29644545912742615, "step": 671, "step_time": 16.219206542009488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6220270097255707, "epoch": 0.00336, "grad_norm": 0.04596390947699547, "kl": 0.18836808018386364, "learning_rate": 7.99996256650693e-06, "loss": -0.0595, "step": 672, "step_time": 7.8102592949871905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 3.8888888359069824, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.025106579065323, "epoch": 0.003365, "frac_reward_zero_std": 0.0, "grad_norm": 0.05345650762319565, "kl": 0.10757099092006683, "learning_rate": 7.999962448699245e-06, "loss": -0.0567, "num_tokens": 8761484.0, "reward": 0.13666939735412598, "reward_std": 0.9828222393989563, "rewards/rollout_reward_func/mean": 0.13666939735412598, "rewards/rollout_reward_func/std": 0.9828222393989563, "sampling/importance_sampling_ratio/max": 1.1262694597244263, "sampling/importance_sampling_ratio/mean": 0.6118495464324951, "sampling/importance_sampling_ratio/min": 0.00034321751445531845, "sampling/sampling_logp_difference/max": 1.186645746231079, "sampling/sampling_logp_difference/mean": 0.29571324586868286, "step": 673, "step_time": 15.72993105200294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.019382566213608, "epoch": 0.00337, "grad_norm": 0.05148537829518318, "kl": 0.10754589177668095, "learning_rate": 7.999962330706475e-06, "loss": -0.0566, "step": 674, "step_time": 7.253273798007285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5982207953929901, "epoch": 0.003375, "frac_reward_zero_std": 0.0, "grad_norm": 0.13589410483837128, "kl": 0.24080229178071022, "learning_rate": 7.99996221252862e-06, "loss": -0.0311, "num_tokens": 8787747.0, "reward": 0.3283061981201172, "reward_std": 0.9328439831733704, "rewards/rollout_reward_func/mean": 0.3283061981201172, "rewards/rollout_reward_func/std": 0.9328440427780151, "sampling/importance_sampling_ratio/max": 1.1096059083938599, "sampling/importance_sampling_ratio/mean": 0.6031507849693298, "sampling/importance_sampling_ratio/min": 0.0005111073842272162, "sampling/sampling_logp_difference/max": 1.7486274242401123, "sampling/sampling_logp_difference/mean": 0.25049692392349243, "step": 675, "step_time": 15.69927524200466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5975358486175537, "epoch": 0.00338, "grad_norm": 0.13577942550182343, "kl": 0.24018871039152145, "learning_rate": 7.99996209416568e-06, "loss": -0.0314, "step": 676, "step_time": 6.923183244012762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 5.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3680732250213623, "epoch": 0.003385, "frac_reward_zero_std": 0.0, "grad_norm": 0.11018703877925873, "kl": 0.14329352788627148, "learning_rate": 7.999961975617654e-06, "loss": -0.1054, "num_tokens": 8813609.0, "reward": -0.4495247006416321, "reward_std": 0.9054649472236633, "rewards/rollout_reward_func/mean": -0.4495247006416321, "rewards/rollout_reward_func/std": 0.9054649472236633, "sampling/importance_sampling_ratio/max": 1.2379982471466064, "sampling/importance_sampling_ratio/mean": 0.6399083137512207, "sampling/importance_sampling_ratio/min": 4.005872324341908e-05, "sampling/sampling_logp_difference/max": 2.6433982849121094, "sampling/sampling_logp_difference/mean": 0.3503893315792084, "step": 677, "step_time": 13.804951562982751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3588943481445312, "epoch": 0.00339, "grad_norm": 0.10764890164136887, "kl": 0.14437542669475079, "learning_rate": 7.999961856884545e-06, "loss": -0.1056, "step": 678, "step_time": 7.082211147993803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2099997326731682, "epoch": 0.003395, "frac_reward_zero_std": 0.0, "grad_norm": 0.16854244470596313, "kl": 1.4582579880952835, "learning_rate": 7.999961737966349e-06, "loss": -0.0291, "num_tokens": 8838737.0, "reward": -0.03276854753494263, "reward_std": 1.3284733295440674, "rewards/rollout_reward_func/mean": -0.03276854753494263, "rewards/rollout_reward_func/std": 1.3284733295440674, "sampling/importance_sampling_ratio/max": 1.2517013549804688, "sampling/importance_sampling_ratio/mean": 0.8151562213897705, "sampling/importance_sampling_ratio/min": 0.0013618356315419078, "sampling/sampling_logp_difference/max": 1.418928623199463, "sampling/sampling_logp_difference/mean": 0.20458433032035828, "step": 679, "step_time": 13.213871607993497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2066228091716766, "epoch": 0.0034, "grad_norm": 0.16446638107299805, "kl": 1.4559294357895851, "learning_rate": 7.999961618863067e-06, "loss": -0.0291, "step": 680, "step_time": 6.676667523992364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.6875, "completions/mean_terminated_length": 6.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.566129446029663, "epoch": 0.003405, "frac_reward_zero_std": 0.0, "grad_norm": 0.09664878249168396, "kl": 0.3928507808595896, "learning_rate": 7.999961499574702e-06, "loss": -0.0395, "num_tokens": 8869079.0, "reward": -0.551897406578064, "reward_std": 0.7252459526062012, "rewards/rollout_reward_func/mean": -0.551897406578064, "rewards/rollout_reward_func/std": 0.7252460718154907, "sampling/importance_sampling_ratio/max": 1.1446431875228882, "sampling/importance_sampling_ratio/mean": 0.404926598072052, "sampling/importance_sampling_ratio/min": 6.823635339969769e-05, "sampling/sampling_logp_difference/max": 1.745697021484375, "sampling/sampling_logp_difference/mean": 0.35911452770233154, "step": 681, "step_time": 15.125818811997306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5739803314208984, "epoch": 0.00341, "grad_norm": 0.09624020010232925, "kl": 0.3640786372125149, "learning_rate": 7.99996138010125e-06, "loss": -0.0398, "step": 682, "step_time": 6.917950630973792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1083355769515038, "epoch": 0.003415, "frac_reward_zero_std": 0.0, "grad_norm": 0.07499851286411285, "kl": 0.2662259191274643, "learning_rate": 7.999961260442715e-06, "loss": -0.0889, "num_tokens": 8896061.0, "reward": 1.1493291854858398, "reward_std": 1.1742724180221558, "rewards/rollout_reward_func/mean": 1.1493291854858398, "rewards/rollout_reward_func/std": 1.1742725372314453, "sampling/importance_sampling_ratio/max": 1.1354933977127075, "sampling/importance_sampling_ratio/mean": 0.8132882118225098, "sampling/importance_sampling_ratio/min": 0.0013860400067642331, "sampling/sampling_logp_difference/max": 1.1083691120147705, "sampling/sampling_logp_difference/mean": 0.19778163731098175, "step": 683, "step_time": 13.23154647499905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1100555807352066, "epoch": 0.00342, "grad_norm": 0.07318806648254395, "kl": 0.26287777349352837, "learning_rate": 7.999961140599095e-06, "loss": -0.0889, "step": 684, "step_time": 6.614927458998864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.047624170780182, "epoch": 0.003425, "frac_reward_zero_std": 0.0, "grad_norm": 0.08950678259134293, "kl": 1.0970641281455755, "learning_rate": 7.999961020570388e-06, "loss": -0.112, "num_tokens": 8927580.0, "reward": 0.7340770959854126, "reward_std": 1.3051754236221313, "rewards/rollout_reward_func/mean": 0.7340770959854126, "rewards/rollout_reward_func/std": 1.3051753044128418, "sampling/importance_sampling_ratio/max": 1.2597452402114868, "sampling/importance_sampling_ratio/mean": 0.6219525933265686, "sampling/importance_sampling_ratio/min": 9.045785191119649e-06, "sampling/sampling_logp_difference/max": 2.005366802215576, "sampling/sampling_logp_difference/mean": 0.3656989336013794, "step": 685, "step_time": 14.64743908199307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.04900860786438, "epoch": 0.00343, "grad_norm": 0.0835244432091713, "kl": 1.0364587754011154, "learning_rate": 7.999960900356597e-06, "loss": -0.1121, "step": 686, "step_time": 6.905477987005725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6925899684429169, "epoch": 0.003435, "frac_reward_zero_std": 0.0, "grad_norm": 0.06389925628900528, "kl": 0.393298726528883, "learning_rate": 7.999960779957721e-06, "loss": -0.0459, "num_tokens": 8959047.0, "reward": 0.507768452167511, "reward_std": 1.2179523706436157, "rewards/rollout_reward_func/mean": 0.507768452167511, "rewards/rollout_reward_func/std": 1.2179524898529053, "sampling/importance_sampling_ratio/max": 1.156020164489746, "sampling/importance_sampling_ratio/mean": 0.6138237118721008, "sampling/importance_sampling_ratio/min": 0.00012245643301866949, "sampling/sampling_logp_difference/max": 1.652174472808838, "sampling/sampling_logp_difference/mean": 0.2892751097679138, "step": 687, "step_time": 13.054833928996231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6908849328756332, "epoch": 0.00344, "grad_norm": 0.0644867792725563, "kl": 0.37840086221694946, "learning_rate": 7.999960659373759e-06, "loss": -0.046, "step": 688, "step_time": 7.243709760994534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.45780442468822, "epoch": 0.003445, "frac_reward_zero_std": 0.0, "grad_norm": 0.04526463523507118, "kl": 0.1905357725918293, "learning_rate": 7.999960538604712e-06, "loss": -0.1018, "num_tokens": 8990016.0, "reward": 1.224982738494873, "reward_std": 1.0447165966033936, "rewards/rollout_reward_func/mean": 1.224982738494873, "rewards/rollout_reward_func/std": 1.0447165966033936, "sampling/importance_sampling_ratio/max": 1.2002365589141846, "sampling/importance_sampling_ratio/mean": 0.8258980512619019, "sampling/importance_sampling_ratio/min": 0.00032040057703852654, "sampling/sampling_logp_difference/max": 1.6198921203613281, "sampling/sampling_logp_difference/mean": 0.245182603597641, "step": 689, "step_time": 13.464282635992276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4569567628204823, "epoch": 0.00345, "grad_norm": 0.04649205878376961, "kl": 0.19045929610729218, "learning_rate": 7.999960417650583e-06, "loss": -0.1018, "step": 690, "step_time": 6.844496842997614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 6.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8539829552173615, "epoch": 0.003455, "frac_reward_zero_std": 0.0, "grad_norm": 0.1791241317987442, "kl": 1.1530779283493757, "learning_rate": 7.999960296511364e-06, "loss": -0.0539, "num_tokens": 9024721.0, "reward": 0.1810111254453659, "reward_std": 1.1599317789077759, "rewards/rollout_reward_func/mean": 0.1810111254453659, "rewards/rollout_reward_func/std": 1.1599317789077759, "sampling/importance_sampling_ratio/max": 1.259913682937622, "sampling/importance_sampling_ratio/mean": 0.5362128615379333, "sampling/importance_sampling_ratio/min": 4.913795237371232e-06, "sampling/sampling_logp_difference/max": 2.824949026107788, "sampling/sampling_logp_difference/mean": 0.3531115651130676, "step": 691, "step_time": 14.718733504996635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8537784777581692, "epoch": 0.00346, "grad_norm": 0.1571999043226242, "kl": 1.0485824309289455, "learning_rate": 7.999960175187063e-06, "loss": -0.0543, "step": 692, "step_time": 7.651440892994287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7701923884451389, "epoch": 0.003465, "frac_reward_zero_std": 0.5, "grad_norm": 0.11663153767585754, "kl": 0.232278473675251, "learning_rate": 7.999960053677677e-06, "loss": -0.0211, "num_tokens": 9045445.0, "reward": 0.8798762559890747, "reward_std": 1.3937106132507324, "rewards/rollout_reward_func/mean": 0.8798762559890747, "rewards/rollout_reward_func/std": 1.3937106132507324, "sampling/importance_sampling_ratio/max": 1.14894700050354, "sampling/importance_sampling_ratio/mean": 0.9328707456588745, "sampling/importance_sampling_ratio/min": 0.002105084480717778, "sampling/sampling_logp_difference/max": 0.9292674660682678, "sampling/sampling_logp_difference/mean": 0.11683249473571777, "step": 693, "step_time": 10.197720935990219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7795790024101734, "epoch": 0.00347, "grad_norm": 0.12454481422901154, "kl": 0.2310417741537094, "learning_rate": 7.999959931983205e-06, "loss": -0.0213, "step": 694, "step_time": 4.938276814005803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9827658161520958, "epoch": 0.003475, "frac_reward_zero_std": 0.0, "grad_norm": 0.15629667043685913, "kl": 0.3471626490354538, "learning_rate": 7.999959810103648e-06, "loss": -0.0605, "num_tokens": 9076162.0, "reward": 0.9315350651741028, "reward_std": 1.1482332944869995, "rewards/rollout_reward_func/mean": 0.9315350651741028, "rewards/rollout_reward_func/std": 1.148233413696289, "sampling/importance_sampling_ratio/max": 1.1687694787979126, "sampling/importance_sampling_ratio/mean": 0.8262165784835815, "sampling/importance_sampling_ratio/min": 0.0010970367584377527, "sampling/sampling_logp_difference/max": 1.1434825658798218, "sampling/sampling_logp_difference/mean": 0.15359637141227722, "step": 695, "step_time": 12.921548619997338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.987183079123497, "epoch": 0.00348, "grad_norm": 0.14800043404102325, "kl": 0.35096951574087143, "learning_rate": 7.999959688039008e-06, "loss": -0.0609, "step": 696, "step_time": 6.495685537011013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.616086021065712, "epoch": 0.003485, "frac_reward_zero_std": 0.0, "grad_norm": 0.1333933025598526, "kl": 0.4414472356438637, "learning_rate": 7.999959565789282e-06, "loss": -0.0617, "num_tokens": 9099629.0, "reward": 1.6940090656280518, "reward_std": 0.7747988104820251, "rewards/rollout_reward_func/mean": 1.6940090656280518, "rewards/rollout_reward_func/std": 0.7747987508773804, "sampling/importance_sampling_ratio/max": 1.2954602241516113, "sampling/importance_sampling_ratio/mean": 0.9717329740524292, "sampling/importance_sampling_ratio/min": 0.000253507518209517, "sampling/sampling_logp_difference/max": 1.1202222108840942, "sampling/sampling_logp_difference/mean": 0.13085836172103882, "step": 697, "step_time": 11.127060719998553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6150660142302513, "epoch": 0.00349, "grad_norm": 0.12350718677043915, "kl": 0.45487869530916214, "learning_rate": 7.999959443354469e-06, "loss": -0.0618, "step": 698, "step_time": 5.590182897998602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.850397765636444, "epoch": 0.003495, "frac_reward_zero_std": 0.0, "grad_norm": 0.061336617916822433, "kl": 0.41917333751916885, "learning_rate": 7.999959320734573e-06, "loss": -0.0725, "num_tokens": 9124553.0, "reward": 0.5628238320350647, "reward_std": 1.182220220565796, "rewards/rollout_reward_func/mean": 0.5628238320350647, "rewards/rollout_reward_func/std": 1.1822203397750854, "sampling/importance_sampling_ratio/max": 1.1691925525665283, "sampling/importance_sampling_ratio/mean": 0.6789427995681763, "sampling/importance_sampling_ratio/min": 4.76079476356972e-05, "sampling/sampling_logp_difference/max": 1.8347773551940918, "sampling/sampling_logp_difference/mean": 0.29799339175224304, "step": 699, "step_time": 12.158296935987892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8430593013763428, "epoch": 0.0035, "grad_norm": 0.05299018323421478, "kl": 0.4200447928160429, "learning_rate": 7.999959197929591e-06, "loss": -0.0729, "step": 700, "step_time": 5.670394202024909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 4.700000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8242022693157196, "epoch": 0.003505, "frac_reward_zero_std": 0.0, "grad_norm": 0.13167905807495117, "kl": 0.22871077992022038, "learning_rate": 7.999959074939526e-06, "loss": -0.087, "num_tokens": 9153264.0, "reward": 0.6601541042327881, "reward_std": 1.3100353479385376, "rewards/rollout_reward_func/mean": 0.6601541042327881, "rewards/rollout_reward_func/std": 1.3100353479385376, "sampling/importance_sampling_ratio/max": 1.0961406230926514, "sampling/importance_sampling_ratio/mean": 0.5831210017204285, "sampling/importance_sampling_ratio/min": 6.89099920236913e-07, "sampling/sampling_logp_difference/max": 1.9993950128555298, "sampling/sampling_logp_difference/mean": 0.3297823965549469, "step": 701, "step_time": 13.354054793002433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8196329809725285, "epoch": 0.00351, "grad_norm": 0.13389131426811218, "kl": 0.22027289867401123, "learning_rate": 7.999958951764375e-06, "loss": -0.0872, "step": 702, "step_time": 6.861603634009953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4229979142546654, "epoch": 0.003515, "frac_reward_zero_std": 0.0, "grad_norm": 0.052595362067222595, "kl": 0.26850154623389244, "learning_rate": 7.999958828404138e-06, "loss": -0.088, "num_tokens": 9180587.0, "reward": 0.7754933834075928, "reward_std": 1.3403338193893433, "rewards/rollout_reward_func/mean": 0.7754933834075928, "rewards/rollout_reward_func/std": 1.3403338193893433, "sampling/importance_sampling_ratio/max": 1.192733645439148, "sampling/importance_sampling_ratio/mean": 0.6764349937438965, "sampling/importance_sampling_ratio/min": 2.7158237571711652e-05, "sampling/sampling_logp_difference/max": 1.6601970195770264, "sampling/sampling_logp_difference/mean": 0.258297324180603, "step": 703, "step_time": 13.870618549990468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009615384973585606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "entropy": 1.4309498518705368, "epoch": 0.00352, "grad_norm": 0.05250920355319977, "kl": 0.2938203364610672, "learning_rate": 7.999958704858817e-06, "loss": -0.088, "step": 704, "step_time": 6.44624446099624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8184951096773148, "epoch": 0.003525, "frac_reward_zero_std": 0.0, "grad_norm": 0.3202535808086395, "kl": 0.26144247502088547, "learning_rate": 7.999958581128412e-06, "loss": -0.0567, "num_tokens": 9215136.0, "reward": 0.9353054761886597, "reward_std": 1.1353718042373657, "rewards/rollout_reward_func/mean": 0.9353054761886597, "rewards/rollout_reward_func/std": 1.1353718042373657, "sampling/importance_sampling_ratio/max": 1.1547070741653442, "sampling/importance_sampling_ratio/mean": 0.636810839176178, "sampling/importance_sampling_ratio/min": 0.0005602733581326902, "sampling/sampling_logp_difference/max": 1.8258981704711914, "sampling/sampling_logp_difference/mean": 0.2954903244972229, "step": 705, "step_time": 16.798510445005377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.813162624835968, "epoch": 0.00353, "grad_norm": 0.26334166526794434, "kl": 0.254614919424057, "learning_rate": 7.99995845721292e-06, "loss": -0.0587, "step": 706, "step_time": 8.129045416004374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 5.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4904929399490356, "epoch": 0.003535, "frac_reward_zero_std": 0.0, "grad_norm": 0.1047821044921875, "kl": 0.31853567250072956, "learning_rate": 7.999958333112344e-06, "loss": -0.0899, "num_tokens": 9240363.0, "reward": 0.6062040328979492, "reward_std": 1.2526201009750366, "rewards/rollout_reward_func/mean": 0.6062040328979492, "rewards/rollout_reward_func/std": 1.2526201009750366, "sampling/importance_sampling_ratio/max": 1.2822561264038086, "sampling/importance_sampling_ratio/mean": 0.7877361178398132, "sampling/importance_sampling_ratio/min": 3.012031265825499e-05, "sampling/sampling_logp_difference/max": 1.8769307136535645, "sampling/sampling_logp_difference/mean": 0.3073875308036804, "step": 707, "step_time": 12.322310026007472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4916400760412216, "epoch": 0.00354, "grad_norm": 0.10179579257965088, "kl": 0.32983945682644844, "learning_rate": 7.999958208826683e-06, "loss": -0.09, "step": 708, "step_time": 5.899643729993841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2094347178936005, "epoch": 0.003545, "frac_reward_zero_std": 0.0, "grad_norm": 0.11676786094903946, "kl": 1.6894513256847858, "learning_rate": 7.999958084355938e-06, "loss": -0.0903, "num_tokens": 9269930.0, "reward": 0.0758669376373291, "reward_std": 1.1072872877120972, "rewards/rollout_reward_func/mean": 0.0758669376373291, "rewards/rollout_reward_func/std": 1.1072872877120972, "sampling/importance_sampling_ratio/max": 1.2473779916763306, "sampling/importance_sampling_ratio/mean": 0.5769038796424866, "sampling/importance_sampling_ratio/min": 3.975285949309182e-07, "sampling/sampling_logp_difference/max": 2.327249526977539, "sampling/sampling_logp_difference/mean": 0.38717660307884216, "step": 709, "step_time": 13.83651842000836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2105733454227448, "epoch": 0.00355, "grad_norm": 0.1130724847316742, "kl": 1.623740190640092, "learning_rate": 7.999957959700107e-06, "loss": -0.0904, "step": 710, "step_time": 6.1769563230045605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1433958262205124, "epoch": 0.003555, "frac_reward_zero_std": 0.0, "grad_norm": 0.1722908467054367, "kl": 0.6493918001651764, "learning_rate": 7.999957834859192e-06, "loss": -0.0915, "num_tokens": 9298062.0, "reward": 0.9739018082618713, "reward_std": 1.2572673559188843, "rewards/rollout_reward_func/mean": 0.9739018082618713, "rewards/rollout_reward_func/std": 1.2572674751281738, "sampling/importance_sampling_ratio/max": 1.280350923538208, "sampling/importance_sampling_ratio/mean": 0.8280972242355347, "sampling/importance_sampling_ratio/min": 0.00023042797693051398, "sampling/sampling_logp_difference/max": 1.1006344556808472, "sampling/sampling_logp_difference/mean": 0.1842767596244812, "step": 711, "step_time": 13.392028935006238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1454831659793854, "epoch": 0.00356, "grad_norm": 0.16843749582767487, "kl": 0.5940857380628586, "learning_rate": 7.999957709833192e-06, "loss": -0.0915, "step": 712, "step_time": 6.798949435018585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.358655661344528, "epoch": 0.003565, "frac_reward_zero_std": 0.0, "grad_norm": 0.2220415472984314, "kl": 0.6409797538071871, "learning_rate": 7.999957584622105e-06, "loss": -0.1231, "num_tokens": 9327709.0, "reward": 0.4097379744052887, "reward_std": 1.1624478101730347, "rewards/rollout_reward_func/mean": 0.4097379744052887, "rewards/rollout_reward_func/std": 1.1624478101730347, "sampling/importance_sampling_ratio/max": 1.4490004777908325, "sampling/importance_sampling_ratio/mean": 0.5169346332550049, "sampling/importance_sampling_ratio/min": 1.0149221452593338e-05, "sampling/sampling_logp_difference/max": 1.7481122016906738, "sampling/sampling_logp_difference/mean": 0.42051592469215393, "step": 713, "step_time": 13.495259661984164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3511769473552704, "epoch": 0.00357, "grad_norm": 0.22832590341567993, "kl": 0.6759400079026818, "learning_rate": 7.999957459225936e-06, "loss": -0.1238, "step": 714, "step_time": 5.966107642991119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.4375, "completions/mean_terminated_length": 6.875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.913093686103821, "epoch": 0.003575, "frac_reward_zero_std": 0.0, "grad_norm": 0.11848580837249756, "kl": 0.837099848780781, "learning_rate": 7.99995733364468e-06, "loss": -0.0347, "num_tokens": 9362177.0, "reward": -0.3594660460948944, "reward_std": 0.6705210208892822, "rewards/rollout_reward_func/mean": -0.3594660460948944, "rewards/rollout_reward_func/std": 0.6705210208892822, "sampling/importance_sampling_ratio/max": 1.3980305194854736, "sampling/importance_sampling_ratio/mean": 0.2606634199619293, "sampling/importance_sampling_ratio/min": 5.574955594056519e-06, "sampling/sampling_logp_difference/max": 2.466684341430664, "sampling/sampling_logp_difference/mean": 0.4143645763397217, "step": 715, "step_time": 16.58470784498786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.911629855632782, "epoch": 0.00358, "grad_norm": 0.11523479968309402, "kl": 0.7914746575988829, "learning_rate": 7.999957207878342e-06, "loss": -0.0349, "step": 716, "step_time": 7.361864077989594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.1875, "completions/mean_terminated_length": 7.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9001062512397766, "epoch": 0.003585, "frac_reward_zero_std": 0.0, "grad_norm": 0.07347320020198822, "kl": 0.2983969645574689, "learning_rate": 7.999957081926916e-06, "loss": -0.0851, "num_tokens": 9394919.0, "reward": -0.23030146956443787, "reward_std": 0.8524927496910095, "rewards/rollout_reward_func/mean": -0.23030146956443787, "rewards/rollout_reward_func/std": 0.8524927496910095, "sampling/importance_sampling_ratio/max": 1.128807783126831, "sampling/importance_sampling_ratio/mean": 0.30131983757019043, "sampling/importance_sampling_ratio/min": 3.345911352425901e-07, "sampling/sampling_logp_difference/max": 2.0282325744628906, "sampling/sampling_logp_difference/mean": 0.3812890648841858, "step": 717, "step_time": 14.607588466999005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.898187220096588, "epoch": 0.00359, "grad_norm": 0.06959161907434464, "kl": 0.27249816060066223, "learning_rate": 7.999956955790408e-06, "loss": -0.0854, "step": 718, "step_time": 6.304030937011703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.239696018397808, "epoch": 0.003595, "frac_reward_zero_std": 0.0, "grad_norm": 0.0770486444234848, "kl": 0.28475608862936497, "learning_rate": 7.999956829468814e-06, "loss": -0.058, "num_tokens": 9422829.0, "reward": 0.7262777090072632, "reward_std": 1.121921181678772, "rewards/rollout_reward_func/mean": 0.7262777090072632, "rewards/rollout_reward_func/std": 1.1219213008880615, "sampling/importance_sampling_ratio/max": 1.158069133758545, "sampling/importance_sampling_ratio/mean": 0.8087966442108154, "sampling/importance_sampling_ratio/min": 0.0023377372417598963, "sampling/sampling_logp_difference/max": 1.5315418243408203, "sampling/sampling_logp_difference/mean": 0.20605774223804474, "step": 719, "step_time": 15.389422115986235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2411836944520473, "epoch": 0.0036, "grad_norm": 0.07582544535398483, "kl": 0.2940532695502043, "learning_rate": 7.999956702962134e-06, "loss": -0.058, "step": 720, "step_time": 8.198954400999355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 4.18181848526001, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9097229316830635, "epoch": 0.003605, "frac_reward_zero_std": 0.0, "grad_norm": 0.4747510254383087, "kl": 0.12388110719621181, "learning_rate": 7.999956576270371e-06, "loss": -0.0545, "num_tokens": 9447931.0, "reward": 0.2247907817363739, "reward_std": 1.095218300819397, "rewards/rollout_reward_func/mean": 0.2247907817363739, "rewards/rollout_reward_func/std": 1.095218300819397, "sampling/importance_sampling_ratio/max": 1.1273859739303589, "sampling/importance_sampling_ratio/mean": 0.6009554862976074, "sampling/importance_sampling_ratio/min": 3.67626162187662e-05, "sampling/sampling_logp_difference/max": 2.2782950401306152, "sampling/sampling_logp_difference/mean": 0.3415030241012573, "step": 721, "step_time": 10.855444928005454 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.9038128331303596, "epoch": 0.00361, "grad_norm": 0.28277352452278137, "kl": 0.12652076687663794, "learning_rate": 7.999956449393523e-06, "loss": -0.0572, "step": 722, "step_time": 5.456321970021236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8126456700265408, "epoch": 0.003615, "frac_reward_zero_std": 0.5, "grad_norm": 0.13158738613128662, "kl": 0.33393100649118423, "learning_rate": 7.999956322331587e-06, "loss": -0.0481, "num_tokens": 9466681.0, "reward": 1.5924917459487915, "reward_std": 0.9605380296707153, "rewards/rollout_reward_func/mean": 1.5924917459487915, "rewards/rollout_reward_func/std": 0.9605380296707153, "sampling/importance_sampling_ratio/max": 1.1291922330856323, "sampling/importance_sampling_ratio/mean": 0.8665206432342529, "sampling/importance_sampling_ratio/min": 0.0013304422609508038, "sampling/sampling_logp_difference/max": 1.205411434173584, "sampling/sampling_logp_difference/mean": 0.1359550505876541, "step": 723, "step_time": 9.008323625967023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8019773531705141, "epoch": 0.00362, "grad_norm": 0.08781937509775162, "kl": 0.33537597209215164, "learning_rate": 7.999956195084569e-06, "loss": -0.0488, "step": 724, "step_time": 4.214237727996078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2166206240653992, "epoch": 0.003625, "frac_reward_zero_std": 0.0, "grad_norm": 0.1483994424343109, "kl": 0.2454337552189827, "learning_rate": 7.999956067652467e-06, "loss": -0.03, "num_tokens": 9493752.0, "reward": 0.029733240604400635, "reward_std": 1.113905429840088, "rewards/rollout_reward_func/mean": 0.029733240604400635, "rewards/rollout_reward_func/std": 1.1139055490493774, "sampling/importance_sampling_ratio/max": 1.1720563173294067, "sampling/importance_sampling_ratio/mean": 0.7965922355651855, "sampling/importance_sampling_ratio/min": 0.002627705689519644, "sampling/sampling_logp_difference/max": 1.0458751916885376, "sampling/sampling_logp_difference/mean": 0.16264942288398743, "step": 725, "step_time": 12.478362621011911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2201540172100067, "epoch": 0.00363, "grad_norm": 0.1456039398908615, "kl": 0.2589956670999527, "learning_rate": 7.99995594003528e-06, "loss": -0.0298, "step": 726, "step_time": 6.007940138995764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.40043329913169146, "epoch": 0.003635, "frac_reward_zero_std": 0.5, "grad_norm": 0.006213516462594271, "kl": 0.2670305371284485, "learning_rate": 7.999955812233007e-06, "loss": -0.0387, "num_tokens": 9512852.0, "reward": 0.6076880693435669, "reward_std": 1.42837393283844, "rewards/rollout_reward_func/mean": 0.6076880693435669, "rewards/rollout_reward_func/std": 1.42837393283844, "sampling/importance_sampling_ratio/max": 1.0847946405410767, "sampling/importance_sampling_ratio/mean": 0.9678269624710083, "sampling/importance_sampling_ratio/min": 0.0012725848937407136, "sampling/sampling_logp_difference/max": 1.3385956287384033, "sampling/sampling_logp_difference/mean": 0.08243055641651154, "step": 727, "step_time": 8.38165870901139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4036505576223135, "epoch": 0.00364, "grad_norm": 0.006229874677956104, "kl": 0.26682448014616966, "learning_rate": 7.999955684245649e-06, "loss": -0.0387, "step": 728, "step_time": 4.334575567016145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.733333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7584939822554588, "epoch": 0.003645, "frac_reward_zero_std": 0.0, "grad_norm": 0.08533787727355957, "kl": 0.37947288155555725, "learning_rate": 7.999955556073208e-06, "loss": -0.0352, "num_tokens": 9545274.0, "reward": 0.8060377240180969, "reward_std": 1.180351972579956, "rewards/rollout_reward_func/mean": 0.8060377240180969, "rewards/rollout_reward_func/std": 1.1803520917892456, "sampling/importance_sampling_ratio/max": 1.220336675643921, "sampling/importance_sampling_ratio/mean": 0.9081854820251465, "sampling/importance_sampling_ratio/min": 0.0066147539764642715, "sampling/sampling_logp_difference/max": 0.9976920485496521, "sampling/sampling_logp_difference/mean": 0.11852318048477173, "step": 729, "step_time": 13.949388287015609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7558746784925461, "epoch": 0.00365, "grad_norm": 0.10863761603832245, "kl": 0.3799305073916912, "learning_rate": 7.999955427715682e-06, "loss": -0.0351, "step": 730, "step_time": 6.965990049997345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8220509923994541, "epoch": 0.003655, "frac_reward_zero_std": 0.5, "grad_norm": 0.018300827592611313, "kl": 0.23458143323659897, "learning_rate": 7.99995529917307e-06, "loss": -0.0592, "num_tokens": 9567473.0, "reward": 1.4784560203552246, "reward_std": 1.0181756019592285, "rewards/rollout_reward_func/mean": 1.4784560203552246, "rewards/rollout_reward_func/std": 1.018175721168518, "sampling/importance_sampling_ratio/max": 1.1734682321548462, "sampling/importance_sampling_ratio/mean": 0.8541676998138428, "sampling/importance_sampling_ratio/min": 0.000498823297675699, "sampling/sampling_logp_difference/max": 1.829849362373352, "sampling/sampling_logp_difference/mean": 0.16208454966545105, "step": 731, "step_time": 11.858695000002626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8229713179171085, "epoch": 0.00366, "grad_norm": 0.018160365521907806, "kl": 0.2346310019493103, "learning_rate": 7.999955170445374e-06, "loss": -0.0592, "step": 732, "step_time": 5.788905869994778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.725899763405323, "epoch": 0.003665, "frac_reward_zero_std": 0.0, "grad_norm": 0.05893789231777191, "kl": 0.542435459792614, "learning_rate": 7.999955041532593e-06, "loss": -0.0723, "num_tokens": 9595702.0, "reward": 1.4650492668151855, "reward_std": 1.0235916376113892, "rewards/rollout_reward_func/mean": 1.4650492668151855, "rewards/rollout_reward_func/std": 1.0235917568206787, "sampling/importance_sampling_ratio/max": 1.121846079826355, "sampling/importance_sampling_ratio/mean": 0.8978321552276611, "sampling/importance_sampling_ratio/min": 0.0005451962351799011, "sampling/sampling_logp_difference/max": 1.3022539615631104, "sampling/sampling_logp_difference/mean": 0.14550523459911346, "step": 733, "step_time": 11.100647785002366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7224628962576389, "epoch": 0.00367, "grad_norm": 0.06073994189500809, "kl": 0.5136099234223366, "learning_rate": 7.999954912434727e-06, "loss": -0.0722, "step": 734, "step_time": 5.722064221001347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0978343933820724, "epoch": 0.003675, "frac_reward_zero_std": 0.0, "grad_norm": 0.14843593537807465, "kl": 2.194794535636902, "learning_rate": 7.999954783151778e-06, "loss": -0.0668, "num_tokens": 9626363.0, "reward": 0.46862584352493286, "reward_std": 1.3084601163864136, "rewards/rollout_reward_func/mean": 0.46862584352493286, "rewards/rollout_reward_func/std": 1.3084601163864136, "sampling/importance_sampling_ratio/max": 1.2220139503479004, "sampling/importance_sampling_ratio/mean": 0.781305193901062, "sampling/importance_sampling_ratio/min": 0.001181356725282967, "sampling/sampling_logp_difference/max": 1.161195993423462, "sampling/sampling_logp_difference/mean": 0.20615792274475098, "step": 735, "step_time": 13.124848899009521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0945890545845032, "epoch": 0.00368, "grad_norm": 0.14198999106884003, "kl": 2.0830411203205585, "learning_rate": 7.999954653683744e-06, "loss": -0.0671, "step": 736, "step_time": 6.846546242988552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8106351047754288, "epoch": 0.003685, "frac_reward_zero_std": 0.5, "grad_norm": 0.21281656622886658, "kl": 0.2557304762303829, "learning_rate": 7.999954524030623e-06, "loss": -0.0146, "num_tokens": 9647423.0, "reward": 0.8915312886238098, "reward_std": 1.4148973226547241, "rewards/rollout_reward_func/mean": 0.8915312886238098, "rewards/rollout_reward_func/std": 1.4148973226547241, "sampling/importance_sampling_ratio/max": 1.0629831552505493, "sampling/importance_sampling_ratio/mean": 0.8352953791618347, "sampling/importance_sampling_ratio/min": 0.0009436335531063378, "sampling/sampling_logp_difference/max": 1.310265064239502, "sampling/sampling_logp_difference/mean": 0.14224913716316223, "step": 737, "step_time": 10.600881066988222 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.005434782709926367, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005434782709926367, "entropy": 0.8143831863999367, "epoch": 0.00369, "grad_norm": 0.09942631423473358, "kl": 0.2528381459414959, "learning_rate": 7.999954394192419e-06, "loss": -0.0152, "step": 738, "step_time": 5.199637309982791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.1875, "completions/mean_terminated_length": 4.1875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.34873226657509804, "epoch": 0.003695, "frac_reward_zero_std": 0.5, "grad_norm": 0.22922031581401825, "kl": 1.1922551691532135, "learning_rate": 7.999954264169131e-06, "loss": -0.0245, "num_tokens": 9667993.0, "reward": 0.22217194736003876, "reward_std": 1.2216862440109253, "rewards/rollout_reward_func/mean": 0.22217194736003876, "rewards/rollout_reward_func/std": 1.2216863632202148, "sampling/importance_sampling_ratio/max": 1.093670129776001, "sampling/importance_sampling_ratio/mean": 0.942406415939331, "sampling/importance_sampling_ratio/min": 0.09389755129814148, "sampling/sampling_logp_difference/max": 1.1409006118774414, "sampling/sampling_logp_difference/mean": 0.06613126397132874, "step": 739, "step_time": 8.914253592010937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36582076177001, "epoch": 0.0037, "grad_norm": 0.2547614872455597, "kl": 1.089463584125042, "learning_rate": 7.999954133960758e-06, "loss": -0.0253, "step": 740, "step_time": 5.070320214013918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6176289841532707, "epoch": 0.003705, "frac_reward_zero_std": 0.0, "grad_norm": 0.09292418509721756, "kl": 0.2906287759542465, "learning_rate": 7.999954003567298e-06, "loss": -0.016, "num_tokens": 9695938.0, "reward": 0.6350653767585754, "reward_std": 1.2606418132781982, "rewards/rollout_reward_func/mean": 0.6350653767585754, "rewards/rollout_reward_func/std": 1.2606416940689087, "sampling/importance_sampling_ratio/max": 1.1700993776321411, "sampling/importance_sampling_ratio/mean": 0.953071117401123, "sampling/importance_sampling_ratio/min": 0.00031178389326669276, "sampling/sampling_logp_difference/max": 1.803689956665039, "sampling/sampling_logp_difference/mean": 0.12240436673164368, "step": 741, "step_time": 11.685188960997039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6409705057740211, "epoch": 0.00371, "grad_norm": 0.09687335789203644, "kl": 0.2867344431579113, "learning_rate": 7.999953872988757e-06, "loss": -0.016, "step": 742, "step_time": 6.298606356998789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7982365973293781, "epoch": 0.003715, "frac_reward_zero_std": 0.0, "grad_norm": 0.11069146543741226, "kl": 0.25074291229248047, "learning_rate": 7.999953742225128e-06, "loss": -0.0628, "num_tokens": 9723512.0, "reward": 1.6930534839630127, "reward_std": 0.6888841986656189, "rewards/rollout_reward_func/mean": 1.6930534839630127, "rewards/rollout_reward_func/std": 0.6888841986656189, "sampling/importance_sampling_ratio/max": 1.2147678136825562, "sampling/importance_sampling_ratio/mean": 0.8889244794845581, "sampling/importance_sampling_ratio/min": 3.6403271224116907e-05, "sampling/sampling_logp_difference/max": 1.9175902605056763, "sampling/sampling_logp_difference/mean": 0.1868063509464264, "step": 743, "step_time": 10.823854935995769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8033768273890018, "epoch": 0.00372, "grad_norm": 0.10933607071638107, "kl": 0.24998650327324867, "learning_rate": 7.999953611276415e-06, "loss": -0.0628, "step": 744, "step_time": 5.275915450009052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0119972378015518, "epoch": 0.003725, "frac_reward_zero_std": 0.0, "grad_norm": 0.05510612577199936, "kl": 0.7507788725197315, "learning_rate": 7.99995348014262e-06, "loss": -0.0751, "num_tokens": 9753258.0, "reward": 0.6161727905273438, "reward_std": 1.2342476844787598, "rewards/rollout_reward_func/mean": 0.6161727905273438, "rewards/rollout_reward_func/std": 1.2342476844787598, "sampling/importance_sampling_ratio/max": 1.0921438932418823, "sampling/importance_sampling_ratio/mean": 0.7018173933029175, "sampling/importance_sampling_ratio/min": 1.1012336472049356e-05, "sampling/sampling_logp_difference/max": 1.7161977291107178, "sampling/sampling_logp_difference/mean": 0.2482115626335144, "step": 745, "step_time": 13.127368633984588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.01429583132267, "epoch": 0.00373, "grad_norm": 0.05020260810852051, "kl": 0.7561373114585876, "learning_rate": 7.999953348823737e-06, "loss": -0.0754, "step": 746, "step_time": 6.822011327996734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9882325679063797, "epoch": 0.003735, "frac_reward_zero_std": 0.0, "grad_norm": 0.12041781097650528, "kl": 0.21671933867037296, "learning_rate": 7.999953217319772e-06, "loss": -0.0543, "num_tokens": 9779850.0, "reward": -0.36451929807662964, "reward_std": 0.8828600645065308, "rewards/rollout_reward_func/mean": -0.36451929807662964, "rewards/rollout_reward_func/std": 0.8828601241111755, "sampling/importance_sampling_ratio/max": 1.123559594154358, "sampling/importance_sampling_ratio/mean": 0.83482825756073, "sampling/importance_sampling_ratio/min": 0.004042470827698708, "sampling/sampling_logp_difference/max": 0.969362735748291, "sampling/sampling_logp_difference/mean": 0.15539191663265228, "step": 747, "step_time": 13.210734064006829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9949855282902718, "epoch": 0.00374, "grad_norm": 0.11701814085245132, "kl": 0.21582969836890697, "learning_rate": 7.999953085630722e-06, "loss": -0.0548, "step": 748, "step_time": 6.825062555988552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.414214476943016, "epoch": 0.003745, "frac_reward_zero_std": 0.0, "grad_norm": 0.07443154603242874, "kl": 0.4389503002166748, "learning_rate": 7.999952953756588e-06, "loss": -0.0492, "num_tokens": 9805767.0, "reward": 0.455191969871521, "reward_std": 1.3919706344604492, "rewards/rollout_reward_func/mean": 0.455191969871521, "rewards/rollout_reward_func/std": 1.3919705152511597, "sampling/importance_sampling_ratio/max": 1.2833235263824463, "sampling/importance_sampling_ratio/mean": 0.75983726978302, "sampling/importance_sampling_ratio/min": 0.0001497965567978099, "sampling/sampling_logp_difference/max": 1.6134068965911865, "sampling/sampling_logp_difference/mean": 0.2653772532939911, "step": 749, "step_time": 10.640216682993923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4222503900527954, "epoch": 0.00375, "grad_norm": 0.08311199396848679, "kl": 0.4492768570780754, "learning_rate": 7.999952821697368e-06, "loss": -0.0495, "step": 750, "step_time": 5.6506846559932455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9195176586508751, "epoch": 0.003755, "frac_reward_zero_std": 0.0, "grad_norm": 0.20042914152145386, "kl": 0.2533528096973896, "learning_rate": 7.999952689453064e-06, "loss": -0.0602, "num_tokens": 9831388.0, "reward": 0.4591081142425537, "reward_std": 1.357662558555603, "rewards/rollout_reward_func/mean": 0.4591081142425537, "rewards/rollout_reward_func/std": 1.357662558555603, "sampling/importance_sampling_ratio/max": 1.3070844411849976, "sampling/importance_sampling_ratio/mean": 0.9147984385490417, "sampling/importance_sampling_ratio/min": 0.0015924515901133418, "sampling/sampling_logp_difference/max": 1.2834460735321045, "sampling/sampling_logp_difference/mean": 0.16227027773857117, "step": 751, "step_time": 10.308029268999235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.9281182587146759, "epoch": 0.00376, "grad_norm": 0.06622244417667389, "kl": 0.2554949186742306, "learning_rate": 7.999952557023676e-06, "loss": -0.0612, "step": 752, "step_time": 5.172179278015392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.940245196223259, "epoch": 0.003765, "frac_reward_zero_std": 0.0, "grad_norm": 0.09859853237867355, "kl": 0.41770930029451847, "learning_rate": 7.999952424409202e-06, "loss": -0.026, "num_tokens": 9852379.0, "reward": 0.02390262484550476, "reward_std": 1.240319848060608, "rewards/rollout_reward_func/mean": 0.02390262484550476, "rewards/rollout_reward_func/std": 1.240319848060608, "sampling/importance_sampling_ratio/max": 1.0554600954055786, "sampling/importance_sampling_ratio/mean": 0.590190052986145, "sampling/importance_sampling_ratio/min": 1.415378051206062e-06, "sampling/sampling_logp_difference/max": 1.5168588161468506, "sampling/sampling_logp_difference/mean": 0.34109818935394287, "step": 753, "step_time": 10.124569953011815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.9573888033628464, "epoch": 0.00377, "grad_norm": 0.07936731725931168, "kl": 0.3849342968314886, "learning_rate": 7.999952291609644e-06, "loss": -0.0266, "step": 754, "step_time": 4.834361052009626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 3.909090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6677873879671097, "epoch": 0.003775, "frac_reward_zero_std": 0.0, "grad_norm": 0.05590459331870079, "kl": 0.25229237228631973, "learning_rate": 7.999952158625002e-06, "loss": -0.0828, "num_tokens": 9882057.0, "reward": 0.46814215183258057, "reward_std": 1.299453854560852, "rewards/rollout_reward_func/mean": 0.46814215183258057, "rewards/rollout_reward_func/std": 1.2994539737701416, "sampling/importance_sampling_ratio/max": 1.4442309141159058, "sampling/importance_sampling_ratio/mean": 0.5862385034561157, "sampling/importance_sampling_ratio/min": 0.0008710719994269311, "sampling/sampling_logp_difference/max": 1.0860185623168945, "sampling/sampling_logp_difference/mean": 0.2614670693874359, "step": 755, "step_time": 13.978127775000758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6733475401997566, "epoch": 0.00378, "grad_norm": 0.053044650703668594, "kl": 0.25886671617627144, "learning_rate": 7.999952025455275e-06, "loss": -0.0831, "step": 756, "step_time": 6.453660102997674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.791813187301159, "epoch": 0.003785, "frac_reward_zero_std": 0.0, "grad_norm": 0.42822861671447754, "kl": 0.44734640046954155, "learning_rate": 7.999951892100464e-06, "loss": -0.071, "num_tokens": 9908671.0, "reward": 0.3078415095806122, "reward_std": 1.3198648691177368, "rewards/rollout_reward_func/mean": 0.3078415095806122, "rewards/rollout_reward_func/std": 1.3198649883270264, "sampling/importance_sampling_ratio/max": 1.3225003480911255, "sampling/importance_sampling_ratio/mean": 0.6559262871742249, "sampling/importance_sampling_ratio/min": 1.1211473065486643e-05, "sampling/sampling_logp_difference/max": 1.7650985717773438, "sampling/sampling_logp_difference/mean": 0.3135019540786743, "step": 757, "step_time": 12.834621502974187 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.7961555272340775, "epoch": 0.00379, "grad_norm": 0.23822848498821259, "kl": 0.4506722167134285, "learning_rate": 7.999951758560567e-06, "loss": -0.072, "step": 758, "step_time": 6.364351304000593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 5.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.407891184091568, "epoch": 0.003795, "frac_reward_zero_std": 0.0, "grad_norm": 0.10175769776105881, "kl": 0.19122106209397316, "learning_rate": 7.999951624835587e-06, "loss": -0.0784, "num_tokens": 9935043.0, "reward": 0.6875476837158203, "reward_std": 1.4470492601394653, "rewards/rollout_reward_func/mean": 0.6875476837158203, "rewards/rollout_reward_func/std": 1.4470492601394653, "sampling/importance_sampling_ratio/max": 1.083859920501709, "sampling/importance_sampling_ratio/mean": 0.6694385409355164, "sampling/importance_sampling_ratio/min": 0.0003706466231960803, "sampling/sampling_logp_difference/max": 1.2277116775512695, "sampling/sampling_logp_difference/mean": 0.2523867189884186, "step": 759, "step_time": 11.376554332004162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4108516871929169, "epoch": 0.0038, "grad_norm": 0.10804326832294464, "kl": 0.1909339539706707, "learning_rate": 7.999951490925523e-06, "loss": -0.0787, "step": 760, "step_time": 5.710574181997799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.222696952521801, "epoch": 0.003805, "frac_reward_zero_std": 0.0, "grad_norm": 0.09476017951965332, "kl": 0.18105251993983984, "learning_rate": 7.999951356830373e-06, "loss": -0.0472, "num_tokens": 9965452.0, "reward": -0.1624600887298584, "reward_std": 0.931138277053833, "rewards/rollout_reward_func/mean": -0.1624600887298584, "rewards/rollout_reward_func/std": 0.9311383962631226, "sampling/importance_sampling_ratio/max": 1.087558627128601, "sampling/importance_sampling_ratio/mean": 0.7665117979049683, "sampling/importance_sampling_ratio/min": 0.0007815637509338558, "sampling/sampling_logp_difference/max": 1.2054293155670166, "sampling/sampling_logp_difference/mean": 0.20574736595153809, "step": 761, "step_time": 12.96273303599446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.229267567396164, "epoch": 0.00381, "grad_norm": 0.0949527695775032, "kl": 0.18119446747004986, "learning_rate": 7.999951222550139e-06, "loss": -0.0479, "step": 762, "step_time": 6.398220663002576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5261903703212738, "epoch": 0.003815, "frac_reward_zero_std": 0.0, "grad_norm": 0.11325422674417496, "kl": 0.1701567992568016, "learning_rate": 7.999951088084822e-06, "loss": -0.029, "num_tokens": 9993469.0, "reward": 0.581977367401123, "reward_std": 1.514918565750122, "rewards/rollout_reward_func/mean": 0.581977367401123, "rewards/rollout_reward_func/std": 1.5149184465408325, "sampling/importance_sampling_ratio/max": 1.0831048488616943, "sampling/importance_sampling_ratio/mean": 0.6637330651283264, "sampling/importance_sampling_ratio/min": 0.0004281656292732805, "sampling/sampling_logp_difference/max": 1.3488187789916992, "sampling/sampling_logp_difference/mean": 0.19208721816539764, "step": 763, "step_time": 16.71761110999796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.53103706240654, "epoch": 0.00382, "grad_norm": 0.12228398770093918, "kl": 0.16888852044939995, "learning_rate": 7.999950953434419e-06, "loss": -0.0289, "step": 764, "step_time": 7.567246754988446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 5.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7689520716667175, "epoch": 0.003825, "frac_reward_zero_std": 0.0, "grad_norm": 0.060748714953660965, "kl": 0.1577670555561781, "learning_rate": 7.999950818598932e-06, "loss": -0.0954, "num_tokens": 10018872.0, "reward": 0.6423620581626892, "reward_std": 1.3718135356903076, "rewards/rollout_reward_func/mean": 0.6423620581626892, "rewards/rollout_reward_func/std": 1.3718135356903076, "sampling/importance_sampling_ratio/max": 1.1047903299331665, "sampling/importance_sampling_ratio/mean": 0.5803654193878174, "sampling/importance_sampling_ratio/min": 0.0005977092077955604, "sampling/sampling_logp_difference/max": 1.4257856607437134, "sampling/sampling_logp_difference/mean": 0.26528429985046387, "step": 765, "step_time": 13.807651857016026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7652133703231812, "epoch": 0.00383, "grad_norm": 0.057922981679439545, "kl": 0.15862606652081013, "learning_rate": 7.99995068357836e-06, "loss": -0.0954, "step": 766, "step_time": 6.382629980987986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7767134681344032, "epoch": 0.003835, "frac_reward_zero_std": 0.0, "grad_norm": 0.16252964735031128, "kl": 0.22314058616757393, "learning_rate": 7.999950548372704e-06, "loss": -0.0824, "num_tokens": 10039852.0, "reward": 0.5485672354698181, "reward_std": 1.3290321826934814, "rewards/rollout_reward_func/mean": 0.5485672354698181, "rewards/rollout_reward_func/std": 1.3290321826934814, "sampling/importance_sampling_ratio/max": 1.2799867391586304, "sampling/importance_sampling_ratio/mean": 0.7122273445129395, "sampling/importance_sampling_ratio/min": 3.5666207622853108e-06, "sampling/sampling_logp_difference/max": 1.8252360820770264, "sampling/sampling_logp_difference/mean": 0.34073519706726074, "step": 767, "step_time": 10.632837440018193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7762400954961777, "epoch": 0.00384, "grad_norm": 0.1589597761631012, "kl": 0.22147198393940926, "learning_rate": 7.999950412981963e-06, "loss": -0.0829, "step": 768, "step_time": 4.953826940996805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7350924015045166, "epoch": 0.003845, "frac_reward_zero_std": 0.0, "grad_norm": 0.09533069282770157, "kl": 0.1428026258945465, "learning_rate": 7.99995027740614e-06, "loss": -0.0783, "num_tokens": 10064914.0, "reward": 0.7830778360366821, "reward_std": 1.4039336442947388, "rewards/rollout_reward_func/mean": 0.7830778360366821, "rewards/rollout_reward_func/std": 1.4039336442947388, "sampling/importance_sampling_ratio/max": 1.0674573183059692, "sampling/importance_sampling_ratio/mean": 0.6603968739509583, "sampling/importance_sampling_ratio/min": 1.0206380807176174e-07, "sampling/sampling_logp_difference/max": 2.2526638507843018, "sampling/sampling_logp_difference/mean": 0.3342423141002655, "step": 769, "step_time": 13.303249005024554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.73126021027565, "epoch": 0.00385, "grad_norm": 0.08988653868436813, "kl": 0.1434617806226015, "learning_rate": 7.99995014164523e-06, "loss": -0.0786, "step": 770, "step_time": 6.623757626992301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3028710633516312, "epoch": 0.003855, "frac_reward_zero_std": 0.0, "grad_norm": 0.07308857887983322, "kl": 0.3442955520004034, "learning_rate": 7.999950005699237e-06, "loss": -0.0626, "num_tokens": 10095658.0, "reward": 0.48344117403030396, "reward_std": 1.2253443002700806, "rewards/rollout_reward_func/mean": 0.48344117403030396, "rewards/rollout_reward_func/std": 1.2253443002700806, "sampling/importance_sampling_ratio/max": 1.1922379732131958, "sampling/importance_sampling_ratio/mean": 0.6959900856018066, "sampling/importance_sampling_ratio/min": 0.0005495495861396194, "sampling/sampling_logp_difference/max": 1.362788438796997, "sampling/sampling_logp_difference/mean": 0.236789733171463, "step": 771, "step_time": 13.2211000330135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2994334250688553, "epoch": 0.00386, "grad_norm": 0.07255715876817703, "kl": 0.34317421540617943, "learning_rate": 7.999949869568159e-06, "loss": -0.0627, "step": 772, "step_time": 6.18419998200261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 7.090909481048584, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8434318900108337, "epoch": 0.003865, "frac_reward_zero_std": 0.0, "grad_norm": 0.04264972731471062, "kl": 0.1178209250792861, "learning_rate": 7.999949733251996e-06, "loss": -0.1161, "num_tokens": 10130664.0, "reward": 0.655791163444519, "reward_std": 1.302643895149231, "rewards/rollout_reward_func/mean": 0.655791163444519, "rewards/rollout_reward_func/std": 1.302643895149231, "sampling/importance_sampling_ratio/max": 1.0978360176086426, "sampling/importance_sampling_ratio/mean": 0.5347107648849487, "sampling/importance_sampling_ratio/min": 0.00012426009925547987, "sampling/sampling_logp_difference/max": 1.5589747428894043, "sampling/sampling_logp_difference/mean": 0.31512027978897095, "step": 773, "step_time": 17.38634605698462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8433603942394257, "epoch": 0.00387, "grad_norm": 0.03993485867977142, "kl": 0.1191735714673996, "learning_rate": 7.99994959675075e-06, "loss": -0.1162, "step": 774, "step_time": 7.874426689988468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.345235377550125, "epoch": 0.003875, "frac_reward_zero_std": 0.0, "grad_norm": 0.15642628073692322, "kl": 0.1969826016575098, "learning_rate": 7.999949460064417e-06, "loss": -0.1144, "num_tokens": 10159140.0, "reward": 0.34580397605895996, "reward_std": 1.384642481803894, "rewards/rollout_reward_func/mean": 0.34580397605895996, "rewards/rollout_reward_func/std": 1.3846426010131836, "sampling/importance_sampling_ratio/max": 1.095271348953247, "sampling/importance_sampling_ratio/mean": 0.45000219345092773, "sampling/importance_sampling_ratio/min": 2.835779184806597e-07, "sampling/sampling_logp_difference/max": 2.2906494140625, "sampling/sampling_logp_difference/mean": 0.47056132555007935, "step": 775, "step_time": 13.75429143900692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.336964786052704, "epoch": 0.00388, "grad_norm": 0.11990028619766235, "kl": 0.19448307156562805, "learning_rate": 7.999949323193002e-06, "loss": -0.1153, "step": 776, "step_time": 5.949021216001711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4524281322956085, "epoch": 0.003885, "frac_reward_zero_std": 0.0, "grad_norm": 0.16550695896148682, "kl": 0.10599629208445549, "learning_rate": 7.999949186136502e-06, "loss": -0.0829, "num_tokens": 10185162.0, "reward": 0.20655770599842072, "reward_std": 1.2196818590164185, "rewards/rollout_reward_func/mean": 0.20655770599842072, "rewards/rollout_reward_func/std": 1.219681978225708, "sampling/importance_sampling_ratio/max": 1.1191664934158325, "sampling/importance_sampling_ratio/mean": 0.45617926120758057, "sampling/importance_sampling_ratio/min": 0.0001033371954690665, "sampling/sampling_logp_difference/max": 1.7332544326782227, "sampling/sampling_logp_difference/mean": 0.3834424912929535, "step": 777, "step_time": 11.93338984400907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4372584521770477, "epoch": 0.00389, "grad_norm": 0.15467476844787598, "kl": 0.10635008662939072, "learning_rate": 7.999949048894918e-06, "loss": -0.0838, "step": 778, "step_time": 5.220441110010142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5468730125576258, "epoch": 0.003895, "frac_reward_zero_std": 0.0, "grad_norm": 0.035357363522052765, "kl": 0.2051002699881792, "learning_rate": 7.99994891146825e-06, "loss": -0.1053, "num_tokens": 10215760.0, "reward": 0.7751669883728027, "reward_std": 1.2327605485916138, "rewards/rollout_reward_func/mean": 0.7751669883728027, "rewards/rollout_reward_func/std": 1.2327606678009033, "sampling/importance_sampling_ratio/max": 1.1136366128921509, "sampling/importance_sampling_ratio/mean": 0.6556033492088318, "sampling/importance_sampling_ratio/min": 7.826522778486833e-05, "sampling/sampling_logp_difference/max": 1.396798849105835, "sampling/sampling_logp_difference/mean": 0.22830212116241455, "step": 779, "step_time": 14.30760616598127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.54120665602386, "epoch": 0.0039, "grad_norm": 0.02519712597131729, "kl": 0.20637276768684387, "learning_rate": 7.999948773856498e-06, "loss": -0.1055, "step": 780, "step_time": 6.359499323982163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9141813293099403, "epoch": 0.003905, "frac_reward_zero_std": 0.0, "grad_norm": 0.04191231355071068, "kl": 0.2306758537888527, "learning_rate": 7.99994863605966e-06, "loss": -0.0897, "num_tokens": 10239657.0, "reward": 1.4399141073226929, "reward_std": 1.0931057929992676, "rewards/rollout_reward_func/mean": 1.4399141073226929, "rewards/rollout_reward_func/std": 1.0931057929992676, "sampling/importance_sampling_ratio/max": 1.1375489234924316, "sampling/importance_sampling_ratio/mean": 0.8496896028518677, "sampling/importance_sampling_ratio/min": 0.005196988116949797, "sampling/sampling_logp_difference/max": 1.0332221984863281, "sampling/sampling_logp_difference/mean": 0.15574054419994354, "step": 781, "step_time": 12.403630300992518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9107755925506353, "epoch": 0.00391, "grad_norm": 0.03383581340312958, "kl": 0.23173260316252708, "learning_rate": 7.999948498077739e-06, "loss": -0.0899, "step": 782, "step_time": 5.5837528919946635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 6.642857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7402483820915222, "epoch": 0.003915, "frac_reward_zero_std": 0.0, "grad_norm": 0.25028422474861145, "kl": 1.4393988102674484, "learning_rate": 7.999948359910732e-06, "loss": -0.0607, "num_tokens": 10259422.0, "reward": -0.7459529638290405, "reward_std": 0.510871410369873, "rewards/rollout_reward_func/mean": -0.7459529638290405, "rewards/rollout_reward_func/std": 0.510871410369873, "sampling/importance_sampling_ratio/max": 1.1407362222671509, "sampling/importance_sampling_ratio/mean": 0.6214675903320312, "sampling/importance_sampling_ratio/min": 9.799793770071119e-05, "sampling/sampling_logp_difference/max": 1.9750442504882812, "sampling/sampling_logp_difference/mean": 0.36374229192733765, "step": 783, "step_time": 9.970475109992549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7332478947937489, "epoch": 0.00392, "grad_norm": 0.19120046496391296, "kl": 1.1787555254995823, "learning_rate": 7.999948221558643e-06, "loss": -0.0619, "step": 784, "step_time": 4.411742200987646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0053733326494694, "epoch": 0.003925, "frac_reward_zero_std": 0.0, "grad_norm": 0.008006267249584198, "kl": 0.266781959682703, "learning_rate": 7.999948083021468e-06, "loss": -0.0633, "num_tokens": 10291012.0, "reward": 0.676851212978363, "reward_std": 1.2568273544311523, "rewards/rollout_reward_func/mean": 0.676851212978363, "rewards/rollout_reward_func/std": 1.2568273544311523, "sampling/importance_sampling_ratio/max": 1.2135766744613647, "sampling/importance_sampling_ratio/mean": 0.8513239622116089, "sampling/importance_sampling_ratio/min": 0.000954003247898072, "sampling/sampling_logp_difference/max": 1.165948510169983, "sampling/sampling_logp_difference/mean": 0.18930146098136902, "step": 785, "step_time": 13.649204645014834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0112903099507093, "epoch": 0.00393, "grad_norm": 0.007569476496428251, "kl": 0.2669694311916828, "learning_rate": 7.99994794429921e-06, "loss": -0.0633, "step": 786, "step_time": 6.8609875770052895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2656361442059278, "epoch": 0.003935, "frac_reward_zero_std": 0.0, "grad_norm": 0.10388432443141937, "kl": 0.34090616926550865, "learning_rate": 7.999947805391867e-06, "loss": -0.049, "num_tokens": 10313850.0, "reward": 0.3767801523208618, "reward_std": 1.265277624130249, "rewards/rollout_reward_func/mean": 0.3767801523208618, "rewards/rollout_reward_func/std": 1.265277624130249, "sampling/importance_sampling_ratio/max": 1.2253094911575317, "sampling/importance_sampling_ratio/mean": 0.7983208298683167, "sampling/importance_sampling_ratio/min": 1.1921139957848936e-05, "sampling/sampling_logp_difference/max": 1.6261080503463745, "sampling/sampling_logp_difference/mean": 0.1996774822473526, "step": 787, "step_time": 10.043367405000026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2567575564607978, "epoch": 0.00394, "grad_norm": 0.08720025420188904, "kl": 0.3519143909215927, "learning_rate": 7.999947666299442e-06, "loss": -0.0493, "step": 788, "step_time": 4.955549106001854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7001148350536823, "epoch": 0.003945, "frac_reward_zero_std": 0.0, "grad_norm": 0.13411930203437805, "kl": 0.2551637254655361, "learning_rate": 7.99994752702193e-06, "loss": -0.0442, "num_tokens": 10341866.0, "reward": 1.3639817237854004, "reward_std": 1.068237543106079, "rewards/rollout_reward_func/mean": 1.3639817237854004, "rewards/rollout_reward_func/std": 1.0682376623153687, "sampling/importance_sampling_ratio/max": 1.2124547958374023, "sampling/importance_sampling_ratio/mean": 0.9214187860488892, "sampling/importance_sampling_ratio/min": 0.00016006140504032373, "sampling/sampling_logp_difference/max": 1.4343888759613037, "sampling/sampling_logp_difference/mean": 0.12715664505958557, "step": 789, "step_time": 10.82213116400817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7028623651713133, "epoch": 0.00395, "grad_norm": 0.14297552406787872, "kl": 0.2548943944275379, "learning_rate": 7.999947387559333e-06, "loss": -0.0444, "step": 790, "step_time": 5.148698584001977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.933333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.171097768470645, "epoch": 0.003955, "frac_reward_zero_std": 0.0, "grad_norm": 0.07079923897981644, "kl": 0.4849514178931713, "learning_rate": 7.999947247911654e-06, "loss": -0.0732, "num_tokens": 10362715.0, "reward": 0.42419302463531494, "reward_std": 1.0788127183914185, "rewards/rollout_reward_func/mean": 0.42419302463531494, "rewards/rollout_reward_func/std": 1.0788127183914185, "sampling/importance_sampling_ratio/max": 1.0945862531661987, "sampling/importance_sampling_ratio/mean": 0.8243577480316162, "sampling/importance_sampling_ratio/min": 0.00011767000250983983, "sampling/sampling_logp_difference/max": 2.059704065322876, "sampling/sampling_logp_difference/mean": 0.2579263746738434, "step": 791, "step_time": 9.065253797991318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1715492671355605, "epoch": 0.00396, "grad_norm": 0.0653783306479454, "kl": 0.4391101151704788, "learning_rate": 7.99994710807889e-06, "loss": -0.0735, "step": 792, "step_time": 4.354253672005143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3207686692476273, "epoch": 0.003965, "frac_reward_zero_std": 0.0, "grad_norm": 0.218119814991951, "kl": 1.1358569264411926, "learning_rate": 7.999946968061042e-06, "loss": -0.0766, "num_tokens": 10383510.0, "reward": 0.8900867700576782, "reward_std": 1.3282641172409058, "rewards/rollout_reward_func/mean": 0.8900867700576782, "rewards/rollout_reward_func/std": 1.3282641172409058, "sampling/importance_sampling_ratio/max": 1.6365448236465454, "sampling/importance_sampling_ratio/mean": 0.7879002094268799, "sampling/importance_sampling_ratio/min": 1.5893813554157532e-07, "sampling/sampling_logp_difference/max": 2.752352714538574, "sampling/sampling_logp_difference/mean": 0.2658015489578247, "step": 793, "step_time": 10.279095242003677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3326700329780579, "epoch": 0.00397, "grad_norm": 0.21069703996181488, "kl": 1.0904097184538841, "learning_rate": 7.999946827858109e-06, "loss": -0.0771, "step": 794, "step_time": 4.893072981984005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2584636881947517, "epoch": 0.003975, "frac_reward_zero_std": 0.0, "grad_norm": 0.19876302778720856, "kl": 0.29571155086159706, "learning_rate": 7.999946687470092e-06, "loss": -0.035, "num_tokens": 10409726.0, "reward": -0.11805307865142822, "reward_std": 1.12894868850708, "rewards/rollout_reward_func/mean": -0.11805307865142822, "rewards/rollout_reward_func/std": 1.1289488077163696, "sampling/importance_sampling_ratio/max": 1.3717951774597168, "sampling/importance_sampling_ratio/mean": 0.9113879799842834, "sampling/importance_sampling_ratio/min": 4.580269887810573e-06, "sampling/sampling_logp_difference/max": 2.0214295387268066, "sampling/sampling_logp_difference/mean": 0.2692537307739258, "step": 795, "step_time": 10.54180017298495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2677871882915497, "epoch": 0.00398, "grad_norm": 0.18831758201122284, "kl": 0.29077286273241043, "learning_rate": 7.999946546896992e-06, "loss": -0.035, "step": 796, "step_time": 5.281840477022342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 5.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9548377692699432, "epoch": 0.003985, "frac_reward_zero_std": 0.0, "grad_norm": 0.09578566998243332, "kl": 0.44401534274220467, "learning_rate": 7.999946406138807e-06, "loss": -0.0952, "num_tokens": 10435218.0, "reward": 0.8199118971824646, "reward_std": 1.0427751541137695, "rewards/rollout_reward_func/mean": 0.8199118971824646, "rewards/rollout_reward_func/std": 1.0427751541137695, "sampling/importance_sampling_ratio/max": 1.147602915763855, "sampling/importance_sampling_ratio/mean": 0.6795491576194763, "sampling/importance_sampling_ratio/min": 1.8286192116079292e-08, "sampling/sampling_logp_difference/max": 2.140791893005371, "sampling/sampling_logp_difference/mean": 0.3714497685432434, "step": 797, "step_time": 13.843372738003382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9590119123458862, "epoch": 0.00399, "grad_norm": 0.10089938342571259, "kl": 0.4430372826755047, "learning_rate": 7.999946265195538e-06, "loss": -0.0951, "step": 798, "step_time": 6.241435456002364 }, { "clip_ratio/high_max": 0.01923076994717121, "clip_ratio/high_mean": 0.009615384973585606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7760833948850632, "epoch": 0.003995, "frac_reward_zero_std": 0.0, "grad_norm": 0.20118695497512817, "kl": 0.2511912137269974, "learning_rate": 7.999946124067184e-06, "loss": -0.0794, "num_tokens": 10465397.0, "reward": 0.3169923424720764, "reward_std": 1.2582435607910156, "rewards/rollout_reward_func/mean": 0.3169923424720764, "rewards/rollout_reward_func/std": 1.2582435607910156, "sampling/importance_sampling_ratio/max": 1.310233235359192, "sampling/importance_sampling_ratio/mean": 0.7067400813102722, "sampling/importance_sampling_ratio/min": 6.377115641953424e-05, "sampling/sampling_logp_difference/max": 1.6486659049987793, "sampling/sampling_logp_difference/mean": 0.3046732544898987, "step": 799, "step_time": 15.154369082985795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7718331068754196, "epoch": 0.004, "grad_norm": 0.17624041438102722, "kl": 0.2555958330631256, "learning_rate": 7.999945982753747e-06, "loss": -0.0803, "step": 800, "step_time": 7.672128796984907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7465638536959887, "epoch": 0.004005, "frac_reward_zero_std": 0.0, "grad_norm": 0.03235943242907524, "kl": 0.18337423726916313, "learning_rate": 7.999945841255226e-06, "loss": -0.0807, "num_tokens": 10496028.0, "reward": 0.3791864514350891, "reward_std": 1.3045703172683716, "rewards/rollout_reward_func/mean": 0.3791864514350891, "rewards/rollout_reward_func/std": 1.3045703172683716, "sampling/importance_sampling_ratio/max": 1.3538540601730347, "sampling/importance_sampling_ratio/mean": 0.6802151203155518, "sampling/importance_sampling_ratio/min": 1.0677626050892286e-05, "sampling/sampling_logp_difference/max": 1.6729178428649902, "sampling/sampling_logp_difference/mean": 0.36187949776649475, "step": 801, "step_time": 15.121021523998934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7526963707059622, "epoch": 0.00401, "grad_norm": 0.032330404967069626, "kl": 0.18318462744355202, "learning_rate": 7.99994569957162e-06, "loss": -0.0808, "step": 802, "step_time": 6.882742801986751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.514447145164013, "epoch": 0.004015, "frac_reward_zero_std": 0.0, "grad_norm": 0.04892304539680481, "kl": 0.2921570222824812, "learning_rate": 7.999945557702931e-06, "loss": -0.1112, "num_tokens": 10519368.0, "reward": 0.3095848560333252, "reward_std": 1.1528383493423462, "rewards/rollout_reward_func/mean": 0.3095848560333252, "rewards/rollout_reward_func/std": 1.1528383493423462, "sampling/importance_sampling_ratio/max": 1.187055230140686, "sampling/importance_sampling_ratio/mean": 0.7657525539398193, "sampling/importance_sampling_ratio/min": 1.0578660294413567e-05, "sampling/sampling_logp_difference/max": 2.165083885192871, "sampling/sampling_logp_difference/mean": 0.29222220182418823, "step": 803, "step_time": 10.971469850002904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5135762579739094, "epoch": 0.00402, "grad_norm": 0.04919622093439102, "kl": 0.2806267961859703, "learning_rate": 7.999945415649156e-06, "loss": -0.1112, "step": 804, "step_time": 5.231323814994539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0120148956775665, "epoch": 0.004025, "frac_reward_zero_std": 0.0, "grad_norm": 0.239168182015419, "kl": 0.2570476271212101, "learning_rate": 7.999945273410298e-06, "loss": -0.0796, "num_tokens": 10551509.0, "reward": 0.7903881072998047, "reward_std": 1.2942399978637695, "rewards/rollout_reward_func/mean": 0.7903881072998047, "rewards/rollout_reward_func/std": 1.2942399978637695, "sampling/importance_sampling_ratio/max": 1.1603055000305176, "sampling/importance_sampling_ratio/mean": 0.7026547193527222, "sampling/importance_sampling_ratio/min": 5.744033160226536e-07, "sampling/sampling_logp_difference/max": 1.706427812576294, "sampling/sampling_logp_difference/mean": 0.3663366436958313, "step": 805, "step_time": 16.433994209000957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0204259157180786, "epoch": 0.00403, "grad_norm": 0.2550611197948456, "kl": 0.25561030581593513, "learning_rate": 7.999945130986357e-06, "loss": -0.0803, "step": 806, "step_time": 8.208526788992458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7775006052106619, "epoch": 0.004035, "frac_reward_zero_std": 0.5, "grad_norm": 0.1685222089290619, "kl": 0.27888917177915573, "learning_rate": 7.999944988377332e-06, "loss": -0.0302, "num_tokens": 10574660.0, "reward": 0.9308534860610962, "reward_std": 1.3899630308151245, "rewards/rollout_reward_func/mean": 0.9308534860610962, "rewards/rollout_reward_func/std": 1.389963150024414, "sampling/importance_sampling_ratio/max": 1.0968338251113892, "sampling/importance_sampling_ratio/mean": 0.8679816722869873, "sampling/importance_sampling_ratio/min": 0.0008083858992904425, "sampling/sampling_logp_difference/max": 1.2524577379226685, "sampling/sampling_logp_difference/mean": 0.13882991671562195, "step": 807, "step_time": 12.29959618501016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.7787439655512571, "epoch": 0.00404, "grad_norm": 0.06575574725866318, "kl": 0.2915450967848301, "learning_rate": 7.99994484558322e-06, "loss": -0.0308, "step": 808, "step_time": 6.257792104996042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 5.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.582720760256052, "epoch": 0.004045, "frac_reward_zero_std": 0.0, "grad_norm": 0.0890568271279335, "kl": 0.23583549074828625, "learning_rate": 7.999944702604026e-06, "loss": -0.0866, "num_tokens": 10594619.0, "reward": 1.4378912448883057, "reward_std": 1.068808674812317, "rewards/rollout_reward_func/mean": 1.4378912448883057, "rewards/rollout_reward_func/std": 1.068808674812317, "sampling/importance_sampling_ratio/max": 1.1977405548095703, "sampling/importance_sampling_ratio/mean": 0.808336615562439, "sampling/importance_sampling_ratio/min": 1.1436541171860881e-06, "sampling/sampling_logp_difference/max": 2.0267982482910156, "sampling/sampling_logp_difference/mean": 0.2780812382698059, "step": 809, "step_time": 10.047575819000485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5886869318783283, "epoch": 0.00405, "grad_norm": 0.09717441350221634, "kl": 0.2396579012274742, "learning_rate": 7.999944559439747e-06, "loss": -0.0863, "step": 810, "step_time": 4.895345675002318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 6.818181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.682672083377838, "epoch": 0.004055, "frac_reward_zero_std": 0.0, "grad_norm": 0.062065090984106064, "kl": 0.10122201219201088, "learning_rate": 7.999944416090385e-06, "loss": -0.0954, "num_tokens": 10625830.0, "reward": -0.007086798548698425, "reward_std": 1.1244661808013916, "rewards/rollout_reward_func/mean": -0.007086798548698425, "rewards/rollout_reward_func/std": 1.1244661808013916, "sampling/importance_sampling_ratio/max": 1.273112177848816, "sampling/importance_sampling_ratio/mean": 0.4927072525024414, "sampling/importance_sampling_ratio/min": 5.526043241843581e-05, "sampling/sampling_logp_difference/max": 1.5726500749588013, "sampling/sampling_logp_difference/mean": 0.38900986313819885, "step": 811, "step_time": 15.351208597028744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.687183439731598, "epoch": 0.00406, "grad_norm": 0.06330984830856323, "kl": 0.10243288427591324, "learning_rate": 7.999944272555939e-06, "loss": -0.0954, "step": 812, "step_time": 7.115361320000375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8700753822922707, "epoch": 0.004065, "frac_reward_zero_std": 0.0, "grad_norm": 0.1248372346162796, "kl": 0.21790606155991554, "learning_rate": 7.999944128836408e-06, "loss": -0.061, "num_tokens": 10650205.0, "reward": 1.1138933897018433, "reward_std": 1.2115466594696045, "rewards/rollout_reward_func/mean": 1.1138933897018433, "rewards/rollout_reward_func/std": 1.2115466594696045, "sampling/importance_sampling_ratio/max": 1.2617828845977783, "sampling/importance_sampling_ratio/mean": 0.9589241743087769, "sampling/importance_sampling_ratio/min": 0.0008691083639860153, "sampling/sampling_logp_difference/max": 1.0903434753417969, "sampling/sampling_logp_difference/mean": 0.17384719848632812, "step": 813, "step_time": 13.086072446996695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8718622624874115, "epoch": 0.00407, "grad_norm": 0.13430888950824738, "kl": 0.21756162121891975, "learning_rate": 7.999943984931794e-06, "loss": -0.0615, "step": 814, "step_time": 6.436355885001831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.375, "completions/mean_terminated_length": 7.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2937994599342346, "epoch": 0.004075, "frac_reward_zero_std": 0.0, "grad_norm": 0.15012270212173462, "kl": 1.4267187304794788, "learning_rate": 7.999943840842096e-06, "loss": -0.0658, "num_tokens": 10672942.0, "reward": 0.8003044724464417, "reward_std": 1.3669344186782837, "rewards/rollout_reward_func/mean": 0.8003044724464417, "rewards/rollout_reward_func/std": 1.3669345378875732, "sampling/importance_sampling_ratio/max": 1.1372681856155396, "sampling/importance_sampling_ratio/mean": 0.4106827974319458, "sampling/importance_sampling_ratio/min": 8.77024376677582e-06, "sampling/sampling_logp_difference/max": 1.8111064434051514, "sampling/sampling_logp_difference/mean": 0.4002304673194885, "step": 815, "step_time": 12.42526442099188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2946112751960754, "epoch": 0.00408, "grad_norm": 0.12437368929386139, "kl": 1.2621567100286484, "learning_rate": 7.999943696567313e-06, "loss": -0.0663, "step": 816, "step_time": 5.236665272008395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6678968518972397, "epoch": 0.004085, "frac_reward_zero_std": 0.0, "grad_norm": 0.047437574714422226, "kl": 0.19917624071240425, "learning_rate": 7.999943552107446e-06, "loss": -0.0897, "num_tokens": 10702853.0, "reward": 0.6816588640213013, "reward_std": 1.1672728061676025, "rewards/rollout_reward_func/mean": 0.6816588640213013, "rewards/rollout_reward_func/std": 1.167272686958313, "sampling/importance_sampling_ratio/max": 1.1859722137451172, "sampling/importance_sampling_ratio/mean": 0.6810490489006042, "sampling/importance_sampling_ratio/min": 0.00027309200959280133, "sampling/sampling_logp_difference/max": 1.3303356170654297, "sampling/sampling_logp_difference/mean": 0.26798316836357117, "step": 817, "step_time": 12.800386022994644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6739291418343782, "epoch": 0.00409, "grad_norm": 0.049533337354660034, "kl": 0.1935303844511509, "learning_rate": 7.999943407462496e-06, "loss": -0.0899, "step": 818, "step_time": 5.923611403995892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 5.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.625838726758957, "epoch": 0.004095, "frac_reward_zero_std": 0.0, "grad_norm": 0.052006397396326065, "kl": 0.4664272330701351, "learning_rate": 7.99994326263246e-06, "loss": -0.0715, "num_tokens": 10725405.0, "reward": 1.2819490432739258, "reward_std": 1.1933164596557617, "rewards/rollout_reward_func/mean": 1.2819490432739258, "rewards/rollout_reward_func/std": 1.1933164596557617, "sampling/importance_sampling_ratio/max": 1.1229017972946167, "sampling/importance_sampling_ratio/mean": 0.7361521124839783, "sampling/importance_sampling_ratio/min": 5.660386364070291e-07, "sampling/sampling_logp_difference/max": 1.8502599000930786, "sampling/sampling_logp_difference/mean": 0.27584075927734375, "step": 819, "step_time": 11.323312608001288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6231374442577362, "epoch": 0.0041, "grad_norm": 0.052733130753040314, "kl": 0.467009786516428, "learning_rate": 7.999943117617342e-06, "loss": -0.0714, "step": 820, "step_time": 5.283480111014796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8864512145519257, "epoch": 0.004105, "frac_reward_zero_std": 0.0, "grad_norm": 0.1532861888408661, "kl": 0.2020549215376377, "learning_rate": 7.99994297241714e-06, "loss": -0.0769, "num_tokens": 10752124.0, "reward": 1.112114429473877, "reward_std": 0.8128613829612732, "rewards/rollout_reward_func/mean": 1.112114429473877, "rewards/rollout_reward_func/std": 0.812861442565918, "sampling/importance_sampling_ratio/max": 1.2765742540359497, "sampling/importance_sampling_ratio/mean": 0.6488606333732605, "sampling/importance_sampling_ratio/min": 5.097713255963754e-07, "sampling/sampling_logp_difference/max": 2.3843963146209717, "sampling/sampling_logp_difference/mean": 0.3768719434738159, "step": 821, "step_time": 12.312355775997275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.877402424812317, "epoch": 0.00411, "grad_norm": 0.14707370102405548, "kl": 0.20502158254384995, "learning_rate": 7.999942827031854e-06, "loss": -0.0775, "step": 822, "step_time": 6.009036557996296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4561266899108887, "epoch": 0.004115, "frac_reward_zero_std": 0.5, "grad_norm": 0.9129485487937927, "kl": 5.768064685165882, "learning_rate": 7.999942681461483e-06, "loss": -0.0259, "num_tokens": 10774601.0, "reward": 1.4036455154418945, "reward_std": 1.1803408861160278, "rewards/rollout_reward_func/mean": 1.4036455154418945, "rewards/rollout_reward_func/std": 1.1803408861160278, "sampling/importance_sampling_ratio/max": 1.1084656715393066, "sampling/importance_sampling_ratio/mean": 0.7923044562339783, "sampling/importance_sampling_ratio/min": 0.00021728838328272104, "sampling/sampling_logp_difference/max": 1.7457895278930664, "sampling/sampling_logp_difference/mean": 0.20692461729049683, "step": 823, "step_time": 11.307068284993875 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.4627657234668732, "epoch": 0.00412, "grad_norm": 0.434101939201355, "kl": 2.8058885373175144, "learning_rate": 7.999942535706029e-06, "loss": -0.037, "step": 824, "step_time": 4.833551446979982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2119636721909046, "epoch": 0.004125, "frac_reward_zero_std": 0.0, "grad_norm": 0.019149307161569595, "kl": 0.35296743735671043, "learning_rate": 7.999942389765491e-06, "loss": -0.077, "num_tokens": 10797843.0, "reward": 1.5750081539154053, "reward_std": 0.9531961679458618, "rewards/rollout_reward_func/mean": 1.5750081539154053, "rewards/rollout_reward_func/std": 0.9531962275505066, "sampling/importance_sampling_ratio/max": 1.1260498762130737, "sampling/importance_sampling_ratio/mean": 0.8545821309089661, "sampling/importance_sampling_ratio/min": 6.356319499900565e-05, "sampling/sampling_logp_difference/max": 1.6701055765151978, "sampling/sampling_logp_difference/mean": 0.26976633071899414, "step": 825, "step_time": 10.31463391998841 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.2229433413594961, "epoch": 0.00413, "grad_norm": 0.018949158489704132, "kl": 0.31785648688673973, "learning_rate": 7.99994224363987e-06, "loss": -0.0769, "step": 826, "step_time": 4.984461044005002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.3125, "completions/mean_terminated_length": 6.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1653512716293335, "epoch": 0.004135, "frac_reward_zero_std": 0.0, "grad_norm": 0.034644681960344315, "kl": 0.11201510578393936, "learning_rate": 7.999942097329164e-06, "loss": -0.0864, "num_tokens": 10824211.0, "reward": 0.4897882044315338, "reward_std": 1.2696839570999146, "rewards/rollout_reward_func/mean": 0.4897882044315338, "rewards/rollout_reward_func/std": 1.269684076309204, "sampling/importance_sampling_ratio/max": 1.1495094299316406, "sampling/importance_sampling_ratio/mean": 0.3494528830051422, "sampling/importance_sampling_ratio/min": 1.1360030839568935e-05, "sampling/sampling_logp_difference/max": 1.8427373170852661, "sampling/sampling_logp_difference/mean": 0.43803682923316956, "step": 827, "step_time": 12.321530483008246 }, { "clip_ratio/high_max": 0.009999999776482582, "clip_ratio/high_mean": 0.004999999888241291, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004999999888241291, "entropy": 3.1835713386535645, "epoch": 0.00414, "grad_norm": 0.04143045097589493, "kl": 0.09987662918865681, "learning_rate": 7.999941950833372e-06, "loss": -0.0861, "step": 828, "step_time": 5.275068166985875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 4.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0601842403411865, "epoch": 0.004145, "frac_reward_zero_std": 0.0, "grad_norm": 0.12630796432495117, "kl": 0.27502594189718366, "learning_rate": 7.999941804152499e-06, "loss": -0.0649, "num_tokens": 10856138.0, "reward": -0.16419893503189087, "reward_std": 0.9878852367401123, "rewards/rollout_reward_func/mean": -0.16419893503189087, "rewards/rollout_reward_func/std": 0.9878852963447571, "sampling/importance_sampling_ratio/max": 1.0987610816955566, "sampling/importance_sampling_ratio/mean": 0.4730442762374878, "sampling/importance_sampling_ratio/min": 2.2188030925462954e-05, "sampling/sampling_logp_difference/max": 1.4549307823181152, "sampling/sampling_logp_difference/mean": 0.3587467074394226, "step": 829, "step_time": 14.154885545998695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0663221031427383, "epoch": 0.00415, "grad_norm": 0.1671353280544281, "kl": 0.26290800608694553, "learning_rate": 7.999941657286542e-06, "loss": -0.0639, "step": 830, "step_time": 6.873469002006459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 6.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.521833658218384, "epoch": 0.004155, "frac_reward_zero_std": 0.0, "grad_norm": 0.15060335397720337, "kl": 0.14882145822048187, "learning_rate": 7.9999415102355e-06, "loss": -0.1045, "num_tokens": 10885849.0, "reward": 0.721853494644165, "reward_std": 1.2414981126785278, "rewards/rollout_reward_func/mean": 0.721853494644165, "rewards/rollout_reward_func/std": 1.2414982318878174, "sampling/importance_sampling_ratio/max": 1.2329553365707397, "sampling/importance_sampling_ratio/mean": 0.504432201385498, "sampling/importance_sampling_ratio/min": 8.466278814012185e-06, "sampling/sampling_logp_difference/max": 1.785667061805725, "sampling/sampling_logp_difference/mean": 0.337566614151001, "step": 831, "step_time": 14.0387384880014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.520468294620514, "epoch": 0.00416, "grad_norm": 0.14757290482521057, "kl": 0.14698426239192486, "learning_rate": 7.999941362999375e-06, "loss": -0.1043, "step": 832, "step_time": 6.343631683004787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 4.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.131381869316101, "epoch": 0.004165, "frac_reward_zero_std": 0.0, "grad_norm": 0.33439406752586365, "kl": 0.13732497859746218, "learning_rate": 7.999941215578166e-06, "loss": -0.0913, "num_tokens": 10916881.0, "reward": 0.15906241536140442, "reward_std": 1.2351319789886475, "rewards/rollout_reward_func/mean": 0.15906241536140442, "rewards/rollout_reward_func/std": 1.235132098197937, "sampling/importance_sampling_ratio/max": 1.363123893737793, "sampling/importance_sampling_ratio/mean": 0.5921889543533325, "sampling/importance_sampling_ratio/min": 2.8122865842306055e-05, "sampling/sampling_logp_difference/max": 1.738304853439331, "sampling/sampling_logp_difference/mean": 0.3386213183403015, "step": 833, "step_time": 13.861696352993022 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.131689578294754, "epoch": 0.00417, "grad_norm": 0.05362118408083916, "kl": 0.1409064084291458, "learning_rate": 7.999941067971871e-06, "loss": -0.0938, "step": 834, "step_time": 6.133240811002906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8321914970874786, "epoch": 0.004175, "frac_reward_zero_std": 0.0, "grad_norm": 0.0667107105255127, "kl": 0.18525516986846924, "learning_rate": 7.999940920180496e-06, "loss": -0.089, "num_tokens": 10941635.0, "reward": 0.7531164288520813, "reward_std": 1.343638300895691, "rewards/rollout_reward_func/mean": 0.7531164288520813, "rewards/rollout_reward_func/std": 1.3436381816864014, "sampling/importance_sampling_ratio/max": 1.1228346824645996, "sampling/importance_sampling_ratio/mean": 0.6117309927940369, "sampling/importance_sampling_ratio/min": 3.9473834476666525e-05, "sampling/sampling_logp_difference/max": 1.6837780475616455, "sampling/sampling_logp_difference/mean": 0.33657756447792053, "step": 835, "step_time": 13.489789547995315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8258413076400757, "epoch": 0.00418, "grad_norm": 0.05641843006014824, "kl": 0.18724186718463898, "learning_rate": 7.999940772204035e-06, "loss": -0.0894, "step": 836, "step_time": 6.412389028002508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 5.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0531975626945496, "epoch": 0.004185, "frac_reward_zero_std": 0.0, "grad_norm": 0.4704631567001343, "kl": 0.35694241896271706, "learning_rate": 7.99994062404249e-06, "loss": -0.0933, "num_tokens": 10972385.0, "reward": 0.7621716260910034, "reward_std": 1.2924801111221313, "rewards/rollout_reward_func/mean": 0.7621716260910034, "rewards/rollout_reward_func/std": 1.2924801111221313, "sampling/importance_sampling_ratio/max": 1.320028305053711, "sampling/importance_sampling_ratio/mean": 0.5491994619369507, "sampling/importance_sampling_ratio/min": 1.6855117792147212e-05, "sampling/sampling_logp_difference/max": 1.7563191652297974, "sampling/sampling_logp_difference/mean": 0.36617016792297363, "step": 837, "step_time": 13.424299523016089 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.0350237786769867, "epoch": 0.00419, "grad_norm": 0.07296384871006012, "kl": 0.3737679161131382, "learning_rate": 7.999940475695862e-06, "loss": -0.0939, "step": 838, "step_time": 6.32519748598861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4991641342639923, "epoch": 0.004195, "frac_reward_zero_std": 0.0, "grad_norm": 0.28579092025756836, "kl": 0.20975103043019772, "learning_rate": 7.999940327164149e-06, "loss": 0.0107, "num_tokens": 10998710.0, "reward": -0.47224488854408264, "reward_std": 0.8134857416152954, "rewards/rollout_reward_func/mean": -0.47224488854408264, "rewards/rollout_reward_func/std": 0.8134857416152954, "sampling/importance_sampling_ratio/max": 1.1502498388290405, "sampling/importance_sampling_ratio/mean": 0.4774426817893982, "sampling/importance_sampling_ratio/min": 2.5333108624181477e-06, "sampling/sampling_logp_difference/max": 1.6344003677368164, "sampling/sampling_logp_difference/mean": 0.3766632080078125, "step": 839, "step_time": 11.884255258002668 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.4960398375988007, "epoch": 0.0042, "grad_norm": 0.272161066532135, "kl": 0.2135972287505865, "learning_rate": 7.999940178447354e-06, "loss": 0.0099, "step": 840, "step_time": 5.611155888007488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 5.5625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2336970940232277, "epoch": 0.004205, "frac_reward_zero_std": 0.5, "grad_norm": 0.04553261399269104, "kl": 0.36979222297668457, "learning_rate": 7.999940029545474e-06, "loss": 0.0225, "num_tokens": 11019238.0, "reward": 0.6965699791908264, "reward_std": 1.442270278930664, "rewards/rollout_reward_func/mean": 0.6965699791908264, "rewards/rollout_reward_func/std": 1.442270278930664, "sampling/importance_sampling_ratio/max": 1.1750295162200928, "sampling/importance_sampling_ratio/mean": 0.7958374619483948, "sampling/importance_sampling_ratio/min": 0.00012313741899561137, "sampling/sampling_logp_difference/max": 1.6806148290634155, "sampling/sampling_logp_difference/mean": 0.18185684084892273, "step": 841, "step_time": 9.101094281009864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2553936690092087, "epoch": 0.00421, "grad_norm": 0.047547198832035065, "kl": 0.3659484386444092, "learning_rate": 7.99993988045851e-06, "loss": 0.0224, "step": 842, "step_time": 5.156900621004752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 5.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7891291379928589, "epoch": 0.004215, "frac_reward_zero_std": 0.0, "grad_norm": 0.15058119595050812, "kl": 0.20882786251604557, "learning_rate": 7.999939731186463e-06, "loss": -0.111, "num_tokens": 11053285.0, "reward": 0.8782936930656433, "reward_std": 1.102596640586853, "rewards/rollout_reward_func/mean": 0.8782936930656433, "rewards/rollout_reward_func/std": 1.1025965213775635, "sampling/importance_sampling_ratio/max": 1.2145075798034668, "sampling/importance_sampling_ratio/mean": 0.6559919714927673, "sampling/importance_sampling_ratio/min": 3.4155240427935496e-05, "sampling/sampling_logp_difference/max": 1.2467927932739258, "sampling/sampling_logp_difference/mean": 0.29465538263320923, "step": 843, "step_time": 14.850643477999256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.784761682152748, "epoch": 0.00422, "grad_norm": 0.14380548894405365, "kl": 0.20931800454854965, "learning_rate": 7.999939581729332e-06, "loss": -0.1113, "step": 844, "step_time": 7.405166506985552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.513494983315468, "epoch": 0.004225, "frac_reward_zero_std": 0.0, "grad_norm": 0.10157055407762527, "kl": 0.1160867908038199, "learning_rate": 7.999939432087117e-06, "loss": -0.0853, "num_tokens": 11087516.0, "reward": -0.18512018024921417, "reward_std": 0.8871685266494751, "rewards/rollout_reward_func/mean": -0.18512018024921417, "rewards/rollout_reward_func/std": 0.8871685266494751, "sampling/importance_sampling_ratio/max": 1.2826555967330933, "sampling/importance_sampling_ratio/mean": 0.5376195311546326, "sampling/importance_sampling_ratio/min": 1.1276604094945242e-09, "sampling/sampling_logp_difference/max": 2.6590776443481445, "sampling/sampling_logp_difference/mean": 0.436644583940506, "step": 845, "step_time": 17.399912998022046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.506706789135933, "epoch": 0.00423, "grad_norm": 0.1002756729722023, "kl": 0.11745303589850664, "learning_rate": 7.999939282259817e-06, "loss": -0.0858, "step": 846, "step_time": 8.102777933992911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.375, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9775509238243103, "epoch": 0.004235, "frac_reward_zero_std": 0.0, "grad_norm": 0.14600446820259094, "kl": 0.08995197992771864, "learning_rate": 7.999939132247436e-06, "loss": -0.0755, "num_tokens": 11113043.0, "reward": 0.3640959560871124, "reward_std": 1.3600393533706665, "rewards/rollout_reward_func/mean": 0.3640959560871124, "rewards/rollout_reward_func/std": 1.3600393533706665, "sampling/importance_sampling_ratio/max": 1.2551544904708862, "sampling/importance_sampling_ratio/mean": 0.29920774698257446, "sampling/importance_sampling_ratio/min": 1.2932366189488675e-05, "sampling/sampling_logp_difference/max": 2.107877016067505, "sampling/sampling_logp_difference/mean": 0.4060254991054535, "step": 847, "step_time": 15.321621100010816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.973543643951416, "epoch": 0.00424, "grad_norm": 0.11907859891653061, "kl": 0.09371706657111645, "learning_rate": 7.99993898204997e-06, "loss": -0.0763, "step": 848, "step_time": 5.8750024949986255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.447402510792017, "epoch": 0.004245, "frac_reward_zero_std": 0.0, "grad_norm": 0.15500067174434662, "kl": 0.2444838099181652, "learning_rate": 7.99993883166742e-06, "loss": -0.0794, "num_tokens": 11146391.0, "reward": 0.800282895565033, "reward_std": 1.2013510465621948, "rewards/rollout_reward_func/mean": 0.800282895565033, "rewards/rollout_reward_func/std": 1.2013510465621948, "sampling/importance_sampling_ratio/max": 1.227487325668335, "sampling/importance_sampling_ratio/mean": 0.7658858895301819, "sampling/importance_sampling_ratio/min": 4.907419133814983e-05, "sampling/sampling_logp_difference/max": 1.7394585609436035, "sampling/sampling_logp_difference/mean": 0.2624633312225342, "step": 849, "step_time": 13.900797706985031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4463256746530533, "epoch": 0.00425, "grad_norm": 0.13771559298038483, "kl": 0.24534516781568527, "learning_rate": 7.999938681099786e-06, "loss": -0.08, "step": 850, "step_time": 6.8884775880142115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 7.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6116846799850464, "epoch": 0.004255, "frac_reward_zero_std": 0.0, "grad_norm": 0.06009206175804138, "kl": 0.14253370091319084, "learning_rate": 7.99993853034707e-06, "loss": -0.0734, "num_tokens": 11173626.0, "reward": -0.007053375244140625, "reward_std": 1.2927159070968628, "rewards/rollout_reward_func/mean": -0.007053375244140625, "rewards/rollout_reward_func/std": 1.2927159070968628, "sampling/importance_sampling_ratio/max": 1.0595284700393677, "sampling/importance_sampling_ratio/mean": 0.4061715006828308, "sampling/importance_sampling_ratio/min": 0.00047735407133586705, "sampling/sampling_logp_difference/max": 1.7207235097885132, "sampling/sampling_logp_difference/mean": 0.36321115493774414, "step": 851, "step_time": 13.983448083992698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.602494955062866, "epoch": 0.00426, "grad_norm": 0.0584246963262558, "kl": 0.1477433443069458, "learning_rate": 7.999938379409268e-06, "loss": -0.0734, "step": 852, "step_time": 6.727313663010136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3786210119724274, "epoch": 0.004265, "frac_reward_zero_std": 0.0, "grad_norm": 0.025981415063142776, "kl": 0.21788983419537544, "learning_rate": 7.999938228286386e-06, "loss": -0.0904, "num_tokens": 11202952.0, "reward": 0.6555957198143005, "reward_std": 1.3240816593170166, "rewards/rollout_reward_func/mean": 0.6555957198143005, "rewards/rollout_reward_func/std": 1.324081540107727, "sampling/importance_sampling_ratio/max": 1.0732609033584595, "sampling/importance_sampling_ratio/mean": 0.4748278260231018, "sampling/importance_sampling_ratio/min": 2.154210733351647e-06, "sampling/sampling_logp_difference/max": 1.6318550109863281, "sampling/sampling_logp_difference/mean": 0.3961362838745117, "step": 853, "step_time": 15.150724130988237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.372746020555496, "epoch": 0.00427, "grad_norm": 0.026348907500505447, "kl": 0.22921830788254738, "learning_rate": 7.999938076978417e-06, "loss": -0.0904, "step": 854, "step_time": 6.002629028997035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7592878639698029, "epoch": 0.004275, "frac_reward_zero_std": 0.0, "grad_norm": 0.08706136792898178, "kl": 0.17271962389349937, "learning_rate": 7.999937925485365e-06, "loss": -0.0861, "num_tokens": 11228641.0, "reward": 1.112923264503479, "reward_std": 1.1947970390319824, "rewards/rollout_reward_func/mean": 1.112923264503479, "rewards/rollout_reward_func/std": 1.194797158241272, "sampling/importance_sampling_ratio/max": 1.279091715812683, "sampling/importance_sampling_ratio/mean": 0.7427465915679932, "sampling/importance_sampling_ratio/min": 2.437720922898734e-06, "sampling/sampling_logp_difference/max": 2.01021671295166, "sampling/sampling_logp_difference/mean": 0.3086724877357483, "step": 855, "step_time": 14.267486308017396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7523875907063484, "epoch": 0.00428, "grad_norm": 0.08268283307552338, "kl": 0.17138909921050072, "learning_rate": 7.999937773807229e-06, "loss": -0.0861, "step": 856, "step_time": 6.640548252980807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 5.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.790985554456711, "epoch": 0.004285, "frac_reward_zero_std": 0.0, "grad_norm": 0.047584209591150284, "kl": 0.15078712441027164, "learning_rate": 7.99993762194401e-06, "loss": -0.0861, "num_tokens": 11252116.0, "reward": -0.5268649458885193, "reward_std": 0.6980211734771729, "rewards/rollout_reward_func/mean": -0.5268649458885193, "rewards/rollout_reward_func/std": 0.6980211734771729, "sampling/importance_sampling_ratio/max": 1.1641556024551392, "sampling/importance_sampling_ratio/mean": 0.47068893909454346, "sampling/importance_sampling_ratio/min": 1.3899320947530214e-06, "sampling/sampling_logp_difference/max": 2.197974681854248, "sampling/sampling_logp_difference/mean": 0.4403994679450989, "step": 857, "step_time": 12.136273248994257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7864275574684143, "epoch": 0.00429, "grad_norm": 0.048925381153821945, "kl": 0.14388003386557102, "learning_rate": 7.999937469895707e-06, "loss": -0.0862, "step": 858, "step_time": 5.749618453992298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 6.461538791656494, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1966873705387115, "epoch": 0.004295, "frac_reward_zero_std": 0.0, "grad_norm": 0.15348291397094727, "kl": 0.20245087705552578, "learning_rate": 7.999937317662321e-06, "loss": -0.0644, "num_tokens": 11285187.0, "reward": -0.06031277775764465, "reward_std": 1.0212533473968506, "rewards/rollout_reward_func/mean": -0.06031277775764465, "rewards/rollout_reward_func/std": 1.0212534666061401, "sampling/importance_sampling_ratio/max": 1.208593726158142, "sampling/importance_sampling_ratio/mean": 0.579110324382782, "sampling/importance_sampling_ratio/min": 6.765602302039042e-05, "sampling/sampling_logp_difference/max": 1.7322173118591309, "sampling/sampling_logp_difference/mean": 0.369143545627594, "step": 859, "step_time": 14.562716751999687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.194557473063469, "epoch": 0.0043, "grad_norm": 0.164774090051651, "kl": 0.2196742193773389, "learning_rate": 7.999937165243852e-06, "loss": -0.0652, "step": 860, "step_time": 7.134515645011561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 5.266666889190674, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1421558558940887, "epoch": 0.004305, "frac_reward_zero_std": 0.0, "grad_norm": 0.4026740789413452, "kl": 0.35493360087275505, "learning_rate": 7.999937012640299e-06, "loss": -0.03, "num_tokens": 11318538.0, "reward": 0.4331784248352051, "reward_std": 1.2227329015731812, "rewards/rollout_reward_func/mean": 0.4331784248352051, "rewards/rollout_reward_func/std": 1.2227330207824707, "sampling/importance_sampling_ratio/max": 2.0476207733154297, "sampling/importance_sampling_ratio/mean": 0.8196788430213928, "sampling/importance_sampling_ratio/min": 0.001877648290246725, "sampling/sampling_logp_difference/max": 1.2908377647399902, "sampling/sampling_logp_difference/mean": 0.18460583686828613, "step": 861, "step_time": 14.25702874000126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 1.1477511823177338, "epoch": 0.00431, "grad_norm": 0.24490146338939667, "kl": 0.3626093603670597, "learning_rate": 7.99993685985166e-06, "loss": -0.0313, "step": 862, "step_time": 7.057498207010212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4851494431495667, "epoch": 0.004315, "frac_reward_zero_std": 0.5, "grad_norm": 0.020833292976021767, "kl": 0.20286240428686142, "learning_rate": 7.99993670687794e-06, "loss": -0.0252, "num_tokens": 11342024.0, "reward": 0.8998991847038269, "reward_std": 1.2863558530807495, "rewards/rollout_reward_func/mean": 0.8998991847038269, "rewards/rollout_reward_func/std": 1.286355972290039, "sampling/importance_sampling_ratio/max": 1.1114898920059204, "sampling/importance_sampling_ratio/mean": 0.7530775666236877, "sampling/importance_sampling_ratio/min": 1.24331882034312e-05, "sampling/sampling_logp_difference/max": 1.643033504486084, "sampling/sampling_logp_difference/mean": 0.20655028522014618, "step": 863, "step_time": 14.512174416988273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4865414798259735, "epoch": 0.00432, "grad_norm": 0.022874698042869568, "kl": 0.20204323902726173, "learning_rate": 7.999936553719135e-06, "loss": -0.0252, "step": 864, "step_time": 6.848412239996833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4352267384529114, "epoch": 0.004325, "frac_reward_zero_std": 0.0, "grad_norm": 0.1951637715101242, "kl": 0.1670269537717104, "learning_rate": 7.999936400375248e-06, "loss": -0.1242, "num_tokens": 11367975.0, "reward": 0.3165757656097412, "reward_std": 1.3869067430496216, "rewards/rollout_reward_func/mean": 0.3165757656097412, "rewards/rollout_reward_func/std": 1.3869067430496216, "sampling/importance_sampling_ratio/max": 1.154788851737976, "sampling/importance_sampling_ratio/mean": 0.4855983257293701, "sampling/importance_sampling_ratio/min": 1.2156135653640376e-07, "sampling/sampling_logp_difference/max": 1.6818516254425049, "sampling/sampling_logp_difference/mean": 0.3728284239768982, "step": 865, "step_time": 13.582714689982822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4349989891052246, "epoch": 0.00433, "grad_norm": 0.18422141671180725, "kl": 0.16822145320475101, "learning_rate": 7.999936246846276e-06, "loss": -0.1245, "step": 866, "step_time": 5.721581848003552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9040580242872238, "epoch": 0.004335, "frac_reward_zero_std": 0.0, "grad_norm": 0.16225764155387878, "kl": 0.4439983256161213, "learning_rate": 7.99993609313222e-06, "loss": -0.0896, "num_tokens": 11392618.0, "reward": 1.0310660600662231, "reward_std": 1.2384496927261353, "rewards/rollout_reward_func/mean": 1.0310660600662231, "rewards/rollout_reward_func/std": 1.2384496927261353, "sampling/importance_sampling_ratio/max": 1.214707851409912, "sampling/importance_sampling_ratio/mean": 0.7549196481704712, "sampling/importance_sampling_ratio/min": 2.370211404922884e-07, "sampling/sampling_logp_difference/max": 2.1540825366973877, "sampling/sampling_logp_difference/mean": 0.3890719711780548, "step": 867, "step_time": 15.433660476002842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.8970379382371902, "epoch": 0.00434, "grad_norm": 0.15505611896514893, "kl": 0.4648476466536522, "learning_rate": 7.999935939233083e-06, "loss": -0.0901, "step": 868, "step_time": 6.881706399013638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9592423886060715, "epoch": 0.004345, "frac_reward_zero_std": 0.0, "grad_norm": 0.12392169237136841, "kl": 0.3745288960635662, "learning_rate": 7.999935785148863e-06, "loss": -0.0789, "num_tokens": 11420386.0, "reward": 0.1345807909965515, "reward_std": 1.31923246383667, "rewards/rollout_reward_func/mean": 0.1345807909965515, "rewards/rollout_reward_func/std": 1.3192325830459595, "sampling/importance_sampling_ratio/max": 1.213457703590393, "sampling/importance_sampling_ratio/mean": 0.5394402146339417, "sampling/importance_sampling_ratio/min": 3.0152084491419373e-06, "sampling/sampling_logp_difference/max": 1.97432279586792, "sampling/sampling_logp_difference/mean": 0.36525070667266846, "step": 869, "step_time": 13.303293922988814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9519331827759743, "epoch": 0.00435, "grad_norm": 0.11808925122022629, "kl": 0.3760904986411333, "learning_rate": 7.999935630879557e-06, "loss": -0.0789, "step": 870, "step_time": 5.9927138250059215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 6.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1337779462337494, "epoch": 0.004355, "frac_reward_zero_std": 0.0, "grad_norm": 0.04279566556215286, "kl": 0.18665154464542866, "learning_rate": 7.999935476425169e-06, "loss": -0.0595, "num_tokens": 11445933.0, "reward": 0.38322320580482483, "reward_std": 1.3924624919891357, "rewards/rollout_reward_func/mean": 0.38322320580482483, "rewards/rollout_reward_func/std": 1.3924624919891357, "sampling/importance_sampling_ratio/max": 1.197407603263855, "sampling/importance_sampling_ratio/mean": 0.4917853772640228, "sampling/importance_sampling_ratio/min": 9.243945532944053e-05, "sampling/sampling_logp_difference/max": 2.0062379837036133, "sampling/sampling_logp_difference/mean": 0.3835243582725525, "step": 871, "step_time": 12.85902508001891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1346395313739777, "epoch": 0.00436, "grad_norm": 0.038771603256464005, "kl": 0.18982175923883915, "learning_rate": 7.999935321785696e-06, "loss": -0.0596, "step": 872, "step_time": 6.1962218080152525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.1501353718340397, "epoch": 0.004365, "frac_reward_zero_std": 0.5, "grad_norm": 0.0856052115559578, "kl": 0.36906807869672775, "learning_rate": 7.99993516696114e-06, "loss": 0.0105, "num_tokens": 11469738.0, "reward": 1.5829112529754639, "reward_std": 0.869890570640564, "rewards/rollout_reward_func/mean": 1.5829112529754639, "rewards/rollout_reward_func/std": 0.869890570640564, "sampling/importance_sampling_ratio/max": 1.251655101776123, "sampling/importance_sampling_ratio/mean": 1.073332667350769, "sampling/importance_sampling_ratio/min": 1.0134644508361816, "sampling/sampling_logp_difference/max": 0.14123249053955078, "sampling/sampling_logp_difference/mean": 0.01739220693707466, "step": 873, "step_time": 10.009964180993848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14850661531090736, "epoch": 0.00437, "grad_norm": 0.08879661560058594, "kl": 0.3693423271179199, "learning_rate": 7.999935011951501e-06, "loss": 0.0103, "step": 874, "step_time": 5.756537156994455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 5.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6014213263988495, "epoch": 0.004375, "frac_reward_zero_std": 0.0, "grad_norm": 0.16822972893714905, "kl": 0.21734173595905304, "learning_rate": 7.999934856756776e-06, "loss": -0.0699, "num_tokens": 11498271.0, "reward": 0.45573675632476807, "reward_std": 1.2519232034683228, "rewards/rollout_reward_func/mean": 0.45573675632476807, "rewards/rollout_reward_func/std": 1.2519233226776123, "sampling/importance_sampling_ratio/max": 1.1961652040481567, "sampling/importance_sampling_ratio/mean": 0.7301774024963379, "sampling/importance_sampling_ratio/min": 0.0005334105808287859, "sampling/sampling_logp_difference/max": 1.4337363243103027, "sampling/sampling_logp_difference/mean": 0.22668612003326416, "step": 875, "step_time": 15.037420330991154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.599139541387558, "epoch": 0.00438, "grad_norm": 0.16332341730594635, "kl": 0.2171259131282568, "learning_rate": 7.999934701376971e-06, "loss": -0.0705, "step": 876, "step_time": 7.56871224999486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6318637821823359, "epoch": 0.004385, "frac_reward_zero_std": 0.0, "grad_norm": 0.056302618235349655, "kl": 0.7718765735626221, "learning_rate": 7.999934545812082e-06, "loss": -0.073, "num_tokens": 11527576.0, "reward": 1.4754072427749634, "reward_std": 0.944832444190979, "rewards/rollout_reward_func/mean": 1.4754072427749634, "rewards/rollout_reward_func/std": 0.944832444190979, "sampling/importance_sampling_ratio/max": 1.1955506801605225, "sampling/importance_sampling_ratio/mean": 0.9124116897583008, "sampling/importance_sampling_ratio/min": 0.0028603628743439913, "sampling/sampling_logp_difference/max": 1.499678611755371, "sampling/sampling_logp_difference/mean": 0.1519860029220581, "step": 877, "step_time": 12.460068615037017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6356756910681725, "epoch": 0.00439, "grad_norm": 0.05472185090184212, "kl": 0.7481809854507446, "learning_rate": 7.999934390062109e-06, "loss": -0.073, "step": 878, "step_time": 6.230380747001618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.537688136100769, "epoch": 0.004395, "frac_reward_zero_std": 0.0, "grad_norm": 0.08759310841560364, "kl": 0.5510787405073643, "learning_rate": 7.999934234127053e-06, "loss": -0.0649, "num_tokens": 11556875.0, "reward": 0.011420279741287231, "reward_std": 0.862694263458252, "rewards/rollout_reward_func/mean": 0.011420279741287231, "rewards/rollout_reward_func/std": 0.862694263458252, "sampling/importance_sampling_ratio/max": 1.140578269958496, "sampling/importance_sampling_ratio/mean": 0.7131383419036865, "sampling/importance_sampling_ratio/min": 0.00010283130541210994, "sampling/sampling_logp_difference/max": 2.0480833053588867, "sampling/sampling_logp_difference/mean": 0.2878348231315613, "step": 879, "step_time": 11.180678919990896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5448584109544754, "epoch": 0.0044, "grad_norm": 0.09256471693515778, "kl": 0.5538108348846436, "learning_rate": 7.999934078006912e-06, "loss": -0.0646, "step": 880, "step_time": 5.306597914008307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1979867219924927, "epoch": 0.004405, "frac_reward_zero_std": 0.0, "grad_norm": 0.029961245134472847, "kl": 0.17115682642906904, "learning_rate": 7.999933921701688e-06, "loss": -0.0617, "num_tokens": 11583974.0, "reward": 0.5400928258895874, "reward_std": 1.279180884361267, "rewards/rollout_reward_func/mean": 0.5400928258895874, "rewards/rollout_reward_func/std": 1.2791807651519775, "sampling/importance_sampling_ratio/max": 1.1183627843856812, "sampling/importance_sampling_ratio/mean": 0.642763614654541, "sampling/importance_sampling_ratio/min": 1.646176250869047e-11, "sampling/sampling_logp_difference/max": 2.095445156097412, "sampling/sampling_logp_difference/mean": 0.41215038299560547, "step": 881, "step_time": 12.323725085007027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.197561264038086, "epoch": 0.00441, "grad_norm": 0.028707940131425858, "kl": 0.16775250062346458, "learning_rate": 7.999933765211382e-06, "loss": -0.0618, "step": 882, "step_time": 6.679275126996799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.095927633345127, "epoch": 0.004415, "frac_reward_zero_std": 0.5, "grad_norm": 0.1359649896621704, "kl": 0.3789946585893631, "learning_rate": 7.99993360853599e-06, "loss": -0.0598, "num_tokens": 11604008.0, "reward": -0.2075825333595276, "reward_std": 0.8748027086257935, "rewards/rollout_reward_func/mean": -0.2075825333595276, "rewards/rollout_reward_func/std": 0.8748027682304382, "sampling/importance_sampling_ratio/max": 1.2642402648925781, "sampling/importance_sampling_ratio/mean": 0.9015305042266846, "sampling/importance_sampling_ratio/min": 1.902165968203917e-05, "sampling/sampling_logp_difference/max": 1.6224943399429321, "sampling/sampling_logp_difference/mean": 0.19868871569633484, "step": 883, "step_time": 8.552380294975592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0957655683159828, "epoch": 0.00442, "grad_norm": 0.12451113015413284, "kl": 0.3873336762189865, "learning_rate": 7.999933451675518e-06, "loss": -0.0601, "step": 884, "step_time": 4.7358643309999024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2326539158821106, "epoch": 0.004425, "frac_reward_zero_std": 0.0, "grad_norm": 0.38371792435646057, "kl": 1.3717073500156403, "learning_rate": 7.999933294629961e-06, "loss": -0.0713, "num_tokens": 11635897.0, "reward": 1.0692743062973022, "reward_std": 1.2218495607376099, "rewards/rollout_reward_func/mean": 1.0692743062973022, "rewards/rollout_reward_func/std": 1.2218495607376099, "sampling/importance_sampling_ratio/max": 1.1979707479476929, "sampling/importance_sampling_ratio/mean": 0.8294919729232788, "sampling/importance_sampling_ratio/min": 0.0010281371651217341, "sampling/sampling_logp_difference/max": 1.403925895690918, "sampling/sampling_logp_difference/mean": 0.21529299020767212, "step": 885, "step_time": 13.832083779998356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2395811676979065, "epoch": 0.00443, "grad_norm": 0.3484966456890106, "kl": 1.1772355660796165, "learning_rate": 7.99993313739932e-06, "loss": -0.0735, "step": 886, "step_time": 6.734064640011638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.066667079925537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3720430992543697, "epoch": 0.004435, "frac_reward_zero_std": 0.0, "grad_norm": 0.12325872480869293, "kl": 0.5562518164515495, "learning_rate": 7.999932979983598e-06, "loss": -0.0506, "num_tokens": 11656818.0, "reward": 0.6381364464759827, "reward_std": 1.2836884260177612, "rewards/rollout_reward_func/mean": 0.6381364464759827, "rewards/rollout_reward_func/std": 1.2836884260177612, "sampling/importance_sampling_ratio/max": 1.1337666511535645, "sampling/importance_sampling_ratio/mean": 0.650957465171814, "sampling/importance_sampling_ratio/min": 0.0006256828201003373, "sampling/sampling_logp_difference/max": 1.634596347808838, "sampling/sampling_logp_difference/mean": 0.25619643926620483, "step": 887, "step_time": 9.589780392008834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3944271076470613, "epoch": 0.00444, "grad_norm": 0.1240953654050827, "kl": 0.5572810024023056, "learning_rate": 7.99993282238279e-06, "loss": -0.0509, "step": 888, "step_time": 5.029676748992642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 6.500000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0841815173625946, "epoch": 0.004445, "frac_reward_zero_std": 0.0, "grad_norm": 0.12234166264533997, "kl": 0.22303779795765877, "learning_rate": 7.999932664596901e-06, "loss": -0.0428, "num_tokens": 11686811.0, "reward": -0.6138581037521362, "reward_std": 0.7171888947486877, "rewards/rollout_reward_func/mean": -0.6138581037521362, "rewards/rollout_reward_func/std": 0.7171888947486877, "sampling/importance_sampling_ratio/max": 1.2147128582000732, "sampling/importance_sampling_ratio/mean": 0.6271239519119263, "sampling/importance_sampling_ratio/min": 1.6634365238132887e-05, "sampling/sampling_logp_difference/max": 2.139066219329834, "sampling/sampling_logp_difference/mean": 0.3629915714263916, "step": 889, "step_time": 13.688173920003464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0912367701530457, "epoch": 0.00445, "grad_norm": 0.12331164628267288, "kl": 0.22200965881347656, "learning_rate": 7.999932506625927e-06, "loss": -0.0429, "step": 890, "step_time": 6.793560152000282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8760345280170441, "epoch": 0.004455, "frac_reward_zero_std": 0.5, "grad_norm": 0.044051285833120346, "kl": 0.28650978952646255, "learning_rate": 7.99993234846987e-06, "loss": -0.0208, "num_tokens": 11711713.0, "reward": 0.7952165603637695, "reward_std": 1.266811728477478, "rewards/rollout_reward_func/mean": 0.7952165603637695, "rewards/rollout_reward_func/std": 1.266811728477478, "sampling/importance_sampling_ratio/max": 1.132020115852356, "sampling/importance_sampling_ratio/mean": 0.8765954971313477, "sampling/importance_sampling_ratio/min": 0.00023032822355162352, "sampling/sampling_logp_difference/max": 1.3516614437103271, "sampling/sampling_logp_difference/mean": 0.14484721422195435, "step": 891, "step_time": 12.47451954499411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8853121176362038, "epoch": 0.00446, "grad_norm": 0.044032301753759384, "kl": 0.2820751741528511, "learning_rate": 7.999932190128729e-06, "loss": -0.0209, "step": 892, "step_time": 6.430567270988831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8189489766955376, "epoch": 0.004465, "frac_reward_zero_std": 0.0, "grad_norm": 0.04262866452336311, "kl": 0.3950805738568306, "learning_rate": 7.999932031602506e-06, "loss": -0.0684, "num_tokens": 11734679.0, "reward": 1.341847538948059, "reward_std": 1.0078604221343994, "rewards/rollout_reward_func/mean": 1.341847538948059, "rewards/rollout_reward_func/std": 1.0078604221343994, "sampling/importance_sampling_ratio/max": 1.176590085029602, "sampling/importance_sampling_ratio/mean": 0.8392305374145508, "sampling/importance_sampling_ratio/min": 0.00045238572056405246, "sampling/sampling_logp_difference/max": 1.5802305936813354, "sampling/sampling_logp_difference/mean": 0.1752517819404602, "step": 893, "step_time": 10.873945903993445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8224918246269226, "epoch": 0.00447, "grad_norm": 0.043604396283626556, "kl": 0.399198692291975, "learning_rate": 7.9999318728912e-06, "loss": -0.0683, "step": 894, "step_time": 5.437808314018184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9995464272797108, "epoch": 0.004475, "frac_reward_zero_std": 0.5, "grad_norm": 0.39371392130851746, "kl": 1.6274952292442322, "learning_rate": 7.99993171399481e-06, "loss": -0.0151, "num_tokens": 11756212.0, "reward": 1.3136756420135498, "reward_std": 1.1309139728546143, "rewards/rollout_reward_func/mean": 1.3136756420135498, "rewards/rollout_reward_func/std": 1.1309139728546143, "sampling/importance_sampling_ratio/max": 1.114357829093933, "sampling/importance_sampling_ratio/mean": 0.8703905344009399, "sampling/importance_sampling_ratio/min": 1.7516775187687017e-05, "sampling/sampling_logp_difference/max": 2.0629868507385254, "sampling/sampling_logp_difference/mean": 0.1892482340335846, "step": 895, "step_time": 9.762084289992345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0127325113862753, "epoch": 0.00448, "grad_norm": 0.34654122591018677, "kl": 1.4716707915067673, "learning_rate": 7.999931554913336e-06, "loss": -0.0175, "step": 896, "step_time": 5.165766117992462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0352146849036217, "epoch": 0.004485, "frac_reward_zero_std": 0.0, "grad_norm": 0.11276547610759735, "kl": 0.22681380808353424, "learning_rate": 7.99993139564678e-06, "loss": -0.0747, "num_tokens": 11780439.0, "reward": 1.4792265892028809, "reward_std": 0.9256194233894348, "rewards/rollout_reward_func/mean": 1.4792265892028809, "rewards/rollout_reward_func/std": 0.9256194233894348, "sampling/importance_sampling_ratio/max": 1.1367028951644897, "sampling/importance_sampling_ratio/mean": 0.8635478019714355, "sampling/importance_sampling_ratio/min": 0.00013161395327188075, "sampling/sampling_logp_difference/max": 1.1509933471679688, "sampling/sampling_logp_difference/mean": 0.21569833159446716, "step": 897, "step_time": 12.192317111010198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0642238408327103, "epoch": 0.00449, "grad_norm": 0.12191562354564667, "kl": 0.2238272875547409, "learning_rate": 7.99993123619514e-06, "loss": -0.0744, "step": 898, "step_time": 5.8149209930124925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8627476468682289, "epoch": 0.004495, "frac_reward_zero_std": 0.5, "grad_norm": 0.05080645903944969, "kl": 0.22088462114334106, "learning_rate": 7.999931076558417e-06, "loss": -0.0533, "num_tokens": 11799563.0, "reward": 0.7364560961723328, "reward_std": 1.311622142791748, "rewards/rollout_reward_func/mean": 0.7364560961723328, "rewards/rollout_reward_func/std": 1.311622142791748, "sampling/importance_sampling_ratio/max": 1.0634409189224243, "sampling/importance_sampling_ratio/mean": 0.8459975719451904, "sampling/importance_sampling_ratio/min": 7.108055433491245e-05, "sampling/sampling_logp_difference/max": 1.9877867698669434, "sampling/sampling_logp_difference/mean": 0.16904863715171814, "step": 899, "step_time": 10.442650373996003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8706979900598526, "epoch": 0.0045, "grad_norm": 0.05176341161131859, "kl": 0.2207649201154709, "learning_rate": 7.999930916736611e-06, "loss": -0.0532, "step": 900, "step_time": 4.808568762004143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2318268418312073, "epoch": 0.004505, "frac_reward_zero_std": 0.0, "grad_norm": 0.10561531782150269, "kl": 0.5183672420680523, "learning_rate": 7.999930756729721e-06, "loss": -0.0706, "num_tokens": 11820976.0, "reward": 1.0415325164794922, "reward_std": 1.4030342102050781, "rewards/rollout_reward_func/mean": 1.0415325164794922, "rewards/rollout_reward_func/std": 1.4030342102050781, "sampling/importance_sampling_ratio/max": 1.117882251739502, "sampling/importance_sampling_ratio/mean": 0.7373330593109131, "sampling/importance_sampling_ratio/min": 0.0017707343213260174, "sampling/sampling_logp_difference/max": 1.3543057441711426, "sampling/sampling_logp_difference/mean": 0.16821140050888062, "step": 901, "step_time": 9.676619614008814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2390804439783096, "epoch": 0.00451, "grad_norm": 0.10157090425491333, "kl": 0.51599046215415, "learning_rate": 7.999930596537749e-06, "loss": -0.0707, "step": 902, "step_time": 5.028552236006362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.1875, "completions/mean_terminated_length": 4.1875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.181241936981678, "epoch": 0.004515, "frac_reward_zero_std": 0.5, "grad_norm": 0.055298127233982086, "kl": 0.4444047510623932, "learning_rate": 7.999930436160693e-06, "loss": -0.0246, "num_tokens": 11836812.0, "reward": 1.9374473094940186, "reward_std": 0.08066616207361221, "rewards/rollout_reward_func/mean": 1.9374473094940186, "rewards/rollout_reward_func/std": 0.08066617697477341, "sampling/importance_sampling_ratio/max": 1.0510278940200806, "sampling/importance_sampling_ratio/mean": 0.977666974067688, "sampling/importance_sampling_ratio/min": 0.20233003795146942, "sampling/sampling_logp_difference/max": 1.0451185703277588, "sampling/sampling_logp_difference/mean": 0.030503934249281883, "step": 903, "step_time": 6.109647751000011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1826474592089653, "epoch": 0.00452, "grad_norm": 0.04664648324251175, "kl": 0.4762047007679939, "learning_rate": 7.999930275598554e-06, "loss": -0.0248, "step": 904, "step_time": 3.06786941899918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2936237901449203, "epoch": 0.004525, "frac_reward_zero_std": 0.0, "grad_norm": 0.103336401283741, "kl": 0.1828017458319664, "learning_rate": 7.999930114851332e-06, "loss": -0.0327, "num_tokens": 11868734.0, "reward": 0.6601594686508179, "reward_std": 1.2787530422210693, "rewards/rollout_reward_func/mean": 0.6601594686508179, "rewards/rollout_reward_func/std": 1.2787530422210693, "sampling/importance_sampling_ratio/max": 1.1024818420410156, "sampling/importance_sampling_ratio/mean": 0.7133450508117676, "sampling/importance_sampling_ratio/min": 0.0010405085049569607, "sampling/sampling_logp_difference/max": 1.0921388864517212, "sampling/sampling_logp_difference/mean": 0.1692904233932495, "step": 905, "step_time": 14.83410026798083 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.2900096625089645, "epoch": 0.00453, "grad_norm": 0.07858368754386902, "kl": 0.18262089416384697, "learning_rate": 7.999929953919027e-06, "loss": -0.0332, "step": 906, "step_time": 6.251269400017918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.085755791515112, "epoch": 0.004535, "frac_reward_zero_std": 0.0, "grad_norm": 0.14036041498184204, "kl": 0.2473154328763485, "learning_rate": 7.999929792801638e-06, "loss": -0.0484, "num_tokens": 11903521.0, "reward": 0.8270338773727417, "reward_std": 1.2204474210739136, "rewards/rollout_reward_func/mean": 0.8270338773727417, "rewards/rollout_reward_func/std": 1.220447301864624, "sampling/importance_sampling_ratio/max": 1.159177541732788, "sampling/importance_sampling_ratio/mean": 0.8483080863952637, "sampling/importance_sampling_ratio/min": 1.332562715106178e-05, "sampling/sampling_logp_difference/max": 1.7770575284957886, "sampling/sampling_logp_difference/mean": 0.2042398601770401, "step": 907, "step_time": 16.829124351017526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0962323620915413, "epoch": 0.00454, "grad_norm": 0.14576131105422974, "kl": 0.24587645754218102, "learning_rate": 7.999929631499167e-06, "loss": -0.0488, "step": 908, "step_time": 8.015565295994747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6333806216716766, "epoch": 0.004545, "frac_reward_zero_std": 0.0, "grad_norm": 0.1284628063440323, "kl": 0.1857922002673149, "learning_rate": 7.999929470011612e-06, "loss": -0.0905, "num_tokens": 11932696.0, "reward": 0.846831202507019, "reward_std": 1.2558491230010986, "rewards/rollout_reward_func/mean": 0.846831202507019, "rewards/rollout_reward_func/std": 1.2558491230010986, "sampling/importance_sampling_ratio/max": 1.1100424528121948, "sampling/importance_sampling_ratio/mean": 0.7436226010322571, "sampling/importance_sampling_ratio/min": 6.813574873376638e-05, "sampling/sampling_logp_difference/max": 1.4864259958267212, "sampling/sampling_logp_difference/mean": 0.22764956951141357, "step": 909, "step_time": 12.363760788008221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6383287608623505, "epoch": 0.00455, "grad_norm": 0.12949088215827942, "kl": 0.1856214702129364, "learning_rate": 7.999929308338974e-06, "loss": -0.0905, "step": 910, "step_time": 5.331163865004783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 5.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3777908384799957, "epoch": 0.004555, "frac_reward_zero_std": 0.0, "grad_norm": 0.11510363966226578, "kl": 0.5226271040737629, "learning_rate": 7.999929146481254e-06, "loss": -0.0904, "num_tokens": 11963579.0, "reward": 0.09494386613368988, "reward_std": 1.1298493146896362, "rewards/rollout_reward_func/mean": 0.09494386613368988, "rewards/rollout_reward_func/std": 1.1298493146896362, "sampling/importance_sampling_ratio/max": 1.1906816959381104, "sampling/importance_sampling_ratio/mean": 0.5062389373779297, "sampling/importance_sampling_ratio/min": 3.0417726520681754e-05, "sampling/sampling_logp_difference/max": 1.7464725971221924, "sampling/sampling_logp_difference/mean": 0.37330466508865356, "step": 911, "step_time": 15.319189724003081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.378842294216156, "epoch": 0.00456, "grad_norm": 0.1141907125711441, "kl": 0.5119522400200367, "learning_rate": 7.99992898443845e-06, "loss": -0.0905, "step": 912, "step_time": 7.653057084986358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8091556429862976, "epoch": 0.004565, "frac_reward_zero_std": 0.0, "grad_norm": 0.18630389869213104, "kl": 0.36347965709865093, "learning_rate": 7.999928822210561e-06, "loss": -0.0671, "num_tokens": 11991115.0, "reward": 0.380241721868515, "reward_std": 1.2144474983215332, "rewards/rollout_reward_func/mean": 0.380241721868515, "rewards/rollout_reward_func/std": 1.2144474983215332, "sampling/importance_sampling_ratio/max": 1.0816198587417603, "sampling/importance_sampling_ratio/mean": 0.564018964767456, "sampling/importance_sampling_ratio/min": 0.00027137072174809873, "sampling/sampling_logp_difference/max": 1.4575650691986084, "sampling/sampling_logp_difference/mean": 0.2855983376502991, "step": 913, "step_time": 14.13278017900302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7973542213439941, "epoch": 0.00457, "grad_norm": 0.188431516289711, "kl": 0.3473372086882591, "learning_rate": 7.999928659797591e-06, "loss": -0.0678, "step": 914, "step_time": 6.417191074011498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.933333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1385681852698326, "epoch": 0.004575, "frac_reward_zero_std": 0.0, "grad_norm": 0.17379778623580933, "kl": 1.1697247475385666, "learning_rate": 7.999928497199539e-06, "loss": -0.0663, "num_tokens": 12020857.0, "reward": 1.5712218284606934, "reward_std": 0.797995924949646, "rewards/rollout_reward_func/mean": 1.5712218284606934, "rewards/rollout_reward_func/std": 0.7979959845542908, "sampling/importance_sampling_ratio/max": 1.0908095836639404, "sampling/importance_sampling_ratio/mean": 0.8508509397506714, "sampling/importance_sampling_ratio/min": 1.2405213055899367e-05, "sampling/sampling_logp_difference/max": 1.887359619140625, "sampling/sampling_logp_difference/mean": 0.2951086759567261, "step": 915, "step_time": 13.050755066986312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1459881737828255, "epoch": 0.00458, "grad_norm": 0.15540701150894165, "kl": 1.0989395081996918, "learning_rate": 7.999928334416403e-06, "loss": -0.0666, "step": 916, "step_time": 6.706493108998984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7006651908159256, "epoch": 0.004585, "frac_reward_zero_std": 0.0, "grad_norm": 0.1786355972290039, "kl": 0.894138116389513, "learning_rate": 7.999928171448182e-06, "loss": -0.0819, "num_tokens": 12050707.0, "reward": 1.0154579877853394, "reward_std": 1.100327491760254, "rewards/rollout_reward_func/mean": 1.0154579877853394, "rewards/rollout_reward_func/std": 1.100327491760254, "sampling/importance_sampling_ratio/max": 1.1610552072525024, "sampling/importance_sampling_ratio/mean": 0.6292041540145874, "sampling/importance_sampling_ratio/min": 0.000519661873113364, "sampling/sampling_logp_difference/max": 1.4720567464828491, "sampling/sampling_logp_difference/mean": 0.3069493770599365, "step": 917, "step_time": 13.961648908007191 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.677945926785469, "epoch": 0.00459, "grad_norm": 0.16293293237686157, "kl": 0.8757287859916687, "learning_rate": 7.999928008294881e-06, "loss": -0.0824, "step": 918, "step_time": 6.266163990003406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 4.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.370148330926895, "epoch": 0.004595, "frac_reward_zero_std": 0.0, "grad_norm": 0.20893284678459167, "kl": 0.14201923832297325, "learning_rate": 7.999927844956495e-06, "loss": -0.1073, "num_tokens": 12082152.0, "reward": 0.6856631636619568, "reward_std": 1.315547227859497, "rewards/rollout_reward_func/mean": 0.6856631636619568, "rewards/rollout_reward_func/std": 1.315547227859497, "sampling/importance_sampling_ratio/max": 1.155168890953064, "sampling/importance_sampling_ratio/mean": 0.5954748392105103, "sampling/importance_sampling_ratio/min": 8.734826906220405e-08, "sampling/sampling_logp_difference/max": 2.079059362411499, "sampling/sampling_logp_difference/mean": 0.38063061237335205, "step": 919, "step_time": 14.499560048992862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3447760343551636, "epoch": 0.0046, "grad_norm": 0.18617771565914154, "kl": 0.14419039897620678, "learning_rate": 7.999927681433027e-06, "loss": -0.1085, "step": 920, "step_time": 6.404720041988185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1979391761124134, "epoch": 0.004605, "frac_reward_zero_std": 0.0, "grad_norm": 0.1577172726392746, "kl": 0.7516435012221336, "learning_rate": 7.999927517724476e-06, "loss": -0.0491, "num_tokens": 12112048.0, "reward": 0.962979793548584, "reward_std": 1.0958337783813477, "rewards/rollout_reward_func/mean": 0.962979793548584, "rewards/rollout_reward_func/std": 1.0958337783813477, "sampling/importance_sampling_ratio/max": 1.130603313446045, "sampling/importance_sampling_ratio/mean": 0.8302792310714722, "sampling/importance_sampling_ratio/min": 2.5044441827049013e-06, "sampling/sampling_logp_difference/max": 2.582740545272827, "sampling/sampling_logp_difference/mean": 0.2589987516403198, "step": 921, "step_time": 12.521292785007972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1671600434929132, "epoch": 0.00461, "grad_norm": 0.1555567979812622, "kl": 0.7697450816631317, "learning_rate": 7.999927353830841e-06, "loss": -0.05, "step": 922, "step_time": 6.643993731006049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1335111185908318, "epoch": 0.004615, "frac_reward_zero_std": 0.0, "grad_norm": 0.049052510410547256, "kl": 0.2704019881784916, "learning_rate": 7.999927189752123e-06, "loss": -0.0597, "num_tokens": 12146548.0, "reward": 0.14254732429981232, "reward_std": 1.121012806892395, "rewards/rollout_reward_func/mean": 0.14254732429981232, "rewards/rollout_reward_func/std": 1.1210126876831055, "sampling/importance_sampling_ratio/max": 1.1803590059280396, "sampling/importance_sampling_ratio/mean": 0.6756865382194519, "sampling/importance_sampling_ratio/min": 5.199436259317736e-07, "sampling/sampling_logp_difference/max": 1.9541516304016113, "sampling/sampling_logp_difference/mean": 0.40572863817214966, "step": 923, "step_time": 15.40313152699673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.119833540171385, "epoch": 0.00462, "grad_norm": 0.04608306288719177, "kl": 0.2890398669987917, "learning_rate": 7.999927025488322e-06, "loss": -0.0594, "step": 924, "step_time": 6.930588767005247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.86463288590312, "epoch": 0.004625, "frac_reward_zero_std": 0.5, "grad_norm": 0.012309660203754902, "kl": 0.26667820662260056, "learning_rate": 7.999926861039439e-06, "loss": -0.058, "num_tokens": 12169061.0, "reward": 1.233947992324829, "reward_std": 1.0279457569122314, "rewards/rollout_reward_func/mean": 1.233947992324829, "rewards/rollout_reward_func/std": 1.0279457569122314, "sampling/importance_sampling_ratio/max": 1.1291800737380981, "sampling/importance_sampling_ratio/mean": 0.8492640256881714, "sampling/importance_sampling_ratio/min": 5.042596967541613e-05, "sampling/sampling_logp_difference/max": 1.5292534828186035, "sampling/sampling_logp_difference/mean": 0.169093519449234, "step": 925, "step_time": 13.27568276101374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8619007356464863, "epoch": 0.00463, "grad_norm": 0.011879066936671734, "kl": 0.2668200805783272, "learning_rate": 7.999926696405474e-06, "loss": -0.058, "step": 926, "step_time": 5.299336152980686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4406751245260239, "epoch": 0.004635, "frac_reward_zero_std": 0.0, "grad_norm": 0.03397485241293907, "kl": 0.3287428915500641, "learning_rate": 7.999926531586423e-06, "loss": -0.1077, "num_tokens": 12201048.0, "reward": 1.0226149559020996, "reward_std": 1.2732576131820679, "rewards/rollout_reward_func/mean": 1.0226149559020996, "rewards/rollout_reward_func/std": 1.2732576131820679, "sampling/importance_sampling_ratio/max": 1.1134370565414429, "sampling/importance_sampling_ratio/mean": 0.7218515276908875, "sampling/importance_sampling_ratio/min": 1.0271064638800453e-06, "sampling/sampling_logp_difference/max": 1.9965423345565796, "sampling/sampling_logp_difference/mean": 0.3406047821044922, "step": 927, "step_time": 13.920188821022748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4334496054798365, "epoch": 0.00464, "grad_norm": 0.03079327940940857, "kl": 0.3361770734190941, "learning_rate": 7.999926366582292e-06, "loss": -0.1078, "step": 928, "step_time": 6.893873426990467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.611293125897646, "epoch": 0.004645, "frac_reward_zero_std": 0.5, "grad_norm": 0.03466847911477089, "kl": 0.2255944237112999, "learning_rate": 7.999926201393075e-06, "loss": -0.0185, "num_tokens": 12226753.0, "reward": 0.8600994348526001, "reward_std": 1.3201709985733032, "rewards/rollout_reward_func/mean": 0.8600994348526001, "rewards/rollout_reward_func/std": 1.3201711177825928, "sampling/importance_sampling_ratio/max": 1.1477171182632446, "sampling/importance_sampling_ratio/mean": 0.9171329736709595, "sampling/importance_sampling_ratio/min": 0.00022604608966503292, "sampling/sampling_logp_difference/max": 1.620753526687622, "sampling/sampling_logp_difference/mean": 0.13813050091266632, "step": 929, "step_time": 14.333195303013781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6028493195772171, "epoch": 0.00465, "grad_norm": 0.032117802649736404, "kl": 0.225663922727108, "learning_rate": 7.999926036018777e-06, "loss": -0.0184, "step": 930, "step_time": 6.860858640007791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.263008750975132, "epoch": 0.004655, "frac_reward_zero_std": 0.0, "grad_norm": 0.0471537709236145, "kl": 0.23391050845384598, "learning_rate": 7.999925870459396e-06, "loss": -0.0773, "num_tokens": 12246776.0, "reward": 0.3709121346473694, "reward_std": 1.4967050552368164, "rewards/rollout_reward_func/mean": 0.3709121346473694, "rewards/rollout_reward_func/std": 1.4967050552368164, "sampling/importance_sampling_ratio/max": 1.2827545404434204, "sampling/importance_sampling_ratio/mean": 0.8815009593963623, "sampling/importance_sampling_ratio/min": 7.572390313725919e-05, "sampling/sampling_logp_difference/max": 1.768977403640747, "sampling/sampling_logp_difference/mean": 0.24261334538459778, "step": 931, "step_time": 10.013264142980915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2574554309248924, "epoch": 0.00466, "grad_norm": 0.043408945202827454, "kl": 0.2325589284300804, "learning_rate": 7.999925704714932e-06, "loss": -0.0775, "step": 932, "step_time": 4.986781828003586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8720046654343605, "epoch": 0.004665, "frac_reward_zero_std": 0.5, "grad_norm": 0.24456310272216797, "kl": 1.1429531574249268, "learning_rate": 7.999925538785385e-06, "loss": -0.0325, "num_tokens": 12272509.0, "reward": 1.1512119770050049, "reward_std": 1.2315648794174194, "rewards/rollout_reward_func/mean": 1.1512119770050049, "rewards/rollout_reward_func/std": 1.231564998626709, "sampling/importance_sampling_ratio/max": 1.049422025680542, "sampling/importance_sampling_ratio/mean": 0.8225363492965698, "sampling/importance_sampling_ratio/min": 0.0009848377667367458, "sampling/sampling_logp_difference/max": 1.330599308013916, "sampling/sampling_logp_difference/mean": 0.14793290197849274, "step": 933, "step_time": 16.26726204501756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8731475099921227, "epoch": 0.00467, "grad_norm": 0.20335982739925385, "kl": 0.948751661926508, "learning_rate": 7.999925372670756e-06, "loss": -0.034, "step": 934, "step_time": 7.98466120500234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0901015400886536, "epoch": 0.004675, "frac_reward_zero_std": 0.0, "grad_norm": 0.04177892208099365, "kl": 0.6428431589156389, "learning_rate": 7.999925206371044e-06, "loss": -0.0574, "num_tokens": 12296152.0, "reward": 0.6949704885482788, "reward_std": 1.2321373224258423, "rewards/rollout_reward_func/mean": 0.6949704885482788, "rewards/rollout_reward_func/std": 1.2321373224258423, "sampling/importance_sampling_ratio/max": 1.17511785030365, "sampling/importance_sampling_ratio/mean": 0.7681471705436707, "sampling/importance_sampling_ratio/min": 0.0021419308613985777, "sampling/sampling_logp_difference/max": 1.4873709678649902, "sampling/sampling_logp_difference/mean": 0.1685972809791565, "step": 935, "step_time": 14.046669912990183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0892622843384743, "epoch": 0.00468, "grad_norm": 0.046862371265888214, "kl": 0.705357575789094, "learning_rate": 7.999925039886247e-06, "loss": -0.0575, "step": 936, "step_time": 6.444762379993335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0168662331998348, "epoch": 0.004685, "frac_reward_zero_std": 0.5, "grad_norm": 0.019801169633865356, "kl": 0.24324822053313255, "learning_rate": 7.99992487321637e-06, "loss": -0.0531, "num_tokens": 12319264.0, "reward": 1.5420095920562744, "reward_std": 0.9236685633659363, "rewards/rollout_reward_func/mean": 1.5420095920562744, "rewards/rollout_reward_func/std": 0.9236685633659363, "sampling/importance_sampling_ratio/max": 1.188915729522705, "sampling/importance_sampling_ratio/mean": 0.9197106957435608, "sampling/importance_sampling_ratio/min": 3.4080824828919276e-09, "sampling/sampling_logp_difference/max": 2.081481456756592, "sampling/sampling_logp_difference/mean": 0.2414722889661789, "step": 937, "step_time": 12.360680540004978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0134344846010208, "epoch": 0.00469, "grad_norm": 0.02005598694086075, "kl": 0.24284236505627632, "learning_rate": 7.999924706361409e-06, "loss": -0.0531, "step": 938, "step_time": 6.133296778993099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 5.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0169089436531067, "epoch": 0.004695, "frac_reward_zero_std": 0.0, "grad_norm": 0.08346277475357056, "kl": 0.5817823447287083, "learning_rate": 7.999924539321365e-06, "loss": -0.0811, "num_tokens": 12350823.0, "reward": 0.7825966477394104, "reward_std": 1.2010911703109741, "rewards/rollout_reward_func/mean": 0.7825966477394104, "rewards/rollout_reward_func/std": 1.2010911703109741, "sampling/importance_sampling_ratio/max": 1.3003621101379395, "sampling/importance_sampling_ratio/mean": 0.5322045087814331, "sampling/importance_sampling_ratio/min": 6.678886332878164e-10, "sampling/sampling_logp_difference/max": 2.6931746006011963, "sampling/sampling_logp_difference/mean": 0.5474496483802795, "step": 939, "step_time": 14.103834562993143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0204309821128845, "epoch": 0.0047, "grad_norm": 0.07831587642431259, "kl": 0.5507171750068665, "learning_rate": 7.999924372096238e-06, "loss": -0.0812, "step": 940, "step_time": 6.9409577890182845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5104509443044662, "epoch": 0.004705, "frac_reward_zero_std": 0.5, "grad_norm": 0.0058684321120381355, "kl": 0.30530766397714615, "learning_rate": 7.999924204686028e-06, "loss": -0.0382, "num_tokens": 12374704.0, "reward": 1.3897476196289062, "reward_std": 0.6951157450675964, "rewards/rollout_reward_func/mean": 1.3897476196289062, "rewards/rollout_reward_func/std": 0.6951158046722412, "sampling/importance_sampling_ratio/max": 1.2696807384490967, "sampling/importance_sampling_ratio/mean": 1.0263607501983643, "sampling/importance_sampling_ratio/min": 0.0047923494130373, "sampling/sampling_logp_difference/max": 0.9753051400184631, "sampling/sampling_logp_difference/mean": 0.0837043970823288, "step": 941, "step_time": 10.604087192987208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5144735239446163, "epoch": 0.00471, "grad_norm": 0.0060731153935194016, "kl": 0.3025095835328102, "learning_rate": 7.999924037090736e-06, "loss": -0.0382, "step": 942, "step_time": 5.224891140984255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 5.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9557094871997833, "epoch": 0.004715, "frac_reward_zero_std": 0.0, "grad_norm": 0.03369292616844177, "kl": 0.4195261374115944, "learning_rate": 7.999923869310362e-06, "loss": -0.0755, "num_tokens": 12404592.0, "reward": 0.4030603766441345, "reward_std": 1.2781745195388794, "rewards/rollout_reward_func/mean": 0.4030603766441345, "rewards/rollout_reward_func/std": 1.2781744003295898, "sampling/importance_sampling_ratio/max": 1.2195665836334229, "sampling/importance_sampling_ratio/mean": 0.6622142791748047, "sampling/importance_sampling_ratio/min": 0.00017392887093592435, "sampling/sampling_logp_difference/max": 1.738092064857483, "sampling/sampling_logp_difference/mean": 0.32300156354904175, "step": 943, "step_time": 15.27792116899218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9555418193340302, "epoch": 0.00472, "grad_norm": 0.034608375281095505, "kl": 0.4204486422240734, "learning_rate": 7.999923701344902e-06, "loss": -0.0755, "step": 944, "step_time": 7.023238718000357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5223852843046188, "epoch": 0.004725, "frac_reward_zero_std": 0.0, "grad_norm": 0.12562112510204315, "kl": 0.2424536906182766, "learning_rate": 7.999923533194363e-06, "loss": -0.0672, "num_tokens": 12433810.0, "reward": 0.7750762701034546, "reward_std": 1.4204384088516235, "rewards/rollout_reward_func/mean": 0.7750762701034546, "rewards/rollout_reward_func/std": 1.4204386472702026, "sampling/importance_sampling_ratio/max": 1.2185159921646118, "sampling/importance_sampling_ratio/mean": 0.7909594774246216, "sampling/importance_sampling_ratio/min": 2.618149892441579e-06, "sampling/sampling_logp_difference/max": 2.0932106971740723, "sampling/sampling_logp_difference/mean": 0.32284414768218994, "step": 945, "step_time": 12.692481313002645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5213147476315498, "epoch": 0.00473, "grad_norm": 0.12496742606163025, "kl": 0.24432895332574844, "learning_rate": 7.99992336485874e-06, "loss": -0.0676, "step": 946, "step_time": 6.604368977001286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7689770255237818, "epoch": 0.004735, "frac_reward_zero_std": 0.5, "grad_norm": 0.1216219812631607, "kl": 0.5160957090556622, "learning_rate": 7.999923196338033e-06, "loss": -0.0401, "num_tokens": 12457352.0, "reward": 1.5557513236999512, "reward_std": 0.9485524892807007, "rewards/rollout_reward_func/mean": 1.5557513236999512, "rewards/rollout_reward_func/std": 0.9485524892807007, "sampling/importance_sampling_ratio/max": 1.152165412902832, "sampling/importance_sampling_ratio/mean": 0.8720518350601196, "sampling/importance_sampling_ratio/min": 0.0012191848363727331, "sampling/sampling_logp_difference/max": 1.357311725616455, "sampling/sampling_logp_difference/mean": 0.13599640130996704, "step": 947, "step_time": 12.402378518992919 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.005434782709926367, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005434782709926367, "entropy": 0.7750512044876814, "epoch": 0.00474, "grad_norm": 0.07724209129810333, "kl": 0.429522342979908, "learning_rate": 7.999923027632243e-06, "loss": -0.0408, "step": 948, "step_time": 5.666317166978843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 5.466667175292969, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6429987512528896, "epoch": 0.004745, "frac_reward_zero_std": 0.0, "grad_norm": 0.1808691769838333, "kl": 0.23338330164551735, "learning_rate": 7.999922858741373e-06, "loss": -0.096, "num_tokens": 12477482.0, "reward": 0.5754592418670654, "reward_std": 1.2769025564193726, "rewards/rollout_reward_func/mean": 0.5754592418670654, "rewards/rollout_reward_func/std": 1.276902675628662, "sampling/importance_sampling_ratio/max": 1.1042945384979248, "sampling/importance_sampling_ratio/mean": 0.7310895919799805, "sampling/importance_sampling_ratio/min": 3.3284839446423575e-05, "sampling/sampling_logp_difference/max": 1.8601844310760498, "sampling/sampling_logp_difference/mean": 0.32422614097595215, "step": 949, "step_time": 9.493829685001401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6428093202412128, "epoch": 0.00475, "grad_norm": 0.17391763627529144, "kl": 0.2306949980556965, "learning_rate": 7.999922689665418e-06, "loss": -0.0961, "step": 950, "step_time": 4.866304418013897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 5.909090995788574, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.148975431919098, "epoch": 0.004755, "frac_reward_zero_std": 0.0, "grad_norm": 0.17950284481048584, "kl": 0.20935411658138037, "learning_rate": 7.999922520404381e-06, "loss": -0.1013, "num_tokens": 12510474.0, "reward": 0.1283225417137146, "reward_std": 1.399326205253601, "rewards/rollout_reward_func/mean": 0.1283225417137146, "rewards/rollout_reward_func/std": 1.399326205253601, "sampling/importance_sampling_ratio/max": 1.4005906581878662, "sampling/importance_sampling_ratio/mean": 0.5010855197906494, "sampling/importance_sampling_ratio/min": 0.0002053244534181431, "sampling/sampling_logp_difference/max": 1.7950856685638428, "sampling/sampling_logp_difference/mean": 0.3561288118362427, "step": 951, "step_time": 18.087738418980734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1522610783576965, "epoch": 0.00476, "grad_norm": 0.1740771383047104, "kl": 0.21173761319369078, "learning_rate": 7.99992235095826e-06, "loss": -0.1014, "step": 952, "step_time": 7.998768028002814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9044979438185692, "epoch": 0.004765, "frac_reward_zero_std": 0.0, "grad_norm": 0.11879769712686539, "kl": 1.1353535652160645, "learning_rate": 7.999922181327058e-06, "loss": -0.0467, "num_tokens": 12535438.0, "reward": 1.4641971588134766, "reward_std": 0.8925155997276306, "rewards/rollout_reward_func/mean": 1.4641971588134766, "rewards/rollout_reward_func/std": 0.8925155997276306, "sampling/importance_sampling_ratio/max": 1.1621705293655396, "sampling/importance_sampling_ratio/mean": 0.8799188137054443, "sampling/importance_sampling_ratio/min": 0.0017272938275709748, "sampling/sampling_logp_difference/max": 2.1572093963623047, "sampling/sampling_logp_difference/mean": 0.167730450630188, "step": 953, "step_time": 12.660007399012102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9016223326325417, "epoch": 0.00477, "grad_norm": 0.12064992636442184, "kl": 1.0834451541304588, "learning_rate": 7.999922011510774e-06, "loss": -0.0473, "step": 954, "step_time": 6.3490934900182765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 6.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6426595747470856, "epoch": 0.004775, "frac_reward_zero_std": 0.0, "grad_norm": 0.11742104589939117, "kl": 0.17687557265162468, "learning_rate": 7.999921841509405e-06, "loss": -0.0471, "num_tokens": 12568922.0, "reward": -0.4465121626853943, "reward_std": 0.9284676313400269, "rewards/rollout_reward_func/mean": -0.4465121626853943, "rewards/rollout_reward_func/std": 0.9284676313400269, "sampling/importance_sampling_ratio/max": 1.180728554725647, "sampling/importance_sampling_ratio/mean": 0.4658060371875763, "sampling/importance_sampling_ratio/min": 2.1630063201882876e-05, "sampling/sampling_logp_difference/max": 1.6973557472229004, "sampling/sampling_logp_difference/mean": 0.37887585163116455, "step": 955, "step_time": 17.16298704300425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6476567089557648, "epoch": 0.00478, "grad_norm": 0.11412397772073746, "kl": 0.1775544174015522, "learning_rate": 7.999921671322954e-06, "loss": -0.0475, "step": 956, "step_time": 7.73297385002661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7290117144584656, "epoch": 0.004785, "frac_reward_zero_std": 0.0, "grad_norm": 0.10412100702524185, "kl": 0.16988901793956757, "learning_rate": 7.999921500951423e-06, "loss": -0.0806, "num_tokens": 12602261.0, "reward": 0.35011106729507446, "reward_std": 1.1982691287994385, "rewards/rollout_reward_func/mean": 0.35011106729507446, "rewards/rollout_reward_func/std": 1.1982691287994385, "sampling/importance_sampling_ratio/max": 1.3215663433074951, "sampling/importance_sampling_ratio/mean": 0.7182450294494629, "sampling/importance_sampling_ratio/min": 0.0002734664885792881, "sampling/sampling_logp_difference/max": 1.296763300895691, "sampling/sampling_logp_difference/mean": 0.28250154852867126, "step": 957, "step_time": 19.507780989006278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7409340739250183, "epoch": 0.00479, "grad_norm": 0.11889240145683289, "kl": 0.1678694151341915, "learning_rate": 7.999921330394807e-06, "loss": -0.081, "step": 958, "step_time": 9.377181859017583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8918073177337646, "epoch": 0.004795, "frac_reward_zero_std": 0.0, "grad_norm": 0.03225482255220413, "kl": 0.5037573352456093, "learning_rate": 7.999921159653109e-06, "loss": -0.0653, "num_tokens": 12627798.0, "reward": -0.46391531825065613, "reward_std": 0.6391421556472778, "rewards/rollout_reward_func/mean": -0.46391531825065613, "rewards/rollout_reward_func/std": 0.6391421556472778, "sampling/importance_sampling_ratio/max": 1.1707613468170166, "sampling/importance_sampling_ratio/mean": 0.667548418045044, "sampling/importance_sampling_ratio/min": 2.2428612282965332e-06, "sampling/sampling_logp_difference/max": 2.0763516426086426, "sampling/sampling_logp_difference/mean": 0.3590083718299866, "step": 959, "step_time": 13.198343533993466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8983577191829681, "epoch": 0.0048, "grad_norm": 0.03049091249704361, "kl": 0.4792245924472809, "learning_rate": 7.999920988726329e-06, "loss": -0.0654, "step": 960, "step_time": 5.980440960003762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.124690353870392, "epoch": 0.004805, "frac_reward_zero_std": 0.0, "grad_norm": 0.12493299692869186, "kl": 0.1307863644324243, "learning_rate": 7.999920817614464e-06, "loss": -0.0613, "num_tokens": 12662020.0, "reward": -0.0070021748542785645, "reward_std": 1.0855214595794678, "rewards/rollout_reward_func/mean": -0.0070021748542785645, "rewards/rollout_reward_func/std": 1.0855215787887573, "sampling/importance_sampling_ratio/max": 1.0571595430374146, "sampling/importance_sampling_ratio/mean": 0.5999336242675781, "sampling/importance_sampling_ratio/min": 6.385159849742195e-07, "sampling/sampling_logp_difference/max": 2.076160430908203, "sampling/sampling_logp_difference/mean": 0.360835462808609, "step": 961, "step_time": 15.910904585005483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1535279154777527, "epoch": 0.00481, "grad_norm": 0.13638989627361298, "kl": 0.12848191894590855, "learning_rate": 7.999920646317517e-06, "loss": -0.062, "step": 962, "step_time": 6.88712857599603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9306903332471848, "epoch": 0.004815, "frac_reward_zero_std": 0.0, "grad_norm": 0.24411384761333466, "kl": 0.36285536736249924, "learning_rate": 7.999920474835489e-06, "loss": -0.0455, "num_tokens": 12688168.0, "reward": 1.2001900672912598, "reward_std": 1.2031372785568237, "rewards/rollout_reward_func/mean": 1.2001900672912598, "rewards/rollout_reward_func/std": 1.2031372785568237, "sampling/importance_sampling_ratio/max": 1.0886555910110474, "sampling/importance_sampling_ratio/mean": 0.9052826166152954, "sampling/importance_sampling_ratio/min": 1.1833603252853209e-07, "sampling/sampling_logp_difference/max": 2.002429485321045, "sampling/sampling_logp_difference/mean": 0.22298255562782288, "step": 963, "step_time": 9.731670909997774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9804644733667374, "epoch": 0.00482, "grad_norm": 0.2911513149738312, "kl": 0.36794067174196243, "learning_rate": 7.999920303168378e-06, "loss": -0.0476, "step": 964, "step_time": 5.749163774002227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6234137415885925, "epoch": 0.004825, "frac_reward_zero_std": 0.5, "grad_norm": 0.07292183488607407, "kl": 0.1390029899775982, "learning_rate": 7.999920131316184e-06, "loss": -0.0176, "num_tokens": 12713038.0, "reward": 0.6716576814651489, "reward_std": 1.3656516075134277, "rewards/rollout_reward_func/mean": 0.6716576814651489, "rewards/rollout_reward_func/std": 1.3656516075134277, "sampling/importance_sampling_ratio/max": 1.0419853925704956, "sampling/importance_sampling_ratio/mean": 0.6352226734161377, "sampling/importance_sampling_ratio/min": 7.1328402555082e-05, "sampling/sampling_logp_difference/max": 1.795616626739502, "sampling/sampling_logp_difference/mean": 0.21091923117637634, "step": 965, "step_time": 15.573740599997109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6349357664585114, "epoch": 0.00483, "grad_norm": 0.08904949575662613, "kl": 0.13709434121847153, "learning_rate": 7.999919959278909e-06, "loss": -0.017, "step": 966, "step_time": 5.95859706599731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5555411130189896, "epoch": 0.004835, "frac_reward_zero_std": 0.0, "grad_norm": 0.05542389675974846, "kl": 1.0689816027879715, "learning_rate": 7.99991978705655e-06, "loss": -0.0714, "num_tokens": 12739198.0, "reward": 0.5170738697052002, "reward_std": 1.019423246383667, "rewards/rollout_reward_func/mean": 0.5170738697052002, "rewards/rollout_reward_func/std": 1.0194233655929565, "sampling/importance_sampling_ratio/max": 1.096876859664917, "sampling/importance_sampling_ratio/mean": 0.7199969291687012, "sampling/importance_sampling_ratio/min": 6.385047072399175e-06, "sampling/sampling_logp_difference/max": 1.7767995595932007, "sampling/sampling_logp_difference/mean": 0.2780061662197113, "step": 967, "step_time": 14.385898221997195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5739334672689438, "epoch": 0.00484, "grad_norm": 0.055806927382946014, "kl": 1.0162237770855427, "learning_rate": 7.999919614649109e-06, "loss": -0.0713, "step": 968, "step_time": 6.8259807039867155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9491144455969334, "epoch": 0.004845, "frac_reward_zero_std": 0.5, "grad_norm": 0.08827514201402664, "kl": 0.19767773896455765, "learning_rate": 7.999919442056583e-06, "loss": -0.0427, "num_tokens": 12762457.0, "reward": 1.4893661737442017, "reward_std": 1.0040029287338257, "rewards/rollout_reward_func/mean": 1.4893661737442017, "rewards/rollout_reward_func/std": 1.0040029287338257, "sampling/importance_sampling_ratio/max": 1.1017616987228394, "sampling/importance_sampling_ratio/mean": 0.8880549669265747, "sampling/importance_sampling_ratio/min": 3.5901874070987105e-05, "sampling/sampling_logp_difference/max": 1.9094491004943848, "sampling/sampling_logp_difference/mean": 0.1662599742412567, "step": 969, "step_time": 13.279692694995902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9561307989060879, "epoch": 0.00485, "grad_norm": 0.0897524282336235, "kl": 0.1970081627368927, "learning_rate": 7.999919269278977e-06, "loss": -0.0429, "step": 970, "step_time": 6.285473917989293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9160416126251221, "epoch": 0.004855, "frac_reward_zero_std": 0.5, "grad_norm": 0.12438780069351196, "kl": 0.20326529815793037, "learning_rate": 7.999919096316289e-06, "loss": -0.0291, "num_tokens": 12784862.0, "reward": 0.9415984153747559, "reward_std": 1.3668220043182373, "rewards/rollout_reward_func/mean": 0.9415984153747559, "rewards/rollout_reward_func/std": 1.3668221235275269, "sampling/importance_sampling_ratio/max": 1.1883134841918945, "sampling/importance_sampling_ratio/mean": 0.8643086552619934, "sampling/importance_sampling_ratio/min": 0.004228606820106506, "sampling/sampling_logp_difference/max": 1.1977115869522095, "sampling/sampling_logp_difference/mean": 0.11349807679653168, "step": 971, "step_time": 12.378623834010796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.9557544738054276, "epoch": 0.00486, "grad_norm": 0.10640837997198105, "kl": 0.20555537939071655, "learning_rate": 7.999918923168518e-06, "loss": -0.0297, "step": 972, "step_time": 5.906313174011302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.375, "completions/mean_terminated_length": 5.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1722342371940613, "epoch": 0.004865, "frac_reward_zero_std": 0.0, "grad_norm": 0.11808070540428162, "kl": 0.08907017670571804, "learning_rate": 7.999918749835663e-06, "loss": -0.104, "num_tokens": 12814039.0, "reward": -0.04780452698469162, "reward_std": 1.2304154634475708, "rewards/rollout_reward_func/mean": -0.04780452698469162, "rewards/rollout_reward_func/std": 1.2304154634475708, "sampling/importance_sampling_ratio/max": 1.1076713800430298, "sampling/importance_sampling_ratio/mean": 0.3661256432533264, "sampling/importance_sampling_ratio/min": 8.05984825547057e-08, "sampling/sampling_logp_difference/max": 2.31166672706604, "sampling/sampling_logp_difference/mean": 0.49378785490989685, "step": 973, "step_time": 15.782604586012894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.179117441177368, "epoch": 0.00487, "grad_norm": 0.11798211932182312, "kl": 0.08803938888013363, "learning_rate": 7.999918576317726e-06, "loss": -0.104, "step": 974, "step_time": 6.002633743992192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 5.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3831728100776672, "epoch": 0.004875, "frac_reward_zero_std": 0.0, "grad_norm": 0.15320979058742523, "kl": 0.12564945220947266, "learning_rate": 7.999918402614708e-06, "loss": -0.0804, "num_tokens": 12840403.0, "reward": -0.40968602895736694, "reward_std": 1.0072705745697021, "rewards/rollout_reward_func/mean": -0.40968602895736694, "rewards/rollout_reward_func/std": 1.0072705745697021, "sampling/importance_sampling_ratio/max": 1.067211627960205, "sampling/importance_sampling_ratio/mean": 0.5062934160232544, "sampling/importance_sampling_ratio/min": 0.00023309617245104164, "sampling/sampling_logp_difference/max": 1.3833969831466675, "sampling/sampling_logp_difference/mean": 0.3440590798854828, "step": 975, "step_time": 14.674065828992752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.387864977121353, "epoch": 0.00488, "grad_norm": 0.15885387361049652, "kl": 0.1263095485046506, "learning_rate": 7.999918228726608e-06, "loss": -0.0803, "step": 976, "step_time": 6.561693021998508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.375, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.965168058872223, "epoch": 0.004885, "frac_reward_zero_std": 0.0, "grad_norm": 0.07599015533924103, "kl": 0.4046807512640953, "learning_rate": 7.999918054653424e-06, "loss": -0.0761, "num_tokens": 12865435.0, "reward": 0.12282884120941162, "reward_std": 1.4303593635559082, "rewards/rollout_reward_func/mean": 0.12282884120941162, "rewards/rollout_reward_func/std": 1.4303596019744873, "sampling/importance_sampling_ratio/max": 1.052350640296936, "sampling/importance_sampling_ratio/mean": 0.3132224380970001, "sampling/importance_sampling_ratio/min": 1.7067117497049367e-08, "sampling/sampling_logp_difference/max": 2.3293914794921875, "sampling/sampling_logp_difference/mean": 0.4369345009326935, "step": 977, "step_time": 13.336231607987429 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 2.9439927339553833, "epoch": 0.00489, "grad_norm": 0.06726393103599548, "kl": 0.42112695798277855, "learning_rate": 7.999917880395159e-06, "loss": -0.0764, "step": 978, "step_time": 5.31153864802036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6618042215704918, "epoch": 0.004895, "frac_reward_zero_std": 0.0, "grad_norm": 0.05989507585763931, "kl": 0.20744065195322037, "learning_rate": 7.99991770595181e-06, "loss": -0.0759, "num_tokens": 12884210.0, "reward": 1.5871789455413818, "reward_std": 0.9422606229782104, "rewards/rollout_reward_func/mean": 1.5871789455413818, "rewards/rollout_reward_func/std": 0.9422606825828552, "sampling/importance_sampling_ratio/max": 1.0624232292175293, "sampling/importance_sampling_ratio/mean": 0.8358370065689087, "sampling/importance_sampling_ratio/min": 1.4124067604370794e-07, "sampling/sampling_logp_difference/max": 1.9379887580871582, "sampling/sampling_logp_difference/mean": 0.3812241554260254, "step": 979, "step_time": 9.566828392999014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6511621363461018, "epoch": 0.0049, "grad_norm": 0.05378831923007965, "kl": 0.2133297584950924, "learning_rate": 7.999917531323379e-06, "loss": -0.0761, "step": 980, "step_time": 4.91931507200934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 3.857142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1612302120774984, "epoch": 0.004905, "frac_reward_zero_std": 0.5, "grad_norm": 0.1737418919801712, "kl": 0.24206549301743507, "learning_rate": 7.999917356509865e-06, "loss": -0.0459, "num_tokens": 12905009.0, "reward": 1.3733049631118774, "reward_std": 1.0969384908676147, "rewards/rollout_reward_func/mean": 1.3733049631118774, "rewards/rollout_reward_func/std": 1.0969384908676147, "sampling/importance_sampling_ratio/max": 1.0464732646942139, "sampling/importance_sampling_ratio/mean": 0.8575553297996521, "sampling/importance_sampling_ratio/min": 6.804633585488773e-07, "sampling/sampling_logp_difference/max": 1.737269401550293, "sampling/sampling_logp_difference/mean": 0.20869942009449005, "step": 981, "step_time": 10.389826209997409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1547661907970905, "epoch": 0.00491, "grad_norm": 0.17951472103595734, "kl": 0.24272574484348297, "learning_rate": 7.999917181511269e-06, "loss": -0.0461, "step": 982, "step_time": 5.1827030679851305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0311255156993866, "epoch": 0.004915, "frac_reward_zero_std": 0.0, "grad_norm": 0.08001622557640076, "kl": 0.21108851581811905, "learning_rate": 7.999917006327593e-06, "loss": -0.0884, "num_tokens": 12928078.0, "reward": 1.2581220865249634, "reward_std": 1.0697815418243408, "rewards/rollout_reward_func/mean": 1.2581220865249634, "rewards/rollout_reward_func/std": 1.0697815418243408, "sampling/importance_sampling_ratio/max": 1.1323848962783813, "sampling/importance_sampling_ratio/mean": 0.8490185141563416, "sampling/importance_sampling_ratio/min": 0.002869238145649433, "sampling/sampling_logp_difference/max": 1.4832699298858643, "sampling/sampling_logp_difference/mean": 0.15939997136592865, "step": 983, "step_time": 11.587732739993953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0225862227380276, "epoch": 0.00492, "grad_norm": 0.08023562282323837, "kl": 0.2117576003074646, "learning_rate": 7.999916830958833e-06, "loss": -0.0885, "step": 984, "step_time": 5.280036258001928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.5625, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.0412401854991913, "epoch": 0.004925, "frac_reward_zero_std": 0.0, "grad_norm": 0.12350935488939285, "kl": 0.18197142146527767, "learning_rate": 7.999916655404989e-06, "loss": -0.073, "num_tokens": 12957453.0, "reward": -0.2328949272632599, "reward_std": 1.0591342449188232, "rewards/rollout_reward_func/mean": -0.2328949272632599, "rewards/rollout_reward_func/std": 1.0591343641281128, "sampling/importance_sampling_ratio/max": 1.0637037754058838, "sampling/importance_sampling_ratio/mean": 0.3923279941082001, "sampling/importance_sampling_ratio/min": 3.9518502603641537e-07, "sampling/sampling_logp_difference/max": 1.7189431190490723, "sampling/sampling_logp_difference/mean": 0.47150975465774536, "step": 985, "step_time": 16.66536946600536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.02771458029747, "epoch": 0.00493, "grad_norm": 0.12277918308973312, "kl": 0.1881118370220065, "learning_rate": 7.999916479666064e-06, "loss": -0.0733, "step": 986, "step_time": 8.30959571499261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.414854556322098, "epoch": 0.004935, "frac_reward_zero_std": 0.0, "grad_norm": 0.16558705270290375, "kl": 0.18192031513899565, "learning_rate": 7.999916303742057e-06, "loss": -0.1042, "num_tokens": 12983091.0, "reward": 0.5680533647537231, "reward_std": 1.407642126083374, "rewards/rollout_reward_func/mean": 0.5680533647537231, "rewards/rollout_reward_func/std": 1.407642126083374, "sampling/importance_sampling_ratio/max": 1.1258249282836914, "sampling/importance_sampling_ratio/mean": 0.5264537930488586, "sampling/importance_sampling_ratio/min": 9.262697858503088e-05, "sampling/sampling_logp_difference/max": 1.8438836336135864, "sampling/sampling_logp_difference/mean": 0.3714631199836731, "step": 987, "step_time": 14.283218393000425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4006016850471497, "epoch": 0.00494, "grad_norm": 0.14027714729309082, "kl": 0.18294505029916763, "learning_rate": 7.999916127632967e-06, "loss": -0.1053, "step": 988, "step_time": 5.380995024024742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.159331601113081, "epoch": 0.004945, "frac_reward_zero_std": 0.5, "grad_norm": 0.06351876258850098, "kl": 0.4632725417613983, "learning_rate": 7.999915951338796e-06, "loss": -0.0574, "num_tokens": 13006279.0, "reward": 1.4327566623687744, "reward_std": 1.0688925981521606, "rewards/rollout_reward_func/mean": 1.4327566623687744, "rewards/rollout_reward_func/std": 1.0688925981521606, "sampling/importance_sampling_ratio/max": 1.1291911602020264, "sampling/importance_sampling_ratio/mean": 0.8633093237876892, "sampling/importance_sampling_ratio/min": 6.785688100308107e-08, "sampling/sampling_logp_difference/max": 1.7715157270431519, "sampling/sampling_logp_difference/mean": 0.2755095660686493, "step": 989, "step_time": 13.594647207006346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1447706297039986, "epoch": 0.00495, "grad_norm": 0.06161923334002495, "kl": 0.4652155674993992, "learning_rate": 7.99991577485954e-06, "loss": -0.0575, "step": 990, "step_time": 6.772692744983942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 5.583333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.110097348690033, "epoch": 0.004955, "frac_reward_zero_std": 0.0, "grad_norm": 0.08920228481292725, "kl": 0.14959883876144886, "learning_rate": 7.999915598195205e-06, "loss": -0.0932, "num_tokens": 13038264.0, "reward": 0.1529296338558197, "reward_std": 1.2987622022628784, "rewards/rollout_reward_func/mean": 0.1529296338558197, "rewards/rollout_reward_func/std": 1.2987622022628784, "sampling/importance_sampling_ratio/max": 1.0892235040664673, "sampling/importance_sampling_ratio/mean": 0.5959908962249756, "sampling/importance_sampling_ratio/min": 2.438454430375714e-05, "sampling/sampling_logp_difference/max": 2.5471949577331543, "sampling/sampling_logp_difference/mean": 0.36235904693603516, "step": 991, "step_time": 14.117951746025938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0979965031147003, "epoch": 0.00496, "grad_norm": 0.08453431725502014, "kl": 0.1504577398300171, "learning_rate": 7.999915421345786e-06, "loss": -0.093, "step": 992, "step_time": 6.381579602981219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.6875, "completions/mean_terminated_length": 4.6875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.471652790904045, "epoch": 0.004965, "frac_reward_zero_std": 0.0, "grad_norm": 0.33713725209236145, "kl": 0.310749813914299, "learning_rate": 7.999915244311286e-06, "loss": -0.071, "num_tokens": 13065078.0, "reward": 0.0934680625796318, "reward_std": 1.2397651672363281, "rewards/rollout_reward_func/mean": 0.0934680625796318, "rewards/rollout_reward_func/std": 1.2397651672363281, "sampling/importance_sampling_ratio/max": 1.1760144233703613, "sampling/importance_sampling_ratio/mean": 0.6856298446655273, "sampling/importance_sampling_ratio/min": 0.0008016981300897896, "sampling/sampling_logp_difference/max": 1.454392433166504, "sampling/sampling_logp_difference/mean": 0.23519787192344666, "step": 993, "step_time": 11.619589859983535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009615384973585606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "entropy": 1.4808944761753082, "epoch": 0.00497, "grad_norm": 0.3146250247955322, "kl": 0.32685522735118866, "learning_rate": 7.999915067091701e-06, "loss": -0.0722, "step": 994, "step_time": 5.988730407989351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.315079502761364, "epoch": 0.004975, "frac_reward_zero_std": 0.5, "grad_norm": 0.006946665234863758, "kl": 0.2181644793599844, "learning_rate": 7.999914889687037e-06, "loss": -0.0464, "num_tokens": 13089333.0, "reward": 0.7349300384521484, "reward_std": 1.3097825050354004, "rewards/rollout_reward_func/mean": 0.7349300384521484, "rewards/rollout_reward_func/std": 1.30978262424469, "sampling/importance_sampling_ratio/max": 1.030434250831604, "sampling/importance_sampling_ratio/mean": 0.7686924934387207, "sampling/importance_sampling_ratio/min": 6.9574925873894244e-06, "sampling/sampling_logp_difference/max": 1.3512403964996338, "sampling/sampling_logp_difference/mean": 0.21785809099674225, "step": 995, "step_time": 14.836990180978319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3129800986498594, "epoch": 0.00498, "grad_norm": 0.00647615734487772, "kl": 0.21790648251771927, "learning_rate": 7.999914712097288e-06, "loss": -0.0464, "step": 996, "step_time": 5.753895064000972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4370326474308968, "epoch": 0.004985, "frac_reward_zero_std": 0.0, "grad_norm": 0.01646311767399311, "kl": 0.14219597121700644, "learning_rate": 7.999914534322459e-06, "loss": -0.1178, "num_tokens": 13120633.0, "reward": 0.40896251797676086, "reward_std": 1.3219138383865356, "rewards/rollout_reward_func/mean": 0.40896251797676086, "rewards/rollout_reward_func/std": 1.3219139575958252, "sampling/importance_sampling_ratio/max": 1.0857194662094116, "sampling/importance_sampling_ratio/mean": 0.4531251788139343, "sampling/importance_sampling_ratio/min": 1.8522850950830616e-05, "sampling/sampling_logp_difference/max": 1.7244514226913452, "sampling/sampling_logp_difference/mean": 0.40234827995300293, "step": 997, "step_time": 16.61770295900351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4351625852286816, "epoch": 0.00499, "grad_norm": 0.015228606760501862, "kl": 0.14222395652905107, "learning_rate": 7.999914356362547e-06, "loss": -0.1178, "step": 998, "step_time": 6.439292497001588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.075145900249481, "epoch": 0.004995, "frac_reward_zero_std": 0.0, "grad_norm": 0.057948578149080276, "kl": 0.47631538286805153, "learning_rate": 7.999914178217553e-06, "loss": -0.0965, "num_tokens": 13143885.0, "reward": 0.8274744153022766, "reward_std": 1.5217703580856323, "rewards/rollout_reward_func/mean": 0.8274744153022766, "rewards/rollout_reward_func/std": 1.5217704772949219, "sampling/importance_sampling_ratio/max": 1.0965067148208618, "sampling/importance_sampling_ratio/mean": 0.6543292999267578, "sampling/importance_sampling_ratio/min": 2.3517113256765754e-10, "sampling/sampling_logp_difference/max": 2.3378944396972656, "sampling/sampling_logp_difference/mean": 0.4352513253688812, "step": 999, "step_time": 13.75452242299798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0683272778987885, "epoch": 0.005, "grad_norm": 0.060985688120126724, "kl": 0.49577005952596664, "learning_rate": 7.999913999887476e-06, "loss": -0.0965, "step": 1000, "step_time": 5.045049698004732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3923394978046417, "epoch": 0.005005, "frac_reward_zero_std": 0.0, "grad_norm": 0.7772773504257202, "kl": 3.29535099118948, "learning_rate": 7.999913821372318e-06, "loss": -0.0632, "num_tokens": 13177052.0, "reward": 0.03188568353652954, "reward_std": 1.3085092306137085, "rewards/rollout_reward_func/mean": 0.03188568353652954, "rewards/rollout_reward_func/std": 1.308509349822998, "sampling/importance_sampling_ratio/max": 1.056681513786316, "sampling/importance_sampling_ratio/mean": 0.4305807948112488, "sampling/importance_sampling_ratio/min": 1.8137565348297358e-05, "sampling/sampling_logp_difference/max": 2.1515254974365234, "sampling/sampling_logp_difference/mean": 0.38040730357170105, "step": 1001, "step_time": 16.672897245996865 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 2.4004111289978027, "epoch": 0.00501, "grad_norm": 0.32125669717788696, "kl": 1.4395738504827023, "learning_rate": 7.999913642672078e-06, "loss": -0.0705, "step": 1002, "step_time": 7.518372895006905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.663836717605591, "epoch": 0.005015, "frac_reward_zero_std": 0.0, "grad_norm": 0.29124754667282104, "kl": 0.10914342105388641, "learning_rate": 7.999913463786754e-06, "loss": -0.1158, "num_tokens": 13204078.0, "reward": -0.14522260427474976, "reward_std": 1.0419100522994995, "rewards/rollout_reward_func/mean": -0.14522260427474976, "rewards/rollout_reward_func/std": 1.0419100522994995, "sampling/importance_sampling_ratio/max": 1.2657169103622437, "sampling/importance_sampling_ratio/mean": 0.5199475884437561, "sampling/importance_sampling_ratio/min": 3.198685953975655e-05, "sampling/sampling_logp_difference/max": 1.8376660346984863, "sampling/sampling_logp_difference/mean": 0.3576434254646301, "step": 1003, "step_time": 14.370302964001894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6655355989933014, "epoch": 0.00502, "grad_norm": 0.27733907103538513, "kl": 0.10936888866126537, "learning_rate": 7.99991328471635e-06, "loss": -0.1159, "step": 1004, "step_time": 5.3254350109928055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.1875, "completions/mean_terminated_length": 6.700000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1675617694854736, "epoch": 0.005025, "frac_reward_zero_std": 0.0, "grad_norm": 0.06929407268762589, "kl": 0.11799463629722595, "learning_rate": 7.999913105460862e-06, "loss": -0.088, "num_tokens": 13233790.0, "reward": 0.10944998264312744, "reward_std": 1.3120783567428589, "rewards/rollout_reward_func/mean": 0.10944998264312744, "rewards/rollout_reward_func/std": 1.3120783567428589, "sampling/importance_sampling_ratio/max": 1.072278618812561, "sampling/importance_sampling_ratio/mean": 0.4453037977218628, "sampling/importance_sampling_ratio/min": 2.377276757670188e-07, "sampling/sampling_logp_difference/max": 2.8074421882629395, "sampling/sampling_logp_difference/mean": 0.534946084022522, "step": 1005, "step_time": 16.293948547012405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1674565076828003, "epoch": 0.00503, "grad_norm": 0.06231396272778511, "kl": 0.1221182206645608, "learning_rate": 7.999912926020293e-06, "loss": -0.088, "step": 1006, "step_time": 6.835860084014712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 5.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.872415214776993, "epoch": 0.005035, "frac_reward_zero_std": 0.0, "grad_norm": 0.06640477478504181, "kl": 0.1274752588942647, "learning_rate": 7.999912746394643e-06, "loss": -0.0828, "num_tokens": 13264631.0, "reward": -0.3417517840862274, "reward_std": 1.1204328536987305, "rewards/rollout_reward_func/mean": -0.3417517840862274, "rewards/rollout_reward_func/std": 1.12043297290802, "sampling/importance_sampling_ratio/max": 1.0508794784545898, "sampling/importance_sampling_ratio/mean": 0.2960575222969055, "sampling/importance_sampling_ratio/min": 2.0745867459481815e-06, "sampling/sampling_logp_difference/max": 2.1370670795440674, "sampling/sampling_logp_difference/mean": 0.4677940905094147, "step": 1007, "step_time": 17.295221086984384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8734205663204193, "epoch": 0.00504, "grad_norm": 0.06882590055465698, "kl": 0.1249053068459034, "learning_rate": 7.999912566583908e-06, "loss": -0.0827, "step": 1008, "step_time": 7.253725279006176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8468300551176071, "epoch": 0.005045, "frac_reward_zero_std": 0.0, "grad_norm": 0.0435558557510376, "kl": 0.2232966348528862, "learning_rate": 7.999912386588093e-06, "loss": -0.0738, "num_tokens": 13299563.0, "reward": 0.5841485261917114, "reward_std": 1.362634539604187, "rewards/rollout_reward_func/mean": 0.5841485261917114, "rewards/rollout_reward_func/std": 1.3626344203948975, "sampling/importance_sampling_ratio/max": 1.0650484561920166, "sampling/importance_sampling_ratio/mean": 0.6501653790473938, "sampling/importance_sampling_ratio/min": 8.164111932273954e-05, "sampling/sampling_logp_difference/max": 1.6992136240005493, "sampling/sampling_logp_difference/mean": 0.2847430408000946, "step": 1009, "step_time": 17.960681944008684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.849673017859459, "epoch": 0.00505, "grad_norm": 0.04609455540776253, "kl": 0.22308273240923882, "learning_rate": 7.999912206407195e-06, "loss": -0.0737, "step": 1010, "step_time": 7.919483017991297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.083333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0303929448127747, "epoch": 0.005055, "frac_reward_zero_std": 0.0, "grad_norm": 0.12763464450836182, "kl": 0.2022153101861477, "learning_rate": 7.999912026041215e-06, "loss": -0.0801, "num_tokens": 13326914.0, "reward": -0.07443991303443909, "reward_std": 0.7061072587966919, "rewards/rollout_reward_func/mean": -0.07443991303443909, "rewards/rollout_reward_func/std": 0.7061071991920471, "sampling/importance_sampling_ratio/max": 1.088724136352539, "sampling/importance_sampling_ratio/mean": 0.5837680101394653, "sampling/importance_sampling_ratio/min": 0.00012413809599820524, "sampling/sampling_logp_difference/max": 1.5715621709823608, "sampling/sampling_logp_difference/mean": 0.3245109021663666, "step": 1011, "step_time": 16.11831053001515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.027041345834732, "epoch": 0.00506, "grad_norm": 0.14295175671577454, "kl": 0.1913781277835369, "learning_rate": 7.999911845490154e-06, "loss": -0.0812, "step": 1012, "step_time": 6.730327456010855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.226347144693136, "epoch": 0.005065, "frac_reward_zero_std": 0.0, "grad_norm": 0.031331729143857956, "kl": 0.16284096986055374, "learning_rate": 7.999911664754011e-06, "loss": -0.1011, "num_tokens": 13360730.0, "reward": 0.7062766551971436, "reward_std": 1.1933878660202026, "rewards/rollout_reward_func/mean": 0.7062766551971436, "rewards/rollout_reward_func/std": 1.1933879852294922, "sampling/importance_sampling_ratio/max": 1.1916446685791016, "sampling/importance_sampling_ratio/mean": 0.6541247963905334, "sampling/importance_sampling_ratio/min": 5.021718607167713e-06, "sampling/sampling_logp_difference/max": 1.8052362203598022, "sampling/sampling_logp_difference/mean": 0.3968290090560913, "step": 1013, "step_time": 16.312099617003696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2260910496115685, "epoch": 0.00507, "grad_norm": 0.03281581029295921, "kl": 0.1627949494868517, "learning_rate": 7.999911483832785e-06, "loss": -0.101, "step": 1014, "step_time": 7.65406037599314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8286135867238045, "epoch": 0.005075, "frac_reward_zero_std": 0.5, "grad_norm": 0.09738011658191681, "kl": 0.31274695694446564, "learning_rate": 7.999911302726476e-06, "loss": -0.0381, "num_tokens": 13384865.0, "reward": 0.7484679222106934, "reward_std": 1.296507477760315, "rewards/rollout_reward_func/mean": 0.7484679222106934, "rewards/rollout_reward_func/std": 1.2965075969696045, "sampling/importance_sampling_ratio/max": 1.0676347017288208, "sampling/importance_sampling_ratio/mean": 0.8615055680274963, "sampling/importance_sampling_ratio/min": 0.0018162743654102087, "sampling/sampling_logp_difference/max": 1.2977447509765625, "sampling/sampling_logp_difference/mean": 0.11900758743286133, "step": 1015, "step_time": 12.641270569016342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8235649615526199, "epoch": 0.00508, "grad_norm": 0.0945579782128334, "kl": 0.3075447753071785, "learning_rate": 7.999911121435088e-06, "loss": -0.0384, "step": 1016, "step_time": 6.586245873986627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.733333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4150178134441376, "epoch": 0.005085, "frac_reward_zero_std": 0.0, "grad_norm": 0.09176552295684814, "kl": 0.6638038363307714, "learning_rate": 7.999910939958615e-06, "loss": -0.0678, "num_tokens": 13411292.0, "reward": 0.5390459299087524, "reward_std": 1.5043903589248657, "rewards/rollout_reward_func/mean": 0.5390459299087524, "rewards/rollout_reward_func/std": 1.5043903589248657, "sampling/importance_sampling_ratio/max": 1.1075083017349243, "sampling/importance_sampling_ratio/mean": 0.7095940709114075, "sampling/importance_sampling_ratio/min": 0.0007014479488134384, "sampling/sampling_logp_difference/max": 1.48842191696167, "sampling/sampling_logp_difference/mean": 0.24795275926589966, "step": 1017, "step_time": 12.084419131002505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4197121411561966, "epoch": 0.00509, "grad_norm": 0.09046906232833862, "kl": 0.6129512712359428, "learning_rate": 7.999910758297063e-06, "loss": -0.0679, "step": 1018, "step_time": 5.756661787003395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9776521623134613, "epoch": 0.005095, "frac_reward_zero_std": 0.0, "grad_norm": 0.23522642254829407, "kl": 0.21045831218361855, "learning_rate": 7.999910576450427e-06, "loss": -0.0309, "num_tokens": 13440035.0, "reward": 0.28018617630004883, "reward_std": 1.2393243312835693, "rewards/rollout_reward_func/mean": 0.28018617630004883, "rewards/rollout_reward_func/std": 1.2393243312835693, "sampling/importance_sampling_ratio/max": 1.0661710500717163, "sampling/importance_sampling_ratio/mean": 0.8686482906341553, "sampling/importance_sampling_ratio/min": 6.299664528341964e-05, "sampling/sampling_logp_difference/max": 1.5690598487854004, "sampling/sampling_logp_difference/mean": 0.15512892603874207, "step": 1019, "step_time": 13.557873172976542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9877706207334995, "epoch": 0.0051, "grad_norm": 0.24458225071430206, "kl": 0.2088216245174408, "learning_rate": 7.999910394418708e-06, "loss": -0.0312, "step": 1020, "step_time": 6.217490909009939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8112188279628754, "epoch": 0.005105, "frac_reward_zero_std": 0.0, "grad_norm": 0.04806939885020256, "kl": 0.14132758602499962, "learning_rate": 7.99991021220191e-06, "loss": -0.1089, "num_tokens": 13465819.0, "reward": 0.9157446026802063, "reward_std": 1.3889049291610718, "rewards/rollout_reward_func/mean": 0.9157446026802063, "rewards/rollout_reward_func/std": 1.3889049291610718, "sampling/importance_sampling_ratio/max": 1.0327739715576172, "sampling/importance_sampling_ratio/mean": 0.6349939703941345, "sampling/importance_sampling_ratio/min": 0.00014477453078143299, "sampling/sampling_logp_difference/max": 1.3329486846923828, "sampling/sampling_logp_difference/mean": 0.27398791909217834, "step": 1021, "step_time": 13.29310853802599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8105089962482452, "epoch": 0.00511, "grad_norm": 0.05448240414261818, "kl": 0.14099782519042492, "learning_rate": 7.999910029800027e-06, "loss": -0.1088, "step": 1022, "step_time": 5.334989043985843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.16828157752752304, "epoch": 0.005115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005638732691295445, "kl": 0.33799730241298676, "learning_rate": 7.999909847213064e-06, "loss": 0.0008, "num_tokens": 13484209.0, "reward": 1.6035562753677368, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.6035562753677368, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0294772386550903, "sampling/importance_sampling_ratio/mean": 1.0063467025756836, "sampling/importance_sampling_ratio/min": 0.9823023080825806, "sampling/sampling_logp_difference/max": 0.04503694921731949, "sampling/sampling_logp_difference/mean": 0.008159428834915161, "step": 1023, "step_time": 6.19154161500046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1689821071922779, "epoch": 0.00512, "grad_norm": 0.0005588586791418493, "kl": 0.3379911333322525, "learning_rate": 7.999909664441018e-06, "loss": 0.0008, "step": 1024, "step_time": 3.408306684999843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4596625566482544, "epoch": 0.005125, "frac_reward_zero_std": 0.5, "grad_norm": 0.008197121322154999, "kl": 0.22840291261672974, "learning_rate": 7.99990948148389e-06, "loss": -0.0383, "num_tokens": 13502084.0, "reward": 0.44228240847587585, "reward_std": 1.4188969135284424, "rewards/rollout_reward_func/mean": 0.44228240847587585, "rewards/rollout_reward_func/std": 1.418897032737732, "sampling/importance_sampling_ratio/max": 1.0236517190933228, "sampling/importance_sampling_ratio/mean": 0.9489533305168152, "sampling/importance_sampling_ratio/min": 0.0008781572105363011, "sampling/sampling_logp_difference/max": 1.1383055448532104, "sampling/sampling_logp_difference/mean": 0.08429829776287079, "step": 1025, "step_time": 9.431369847996393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.46076609939336777, "epoch": 0.00513, "grad_norm": 0.008155659772455692, "kl": 0.22829775139689445, "learning_rate": 7.999909298341683e-06, "loss": -0.0383, "step": 1026, "step_time": 5.253457041995716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4744698256254196, "epoch": 0.005135, "frac_reward_zero_std": 0.0, "grad_norm": 0.03288056701421738, "kl": 0.18604872282594442, "learning_rate": 7.999909115014391e-06, "loss": -0.0971, "num_tokens": 13532590.0, "reward": 0.9236626625061035, "reward_std": 1.2975395917892456, "rewards/rollout_reward_func/mean": 0.9236626625061035, "rewards/rollout_reward_func/std": 1.2975395917892456, "sampling/importance_sampling_ratio/max": 1.097099781036377, "sampling/importance_sampling_ratio/mean": 0.7168139219284058, "sampling/importance_sampling_ratio/min": 0.00042099118581973016, "sampling/sampling_logp_difference/max": 1.6225860118865967, "sampling/sampling_logp_difference/mean": 0.26585695147514343, "step": 1027, "step_time": 16.48491159499099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4761549308896065, "epoch": 0.00514, "grad_norm": 0.03289981186389923, "kl": 0.18637138232588768, "learning_rate": 7.999908931502018e-06, "loss": -0.0971, "step": 1028, "step_time": 7.742531794006936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3151644580066204, "epoch": 0.005145, "frac_reward_zero_std": 0.0, "grad_norm": 0.09310189634561539, "kl": 2.3728351034224033, "learning_rate": 7.999908747804563e-06, "loss": -0.0843, "num_tokens": 13562673.0, "reward": 0.8919792175292969, "reward_std": 1.2364697456359863, "rewards/rollout_reward_func/mean": 0.8919792175292969, "rewards/rollout_reward_func/std": 1.2364697456359863, "sampling/importance_sampling_ratio/max": 1.0682480335235596, "sampling/importance_sampling_ratio/mean": 0.7056445479393005, "sampling/importance_sampling_ratio/min": 1.6114248865051195e-05, "sampling/sampling_logp_difference/max": 1.9293336868286133, "sampling/sampling_logp_difference/mean": 0.2604254484176636, "step": 1029, "step_time": 14.27808714400453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3138066455721855, "epoch": 0.00515, "grad_norm": 0.08840509504079819, "kl": 2.252616088837385, "learning_rate": 7.999908563922027e-06, "loss": -0.0845, "step": 1030, "step_time": 6.271789959981106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.836109310388565, "epoch": 0.005155, "frac_reward_zero_std": 0.0, "grad_norm": 0.19702179729938507, "kl": 0.38485950976610184, "learning_rate": 7.99990837985441e-06, "loss": -0.0524, "num_tokens": 13590795.0, "reward": 0.2109423279762268, "reward_std": 0.9833577275276184, "rewards/rollout_reward_func/mean": 0.2109423279762268, "rewards/rollout_reward_func/std": 0.9833577275276184, "sampling/importance_sampling_ratio/max": 1.0453671216964722, "sampling/importance_sampling_ratio/mean": 0.5310918092727661, "sampling/importance_sampling_ratio/min": 0.00022943661315366626, "sampling/sampling_logp_difference/max": 1.4788347482681274, "sampling/sampling_logp_difference/mean": 0.2945214807987213, "step": 1031, "step_time": 15.292011436002213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8281218111515045, "epoch": 0.00516, "grad_norm": 0.2054913491010666, "kl": 0.3806201219558716, "learning_rate": 7.99990819560171e-06, "loss": -0.0528, "step": 1032, "step_time": 6.470117812990793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8661027438938618, "epoch": 0.005165, "frac_reward_zero_std": 0.5, "grad_norm": 0.1441674828529358, "kl": 0.6450191773474216, "learning_rate": 7.999908011163928e-06, "loss": -0.0307, "num_tokens": 13614568.0, "reward": 0.9849625825881958, "reward_std": 1.2196426391601562, "rewards/rollout_reward_func/mean": 0.9849625825881958, "rewards/rollout_reward_func/std": 1.2196426391601562, "sampling/importance_sampling_ratio/max": 1.1343443393707275, "sampling/importance_sampling_ratio/mean": 0.8375261425971985, "sampling/importance_sampling_ratio/min": 0.00010504764213692397, "sampling/sampling_logp_difference/max": 2.171816110610962, "sampling/sampling_logp_difference/mean": 0.16447962820529938, "step": 1033, "step_time": 14.250935316013056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8642017096281052, "epoch": 0.00517, "grad_norm": 0.10317801684141159, "kl": 0.5486396849155426, "learning_rate": 7.999907826541062e-06, "loss": -0.0313, "step": 1034, "step_time": 7.270448071983992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7722365856170654, "epoch": 0.005175, "frac_reward_zero_std": 0.0, "grad_norm": 0.08021502941846848, "kl": 0.5191344507038593, "learning_rate": 7.999907641733117e-06, "loss": -0.0717, "num_tokens": 13637819.0, "reward": 1.261744499206543, "reward_std": 1.0423380136489868, "rewards/rollout_reward_func/mean": 1.261744499206543, "rewards/rollout_reward_func/std": 1.0423381328582764, "sampling/importance_sampling_ratio/max": 1.0999089479446411, "sampling/importance_sampling_ratio/mean": 0.6511881351470947, "sampling/importance_sampling_ratio/min": 4.5674237725279454e-08, "sampling/sampling_logp_difference/max": 1.7652090787887573, "sampling/sampling_logp_difference/mean": 0.2815083861351013, "step": 1035, "step_time": 14.743873227009317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.770539402961731, "epoch": 0.00518, "grad_norm": 0.07704168558120728, "kl": 0.5291010290384293, "learning_rate": 7.99990745674009e-06, "loss": -0.072, "step": 1036, "step_time": 5.63448502200481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3379138261079788, "epoch": 0.005185, "frac_reward_zero_std": 0.5, "grad_norm": 0.10462907701730728, "kl": 0.4881795346736908, "learning_rate": 7.99990727156198e-06, "loss": -0.0407, "num_tokens": 13664428.0, "reward": 0.9051414132118225, "reward_std": 1.2734460830688477, "rewards/rollout_reward_func/mean": 0.9051414132118225, "rewards/rollout_reward_func/std": 1.2734460830688477, "sampling/importance_sampling_ratio/max": 1.0345731973648071, "sampling/importance_sampling_ratio/mean": 0.6808450222015381, "sampling/importance_sampling_ratio/min": 0.0001574763737153262, "sampling/sampling_logp_difference/max": 1.535942792892456, "sampling/sampling_logp_difference/mean": 0.20377102494239807, "step": 1037, "step_time": 15.681907610982307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3337315246462822, "epoch": 0.00519, "grad_norm": 0.09350459277629852, "kl": 0.45169734209775925, "learning_rate": 7.999907086198791e-06, "loss": -0.0411, "step": 1038, "step_time": 6.982644375995733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.1875, "completions/mean_terminated_length": 4.1875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6682744920253754, "epoch": 0.005195, "frac_reward_zero_std": 0.0, "grad_norm": 0.15822172164916992, "kl": 1.1075872257351875, "learning_rate": 7.999906900650518e-06, "loss": -0.0441, "num_tokens": 13683916.0, "reward": 0.6874213814735413, "reward_std": 1.4621150493621826, "rewards/rollout_reward_func/mean": 0.6874213814735413, "rewards/rollout_reward_func/std": 1.4621150493621826, "sampling/importance_sampling_ratio/max": 1.03071928024292, "sampling/importance_sampling_ratio/mean": 0.8224105834960938, "sampling/importance_sampling_ratio/min": 0.060533881187438965, "sampling/sampling_logp_difference/max": 1.3675298690795898, "sampling/sampling_logp_difference/mean": 0.10178539156913757, "step": 1039, "step_time": 8.594800677994499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6806157007813454, "epoch": 0.0052, "grad_norm": 0.13430924713611603, "kl": 0.9582837000489235, "learning_rate": 7.999906714917164e-06, "loss": -0.0445, "step": 1040, "step_time": 4.872341548994882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5219022929668427, "epoch": 0.005205, "frac_reward_zero_std": 0.5, "grad_norm": 0.02150968834757805, "kl": 0.21833433955907822, "learning_rate": 7.999906528998727e-06, "loss": -0.0422, "num_tokens": 13706763.0, "reward": 0.9566179513931274, "reward_std": 1.3816112279891968, "rewards/rollout_reward_func/mean": 0.9566179513931274, "rewards/rollout_reward_func/std": 1.3816112279891968, "sampling/importance_sampling_ratio/max": 1.0693475008010864, "sampling/importance_sampling_ratio/mean": 0.7049062252044678, "sampling/importance_sampling_ratio/min": 2.296451202710159e-05, "sampling/sampling_logp_difference/max": 1.6163227558135986, "sampling/sampling_logp_difference/mean": 0.20024700462818146, "step": 1041, "step_time": 14.570505016992684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5241915881633759, "epoch": 0.00521, "grad_norm": 0.021965045481920242, "kl": 0.21831346303224564, "learning_rate": 7.99990634289521e-06, "loss": -0.0423, "step": 1042, "step_time": 6.190928494994296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.5625, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6929160952568054, "epoch": 0.005215, "frac_reward_zero_std": 0.0, "grad_norm": 0.23888371884822845, "kl": 0.11468271166086197, "learning_rate": 7.99990615660661e-06, "loss": -0.086, "num_tokens": 13736483.0, "reward": 0.06731168925762177, "reward_std": 1.240870475769043, "rewards/rollout_reward_func/mean": 0.06731168925762177, "rewards/rollout_reward_func/std": 1.240870475769043, "sampling/importance_sampling_ratio/max": 1.0651040077209473, "sampling/importance_sampling_ratio/mean": 0.4060794413089752, "sampling/importance_sampling_ratio/min": 7.390513474092586e-06, "sampling/sampling_logp_difference/max": 2.0934247970581055, "sampling/sampling_logp_difference/mean": 0.42189550399780273, "step": 1043, "step_time": 12.501671172009083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6871271431446075, "epoch": 0.00522, "grad_norm": 0.23842309415340424, "kl": 0.11408131290227175, "learning_rate": 7.999905970132928e-06, "loss": -0.0868, "step": 1044, "step_time": 5.605699624007684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.076923370361328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4176495745778084, "epoch": 0.005225, "frac_reward_zero_std": 0.0, "grad_norm": 0.2354019433259964, "kl": 0.24566609784960747, "learning_rate": 7.999905783474166e-06, "loss": -0.0844, "num_tokens": 13765290.0, "reward": 0.9006334543228149, "reward_std": 1.1914383172988892, "rewards/rollout_reward_func/mean": 0.9006334543228149, "rewards/rollout_reward_func/std": 1.1914384365081787, "sampling/importance_sampling_ratio/max": 1.0881333351135254, "sampling/importance_sampling_ratio/mean": 0.7342699766159058, "sampling/importance_sampling_ratio/min": 0.00037069327663630247, "sampling/sampling_logp_difference/max": 1.022578239440918, "sampling/sampling_logp_difference/mean": 0.22407713532447815, "step": 1045, "step_time": 12.371067132989992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02500000037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02500000037252903, "entropy": 1.4113108962774277, "epoch": 0.00523, "grad_norm": 0.14964154362678528, "kl": 0.2450931929051876, "learning_rate": 7.999905596630321e-06, "loss": -0.0855, "step": 1046, "step_time": 5.37773933999415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.82777339220047, "epoch": 0.005235, "frac_reward_zero_std": 0.5, "grad_norm": 0.11223480105400085, "kl": 0.19298508763313293, "learning_rate": 7.999905409601396e-06, "loss": -0.0411, "num_tokens": 13789444.0, "reward": 0.7030669450759888, "reward_std": 1.2664821147918701, "rewards/rollout_reward_func/mean": 0.7030669450759888, "rewards/rollout_reward_func/std": 1.2664822340011597, "sampling/importance_sampling_ratio/max": 1.0595974922180176, "sampling/importance_sampling_ratio/mean": 0.6950412392616272, "sampling/importance_sampling_ratio/min": 1.5172731764323544e-06, "sampling/sampling_logp_difference/max": 1.7627592086791992, "sampling/sampling_logp_difference/mean": 0.2607760429382324, "step": 1047, "step_time": 14.950019880983746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8368675112724304, "epoch": 0.00524, "grad_norm": 0.12153232842683792, "kl": 0.18931091018021107, "learning_rate": 7.999905222387389e-06, "loss": -0.0416, "step": 1048, "step_time": 6.434699327015551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9569545984268188, "epoch": 0.005245, "frac_reward_zero_std": 0.0, "grad_norm": 0.16899563372135162, "kl": 0.3059968315064907, "learning_rate": 7.999905034988298e-06, "loss": -0.0828, "num_tokens": 13812151.0, "reward": -0.37683388590812683, "reward_std": 1.1274921894073486, "rewards/rollout_reward_func/mean": -0.37683388590812683, "rewards/rollout_reward_func/std": 1.1274923086166382, "sampling/importance_sampling_ratio/max": 1.1048109531402588, "sampling/importance_sampling_ratio/mean": 0.7082148790359497, "sampling/importance_sampling_ratio/min": 2.535545320370147e-07, "sampling/sampling_logp_difference/max": 1.9573798179626465, "sampling/sampling_logp_difference/mean": 0.42675095796585083, "step": 1049, "step_time": 12.64586245000828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 1.9909835010766983, "epoch": 0.00525, "grad_norm": 0.13724450767040253, "kl": 0.26103105768561363, "learning_rate": 7.999904847404128e-06, "loss": -0.0837, "step": 1050, "step_time": 6.152653751996695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 5.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.900716096162796, "epoch": 0.005255, "frac_reward_zero_std": 0.5, "grad_norm": 0.01900384947657585, "kl": 0.2244197130203247, "learning_rate": 7.999904659634875e-06, "loss": -0.0396, "num_tokens": 13834851.0, "reward": 1.0340396165847778, "reward_std": 1.3650498390197754, "rewards/rollout_reward_func/mean": 1.0340396165847778, "rewards/rollout_reward_func/std": 1.365049958229065, "sampling/importance_sampling_ratio/max": 1.0534818172454834, "sampling/importance_sampling_ratio/mean": 0.647818386554718, "sampling/importance_sampling_ratio/min": 1.0467667266311764e-07, "sampling/sampling_logp_difference/max": 1.6852943897247314, "sampling/sampling_logp_difference/mean": 0.2857912480831146, "step": 1051, "step_time": 13.853683216992067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.901894360780716, "epoch": 0.00526, "grad_norm": 0.018892871215939522, "kl": 0.22681745141744614, "learning_rate": 7.999904471680541e-06, "loss": -0.0396, "step": 1052, "step_time": 6.042705701984232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.373821973800659, "epoch": 0.005265, "frac_reward_zero_std": 0.0, "grad_norm": 0.13874396681785583, "kl": 0.591581042855978, "learning_rate": 7.999904283541125e-06, "loss": -0.073, "num_tokens": 13865028.0, "reward": 0.11613133549690247, "reward_std": 1.2380249500274658, "rewards/rollout_reward_func/mean": 0.11613133549690247, "rewards/rollout_reward_func/std": 1.2380249500274658, "sampling/importance_sampling_ratio/max": 1.2359257936477661, "sampling/importance_sampling_ratio/mean": 0.5332789421081543, "sampling/importance_sampling_ratio/min": 1.6422438875451917e-07, "sampling/sampling_logp_difference/max": 2.1101818084716797, "sampling/sampling_logp_difference/mean": 0.40666860342025757, "step": 1053, "step_time": 15.289578308016644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3728063702583313, "epoch": 0.00527, "grad_norm": 0.13946771621704102, "kl": 0.5920413993299007, "learning_rate": 7.999904095216627e-06, "loss": -0.0731, "step": 1054, "step_time": 6.171697381010745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7858774065971375, "epoch": 0.005275, "frac_reward_zero_std": 0.0, "grad_norm": 0.037926748394966125, "kl": 0.19097153097391129, "learning_rate": 7.999903906707048e-06, "loss": -0.0972, "num_tokens": 13894472.0, "reward": 0.6169992685317993, "reward_std": 1.3987129926681519, "rewards/rollout_reward_func/mean": 0.6169992685317993, "rewards/rollout_reward_func/std": 1.3987129926681519, "sampling/importance_sampling_ratio/max": 1.1347960233688354, "sampling/importance_sampling_ratio/mean": 0.5996033549308777, "sampling/importance_sampling_ratio/min": 0.0001999193918891251, "sampling/sampling_logp_difference/max": 1.679100513458252, "sampling/sampling_logp_difference/mean": 0.3124525249004364, "step": 1055, "step_time": 16.452045691010426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7907936573028564, "epoch": 0.00528, "grad_norm": 0.03940623253583908, "kl": 0.1921121384948492, "learning_rate": 7.999903718012388e-06, "loss": -0.0971, "step": 1056, "step_time": 7.555370848989696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1868321783840656, "epoch": 0.005285, "frac_reward_zero_std": 0.0, "grad_norm": 0.16748006641864777, "kl": 0.16664094105362892, "learning_rate": 7.999903529132647e-06, "loss": -0.0934, "num_tokens": 13919271.0, "reward": 0.21898192167282104, "reward_std": 1.362083911895752, "rewards/rollout_reward_func/mean": 0.21898192167282104, "rewards/rollout_reward_func/std": 1.3620840311050415, "sampling/importance_sampling_ratio/max": 1.2115516662597656, "sampling/importance_sampling_ratio/mean": 0.5683777332305908, "sampling/importance_sampling_ratio/min": 3.129443086891115e-07, "sampling/sampling_logp_difference/max": 2.1373648643493652, "sampling/sampling_logp_difference/mean": 0.4028313457965851, "step": 1057, "step_time": 15.502850810007658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.179240457713604, "epoch": 0.00529, "grad_norm": 0.1259249746799469, "kl": 0.1697754804044962, "learning_rate": 7.999903340067822e-06, "loss": -0.094, "step": 1058, "step_time": 5.938870409998344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.1875, "completions/mean_terminated_length": 8.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.8446674942970276, "epoch": 0.005295, "frac_reward_zero_std": 0.0, "grad_norm": 0.07403957098722458, "kl": 0.08677994180470705, "learning_rate": 7.999903150817919e-06, "loss": -0.0785, "num_tokens": 13944344.0, "reward": -0.5164035558700562, "reward_std": 1.1170413494110107, "rewards/rollout_reward_func/mean": -0.5164035558700562, "rewards/rollout_reward_func/std": 1.1170413494110107, "sampling/importance_sampling_ratio/max": 1.0735474824905396, "sampling/importance_sampling_ratio/mean": 0.2001325488090515, "sampling/importance_sampling_ratio/min": 1.2255904380253924e-07, "sampling/sampling_logp_difference/max": 2.204979419708252, "sampling/sampling_logp_difference/mean": 0.5693594217300415, "step": 1059, "step_time": 13.548292748004314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.851576864719391, "epoch": 0.0053, "grad_norm": 0.06891163438558578, "kl": 0.08811763348057866, "learning_rate": 7.99990296138293e-06, "loss": -0.0787, "step": 1060, "step_time": 5.269588159018895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.781240850687027, "epoch": 0.005305, "frac_reward_zero_std": 0.5, "grad_norm": 0.014863302931189537, "kl": 0.14256404526531696, "learning_rate": 7.999902771762864e-06, "loss": -0.0381, "num_tokens": 13968240.0, "reward": 0.4708166718482971, "reward_std": 1.3115216493606567, "rewards/rollout_reward_func/mean": 0.4708166718482971, "rewards/rollout_reward_func/std": 1.3115217685699463, "sampling/importance_sampling_ratio/max": 1.1366209983825684, "sampling/importance_sampling_ratio/mean": 0.5869830250740051, "sampling/importance_sampling_ratio/min": 5.076440174889285e-07, "sampling/sampling_logp_difference/max": 2.142573595046997, "sampling/sampling_logp_difference/mean": 0.23546823859214783, "step": 1061, "step_time": 15.13777256301546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.782749354839325, "epoch": 0.00531, "grad_norm": 0.0147150419652462, "kl": 0.144409392029047, "learning_rate": 7.999902581957715e-06, "loss": -0.0381, "step": 1062, "step_time": 5.346700344991405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.987999975681305, "epoch": 0.005315, "frac_reward_zero_std": 0.0, "grad_norm": 0.04889003187417984, "kl": 0.10902734100818634, "learning_rate": 7.999902391967483e-06, "loss": -0.1068, "num_tokens": 14001800.0, "reward": 0.23749028146266937, "reward_std": 1.25753653049469, "rewards/rollout_reward_func/mean": 0.23749028146266937, "rewards/rollout_reward_func/std": 1.25753653049469, "sampling/importance_sampling_ratio/max": 1.1300816535949707, "sampling/importance_sampling_ratio/mean": 0.4601016938686371, "sampling/importance_sampling_ratio/min": 1.4691106116515584e-05, "sampling/sampling_logp_difference/max": 2.035181999206543, "sampling/sampling_logp_difference/mean": 0.39062830805778503, "step": 1063, "step_time": 16.962641818012344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.985663414001465, "epoch": 0.00532, "grad_norm": 0.04623783379793167, "kl": 0.10954258777201176, "learning_rate": 7.99990220179217e-06, "loss": -0.1069, "step": 1064, "step_time": 7.035009238010389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4906055927276611, "epoch": 0.005325, "frac_reward_zero_std": 0.5, "grad_norm": 0.3915543258190155, "kl": 0.1902727596461773, "learning_rate": 7.999902011431777e-06, "loss": -0.0565, "num_tokens": 14021887.0, "reward": 1.2702186107635498, "reward_std": 1.2464960813522339, "rewards/rollout_reward_func/mean": 1.2702186107635498, "rewards/rollout_reward_func/std": 1.2464960813522339, "sampling/importance_sampling_ratio/max": 1.054527759552002, "sampling/importance_sampling_ratio/mean": 0.7603482604026794, "sampling/importance_sampling_ratio/min": 0.00016414055426139385, "sampling/sampling_logp_difference/max": 1.4555213451385498, "sampling/sampling_logp_difference/mean": 0.1815081387758255, "step": 1065, "step_time": 12.589235499995993 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 1.4721640348434448, "epoch": 0.00533, "grad_norm": 0.12554852664470673, "kl": 0.19282346218824387, "learning_rate": 7.999901820886303e-06, "loss": -0.0582, "step": 1066, "step_time": 5.04298591297993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7221285328269005, "epoch": 0.005335, "frac_reward_zero_std": 0.0, "grad_norm": 0.4463253617286682, "kl": 0.1588586112484336, "learning_rate": 7.999901630155745e-06, "loss": -0.0964, "num_tokens": 14047758.0, "reward": -0.19526079297065735, "reward_std": 0.808151125907898, "rewards/rollout_reward_func/mean": -0.19526079297065735, "rewards/rollout_reward_func/std": 0.808151125907898, "sampling/importance_sampling_ratio/max": 1.259968876838684, "sampling/importance_sampling_ratio/mean": 0.6590825319290161, "sampling/importance_sampling_ratio/min": 5.883379117221921e-07, "sampling/sampling_logp_difference/max": 1.7910736799240112, "sampling/sampling_logp_difference/mean": 0.3112253248691559, "step": 1067, "step_time": 17.464846364004188 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.6971126310527325, "epoch": 0.00534, "grad_norm": 0.04334885999560356, "kl": 0.1638568378984928, "learning_rate": 7.999901439240108e-06, "loss": -0.0974, "step": 1068, "step_time": 6.248353347997181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.818181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4754176288843155, "epoch": 0.005345, "frac_reward_zero_std": 0.0, "grad_norm": 0.1472657024860382, "kl": 1.1864728033542633, "learning_rate": 7.999901248139388e-06, "loss": -0.054, "num_tokens": 14077768.0, "reward": 0.5460367202758789, "reward_std": 1.3983228206634521, "rewards/rollout_reward_func/mean": 0.5460367202758789, "rewards/rollout_reward_func/std": 1.3983229398727417, "sampling/importance_sampling_ratio/max": 1.0470401048660278, "sampling/importance_sampling_ratio/mean": 0.5896389484405518, "sampling/importance_sampling_ratio/min": 0.00017908001609612256, "sampling/sampling_logp_difference/max": 1.78047513961792, "sampling/sampling_logp_difference/mean": 0.21480903029441833, "step": 1069, "step_time": 16.172881321021123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4722938686609268, "epoch": 0.00535, "grad_norm": 0.13351571559906006, "kl": 1.1266948394477367, "learning_rate": 7.999901056853587e-06, "loss": -0.0542, "step": 1070, "step_time": 6.23907626599248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 6.133333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5929343104362488, "epoch": 0.005355, "frac_reward_zero_std": 0.0, "grad_norm": 0.11915795505046844, "kl": 0.5758934915065765, "learning_rate": 7.999900865382704e-06, "loss": -0.0815, "num_tokens": 14101387.0, "reward": 1.0651934146881104, "reward_std": 1.3481839895248413, "rewards/rollout_reward_func/mean": 1.0651934146881104, "rewards/rollout_reward_func/std": 1.3481839895248413, "sampling/importance_sampling_ratio/max": 1.044358491897583, "sampling/importance_sampling_ratio/mean": 0.723608136177063, "sampling/importance_sampling_ratio/min": 4.536591222858988e-05, "sampling/sampling_logp_difference/max": 2.244877815246582, "sampling/sampling_logp_difference/mean": 0.29564353823661804, "step": 1071, "step_time": 12.898482103002607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5861462727189064, "epoch": 0.00536, "grad_norm": 0.12330631166696548, "kl": 0.6027267202734947, "learning_rate": 7.999900673726741e-06, "loss": -0.0811, "step": 1072, "step_time": 5.517399110001861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1998007893562317, "epoch": 0.005365, "frac_reward_zero_std": 0.0, "grad_norm": 0.47370874881744385, "kl": 2.776223052293062, "learning_rate": 7.999900481885698e-06, "loss": -0.0215, "num_tokens": 14126494.0, "reward": 0.7169806957244873, "reward_std": 1.4357390403747559, "rewards/rollout_reward_func/mean": 0.7169806957244873, "rewards/rollout_reward_func/std": 1.4357390403747559, "sampling/importance_sampling_ratio/max": 1.1989961862564087, "sampling/importance_sampling_ratio/mean": 0.5432488918304443, "sampling/importance_sampling_ratio/min": 4.269355979857892e-09, "sampling/sampling_logp_difference/max": 1.9883108139038086, "sampling/sampling_logp_difference/mean": 0.4400995075702667, "step": 1073, "step_time": 13.866737418997218 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 2.208117425441742, "epoch": 0.00537, "grad_norm": 0.28194984793663025, "kl": 2.007158286869526, "learning_rate": 7.999900289859572e-06, "loss": -0.0248, "step": 1074, "step_time": 5.882598180003697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0412218794226646, "epoch": 0.005375, "frac_reward_zero_std": 0.0, "grad_norm": 0.06037834659218788, "kl": 0.6726051792502403, "learning_rate": 7.999900097648363e-06, "loss": -0.0691, "num_tokens": 14145330.0, "reward": 0.2667516767978668, "reward_std": 0.8657797574996948, "rewards/rollout_reward_func/mean": 0.2667516767978668, "rewards/rollout_reward_func/std": 0.8657797574996948, "sampling/importance_sampling_ratio/max": 1.0187875032424927, "sampling/importance_sampling_ratio/mean": 0.8234670758247375, "sampling/importance_sampling_ratio/min": 5.2234172471798956e-05, "sampling/sampling_logp_difference/max": 1.805712342262268, "sampling/sampling_logp_difference/mean": 0.20120331645011902, "step": 1075, "step_time": 10.499830525004654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0508058108389378, "epoch": 0.00538, "grad_norm": 0.05252915993332863, "kl": 0.5135648101568222, "learning_rate": 7.999899905252075e-06, "loss": -0.0691, "step": 1076, "step_time": 4.765202654016321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7851767241954803, "epoch": 0.005385, "frac_reward_zero_std": 0.0, "grad_norm": 0.11022898554801941, "kl": 0.27573978528380394, "learning_rate": 7.999899712670703e-06, "loss": -0.0793, "num_tokens": 14171278.0, "reward": 0.6777665615081787, "reward_std": 1.3918818235397339, "rewards/rollout_reward_func/mean": 0.6777665615081787, "rewards/rollout_reward_func/std": 1.3918818235397339, "sampling/importance_sampling_ratio/max": 1.0308513641357422, "sampling/importance_sampling_ratio/mean": 0.6428725123405457, "sampling/importance_sampling_ratio/min": 2.7002966529465766e-09, "sampling/sampling_logp_difference/max": 2.4498772621154785, "sampling/sampling_logp_difference/mean": 0.3553067147731781, "step": 1077, "step_time": 11.752841712979716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.807657077908516, "epoch": 0.00539, "grad_norm": 0.11741513758897781, "kl": 0.26146509498357773, "learning_rate": 7.999899519904254e-06, "loss": -0.0795, "step": 1078, "step_time": 5.2808403210074175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7419652342796326, "epoch": 0.005395, "frac_reward_zero_std": 0.0, "grad_norm": 0.09509345889091492, "kl": 0.18363283015787601, "learning_rate": 7.99989932695272e-06, "loss": -0.0678, "num_tokens": 14200257.0, "reward": 0.2530611753463745, "reward_std": 1.2976582050323486, "rewards/rollout_reward_func/mean": 0.2530611753463745, "rewards/rollout_reward_func/std": 1.2976582050323486, "sampling/importance_sampling_ratio/max": 1.1104425191879272, "sampling/importance_sampling_ratio/mean": 0.39976316690444946, "sampling/importance_sampling_ratio/min": 1.837667156223688e-07, "sampling/sampling_logp_difference/max": 1.7635254859924316, "sampling/sampling_logp_difference/mean": 0.43567463755607605, "step": 1079, "step_time": 14.098486104994663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.7556357383728027, "epoch": 0.0054, "grad_norm": 0.10422186553478241, "kl": 0.1877069491893053, "learning_rate": 7.999899133816106e-06, "loss": -0.0675, "step": 1080, "step_time": 6.181748286006041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 5.3125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3933980911970139, "epoch": 0.005405, "frac_reward_zero_std": 0.0, "grad_norm": 0.05616847798228264, "kl": 0.44911423325538635, "learning_rate": 7.999898940494413e-06, "loss": -0.0312, "num_tokens": 14224818.0, "reward": 0.5185094475746155, "reward_std": 1.4173744916915894, "rewards/rollout_reward_func/mean": 0.5185094475746155, "rewards/rollout_reward_func/std": 1.4173744916915894, "sampling/importance_sampling_ratio/max": 1.0407516956329346, "sampling/importance_sampling_ratio/mean": 0.610292911529541, "sampling/importance_sampling_ratio/min": 0.004220037721097469, "sampling/sampling_logp_difference/max": 1.4808108806610107, "sampling/sampling_logp_difference/mean": 0.2453659623861313, "step": 1081, "step_time": 11.40494475098967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3990204483270645, "epoch": 0.00541, "grad_norm": 0.0625869557261467, "kl": 0.4306650906801224, "learning_rate": 7.999898746987633e-06, "loss": -0.031, "step": 1082, "step_time": 6.072802253009286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6439718455076218, "epoch": 0.005415, "frac_reward_zero_std": 0.0, "grad_norm": 0.18393854796886444, "kl": 0.19126569479703903, "learning_rate": 7.999898553295778e-06, "loss": -0.0761, "num_tokens": 14252447.0, "reward": 0.8907895088195801, "reward_std": 1.206287145614624, "rewards/rollout_reward_func/mean": 0.8907895088195801, "rewards/rollout_reward_func/std": 1.206287145614624, "sampling/importance_sampling_ratio/max": 1.0387111902236938, "sampling/importance_sampling_ratio/mean": 0.5561679005622864, "sampling/importance_sampling_ratio/min": 0.00019569769210647792, "sampling/sampling_logp_difference/max": 1.7400929927825928, "sampling/sampling_logp_difference/mean": 0.24576103687286377, "step": 1083, "step_time": 12.838980900982278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6281075179576874, "epoch": 0.00542, "grad_norm": 0.18206563591957092, "kl": 0.1953494045883417, "learning_rate": 7.99989835941884e-06, "loss": -0.0767, "step": 1084, "step_time": 6.2462563529843464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 5.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8247132003307343, "epoch": 0.005425, "frac_reward_zero_std": 0.0, "grad_norm": 0.15470784902572632, "kl": 0.31107062101364136, "learning_rate": 7.999898165356819e-06, "loss": -0.0948, "num_tokens": 14284426.0, "reward": 0.555252730846405, "reward_std": 1.2746176719665527, "rewards/rollout_reward_func/mean": 0.555252730846405, "rewards/rollout_reward_func/std": 1.2746176719665527, "sampling/importance_sampling_ratio/max": 1.1559029817581177, "sampling/importance_sampling_ratio/mean": 0.48351168632507324, "sampling/importance_sampling_ratio/min": 7.501436272150386e-08, "sampling/sampling_logp_difference/max": 1.8450130224227905, "sampling/sampling_logp_difference/mean": 0.47048550844192505, "step": 1085, "step_time": 16.23351870899205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.801780790090561, "epoch": 0.00543, "grad_norm": 0.15155170857906342, "kl": 0.32360077649354935, "learning_rate": 7.999897971109718e-06, "loss": -0.0956, "step": 1086, "step_time": 7.815393193013733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2733184322714806, "epoch": 0.005435, "frac_reward_zero_std": 0.0, "grad_norm": 0.05005829036235809, "kl": 0.2417919784784317, "learning_rate": 7.999897776677535e-06, "loss": -0.0549, "num_tokens": 14315831.0, "reward": 0.9488006234169006, "reward_std": 1.0446525812149048, "rewards/rollout_reward_func/mean": 0.9488006234169006, "rewards/rollout_reward_func/std": 1.0446527004241943, "sampling/importance_sampling_ratio/max": 1.2544878721237183, "sampling/importance_sampling_ratio/mean": 0.8707274198532104, "sampling/importance_sampling_ratio/min": 7.805367658875184e-07, "sampling/sampling_logp_difference/max": 1.5464262962341309, "sampling/sampling_logp_difference/mean": 0.20246659219264984, "step": 1087, "step_time": 16.42797699700168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2491934709250927, "epoch": 0.00544, "grad_norm": 0.044984184205532074, "kl": 0.24634401872754097, "learning_rate": 7.999897582060272e-06, "loss": -0.055, "step": 1088, "step_time": 7.963823582002078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 5.133333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1689170002937317, "epoch": 0.005445, "frac_reward_zero_std": 0.0, "grad_norm": 0.3951907157897949, "kl": 0.2850365899503231, "learning_rate": 7.999897387257927e-06, "loss": -0.0104, "num_tokens": 14346733.0, "reward": 0.5263902544975281, "reward_std": 1.2569454908370972, "rewards/rollout_reward_func/mean": 0.5263902544975281, "rewards/rollout_reward_func/std": 1.2569456100463867, "sampling/importance_sampling_ratio/max": 1.1897677183151245, "sampling/importance_sampling_ratio/mean": 0.6423738598823547, "sampling/importance_sampling_ratio/min": 0.003901220392435789, "sampling/sampling_logp_difference/max": 1.858567476272583, "sampling/sampling_logp_difference/mean": 0.1782824993133545, "step": 1089, "step_time": 14.88341762800701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.1615629643201828, "epoch": 0.00545, "grad_norm": 0.2819083333015442, "kl": 0.28388652577996254, "learning_rate": 7.9998971922705e-06, "loss": -0.0115, "step": 1090, "step_time": 7.33448314301495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5899077206850052, "epoch": 0.005455, "frac_reward_zero_std": 0.0, "grad_norm": 0.04056524857878685, "kl": 0.19209174625575542, "learning_rate": 7.999896997097994e-06, "loss": -0.1039, "num_tokens": 14378050.0, "reward": 1.0500411987304688, "reward_std": 1.1576803922653198, "rewards/rollout_reward_func/mean": 1.0500411987304688, "rewards/rollout_reward_func/std": 1.1576803922653198, "sampling/importance_sampling_ratio/max": 1.0335227251052856, "sampling/importance_sampling_ratio/mean": 0.7039973735809326, "sampling/importance_sampling_ratio/min": 0.0007979783695191145, "sampling/sampling_logp_difference/max": 1.5829282999038696, "sampling/sampling_logp_difference/mean": 0.2458077073097229, "step": 1091, "step_time": 15.859824945015134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.587959200143814, "epoch": 0.00546, "grad_norm": 0.039930086582899094, "kl": 0.1914457008242607, "learning_rate": 7.999896801740405e-06, "loss": -0.1039, "step": 1092, "step_time": 7.927192096001818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1037636660039425, "epoch": 0.005465, "frac_reward_zero_std": 0.5, "grad_norm": 0.04072267562150955, "kl": 0.19799118861556053, "learning_rate": 7.999896606197736e-06, "loss": -0.0243, "num_tokens": 14398899.0, "reward": 0.35855215787887573, "reward_std": 1.1280592679977417, "rewards/rollout_reward_func/mean": 0.35855215787887573, "rewards/rollout_reward_func/std": 1.1280592679977417, "sampling/importance_sampling_ratio/max": 1.0243216753005981, "sampling/importance_sampling_ratio/mean": 0.7462679147720337, "sampling/importance_sampling_ratio/min": 0.0009472270030528307, "sampling/sampling_logp_difference/max": 1.198777437210083, "sampling/sampling_logp_difference/mean": 0.1509062945842743, "step": 1093, "step_time": 10.600163989001885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1024423986673355, "epoch": 0.00547, "grad_norm": 0.03959483653306961, "kl": 0.19683364778757095, "learning_rate": 7.999896410469986e-06, "loss": -0.0244, "step": 1094, "step_time": 5.001544744998682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.600000381469727, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5420164167881012, "epoch": 0.005475, "frac_reward_zero_std": 0.0, "grad_norm": 0.14317046105861664, "kl": 0.37403709441423416, "learning_rate": 7.999896214557154e-06, "loss": -0.06, "num_tokens": 14424564.0, "reward": -0.24386776983737946, "reward_std": 0.8856898546218872, "rewards/rollout_reward_func/mean": -0.24386776983737946, "rewards/rollout_reward_func/std": 0.885689914226532, "sampling/importance_sampling_ratio/max": 1.2463628053665161, "sampling/importance_sampling_ratio/mean": 0.6709485054016113, "sampling/importance_sampling_ratio/min": 0.001520223100669682, "sampling/sampling_logp_difference/max": 1.7635383605957031, "sampling/sampling_logp_difference/mean": 0.2589717209339142, "step": 1095, "step_time": 12.430006633003359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5744996070861816, "epoch": 0.00548, "grad_norm": 0.15869668126106262, "kl": 0.36927054077386856, "learning_rate": 7.999896018459242e-06, "loss": -0.0605, "step": 1096, "step_time": 5.998510674995487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 7.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.2017447352409363, "epoch": 0.005485, "frac_reward_zero_std": 0.0, "grad_norm": 0.10519043356180191, "kl": 0.30792453046888113, "learning_rate": 7.99989582217625e-06, "loss": -0.0501, "num_tokens": 14456772.0, "reward": 0.030896030366420746, "reward_std": 1.073675513267517, "rewards/rollout_reward_func/mean": 0.030896030366420746, "rewards/rollout_reward_func/std": 1.073675513267517, "sampling/importance_sampling_ratio/max": 1.3999989032745361, "sampling/importance_sampling_ratio/mean": 0.4074765741825104, "sampling/importance_sampling_ratio/min": 5.386353336689353e-07, "sampling/sampling_logp_difference/max": 2.6907620429992676, "sampling/sampling_logp_difference/mean": 0.5040849447250366, "step": 1097, "step_time": 16.035597542009782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.201650023460388, "epoch": 0.00549, "grad_norm": 0.11298834532499313, "kl": 0.28700193390250206, "learning_rate": 7.999895625708174e-06, "loss": -0.05, "step": 1098, "step_time": 6.445528367999941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.189096175134182, "epoch": 0.005495, "frac_reward_zero_std": 0.5, "grad_norm": 0.020326141268014908, "kl": 0.19931058585643768, "learning_rate": 7.999895429055018e-06, "loss": -0.0592, "num_tokens": 14478366.0, "reward": 1.2592202425003052, "reward_std": 1.267746925354004, "rewards/rollout_reward_func/mean": 1.2592202425003052, "rewards/rollout_reward_func/std": 1.267746925354004, "sampling/importance_sampling_ratio/max": 1.0823312997817993, "sampling/importance_sampling_ratio/mean": 0.7628671526908875, "sampling/importance_sampling_ratio/min": 4.383424675324932e-05, "sampling/sampling_logp_difference/max": 1.925741195678711, "sampling/sampling_logp_difference/mean": 0.1802702397108078, "step": 1099, "step_time": 12.262707052010228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1896263360977173, "epoch": 0.0055, "grad_norm": 0.02014000341296196, "kl": 0.19935812801122665, "learning_rate": 7.999895232216782e-06, "loss": -0.0591, "step": 1100, "step_time": 4.999424794994411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.470675453543663, "epoch": 0.005505, "frac_reward_zero_std": 0.0, "grad_norm": 0.0610191710293293, "kl": 0.25177014246582985, "learning_rate": 7.999895035193464e-06, "loss": -0.1061, "num_tokens": 14510002.0, "reward": 0.6307829022407532, "reward_std": 1.2293940782546997, "rewards/rollout_reward_func/mean": 0.6307829022407532, "rewards/rollout_reward_func/std": 1.2293941974639893, "sampling/importance_sampling_ratio/max": 1.071282148361206, "sampling/importance_sampling_ratio/mean": 0.5062506794929504, "sampling/importance_sampling_ratio/min": 3.926051022062893e-07, "sampling/sampling_logp_difference/max": 1.8587369918823242, "sampling/sampling_logp_difference/mean": 0.43362903594970703, "step": 1101, "step_time": 15.988502716994844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4803671538829803, "epoch": 0.00551, "grad_norm": 0.06375749409198761, "kl": 0.25387173146009445, "learning_rate": 7.999894837985066e-06, "loss": -0.106, "step": 1102, "step_time": 7.2537164429959375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9829142689704895, "epoch": 0.005515, "frac_reward_zero_std": 0.0, "grad_norm": 0.10636647790670395, "kl": 0.2062000371515751, "learning_rate": 7.999894640591587e-06, "loss": -0.0886, "num_tokens": 14536649.0, "reward": 0.45913609862327576, "reward_std": 1.0059212446212769, "rewards/rollout_reward_func/mean": 0.45913609862327576, "rewards/rollout_reward_func/std": 1.0059212446212769, "sampling/importance_sampling_ratio/max": 1.2957735061645508, "sampling/importance_sampling_ratio/mean": 0.6306989192962646, "sampling/importance_sampling_ratio/min": 1.6990878748401883e-07, "sampling/sampling_logp_difference/max": 1.8834785223007202, "sampling/sampling_logp_difference/mean": 0.3470589518547058, "step": 1103, "step_time": 17.279175339004723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9773402363061905, "epoch": 0.00552, "grad_norm": 0.10138876736164093, "kl": 0.20664479956030846, "learning_rate": 7.999894443013025e-06, "loss": -0.0889, "step": 1104, "step_time": 7.4495200230012415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7514522671699524, "epoch": 0.005525, "frac_reward_zero_std": 0.0, "grad_norm": 0.13821972906589508, "kl": 0.28842444717884064, "learning_rate": 7.999894245249384e-06, "loss": -0.0887, "num_tokens": 14560452.0, "reward": 1.121436357498169, "reward_std": 1.24054753780365, "rewards/rollout_reward_func/mean": 1.121436357498169, "rewards/rollout_reward_func/std": 1.24054753780365, "sampling/importance_sampling_ratio/max": 1.0740242004394531, "sampling/importance_sampling_ratio/mean": 0.650863766670227, "sampling/importance_sampling_ratio/min": 0.00029592617647722363, "sampling/sampling_logp_difference/max": 1.6595683097839355, "sampling/sampling_logp_difference/mean": 0.24779927730560303, "step": 1105, "step_time": 15.148427343010553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "entropy": 1.747389793395996, "epoch": 0.00553, "grad_norm": 0.09534398466348648, "kl": 0.3213112950325012, "learning_rate": 7.999894047300662e-06, "loss": -0.0892, "step": 1106, "step_time": 6.3114517819776665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.1875, "completions/mean_terminated_length": 4.1875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.2500401418656111, "epoch": 0.005535, "frac_reward_zero_std": 0.5, "grad_norm": 0.022157466039061546, "kl": 0.4037199951708317, "learning_rate": 7.999893849166859e-06, "loss": -0.0327, "num_tokens": 14576577.0, "reward": 1.9827601909637451, "reward_std": 0.05148163437843323, "rewards/rollout_reward_func/mean": 1.9827601909637451, "rewards/rollout_reward_func/std": 0.051481615751981735, "sampling/importance_sampling_ratio/max": 1.0251145362854004, "sampling/importance_sampling_ratio/mean": 0.9502969980239868, "sampling/importance_sampling_ratio/min": 0.07392191886901855, "sampling/sampling_logp_difference/max": 1.4931244850158691, "sampling/sampling_logp_difference/mean": 0.04117811471223831, "step": 1107, "step_time": 5.791071253988775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24423145316541195, "epoch": 0.00554, "grad_norm": 0.022532496601343155, "kl": 0.40382765233516693, "learning_rate": 7.999893650847974e-06, "loss": -0.0327, "step": 1108, "step_time": 3.832892485996126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.0625, "completions/mean_terminated_length": 5.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1565544605255127, "epoch": 0.005545, "frac_reward_zero_std": 0.0, "grad_norm": 0.061493050307035446, "kl": 0.11314027104526758, "learning_rate": 7.999893452344009e-06, "loss": -0.0695, "num_tokens": 14603798.0, "reward": -0.42148321866989136, "reward_std": 1.0728060007095337, "rewards/rollout_reward_func/mean": -0.42148321866989136, "rewards/rollout_reward_func/std": 1.0728060007095337, "sampling/importance_sampling_ratio/max": 1.0975905656814575, "sampling/importance_sampling_ratio/mean": 0.33707892894744873, "sampling/importance_sampling_ratio/min": 4.08922915084986e-06, "sampling/sampling_logp_difference/max": 2.1570491790771484, "sampling/sampling_logp_difference/mean": 0.44262975454330444, "step": 1109, "step_time": 14.156263573997421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1515568494796753, "epoch": 0.00555, "grad_norm": 0.06612460315227509, "kl": 0.11852910555899143, "learning_rate": 7.999893253654963e-06, "loss": -0.0694, "step": 1110, "step_time": 5.36068807799893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 5.909090995788574, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.675063133239746, "epoch": 0.005555, "frac_reward_zero_std": 0.0, "grad_norm": 0.08172250539064407, "kl": 0.2895595282316208, "learning_rate": 7.999893054780836e-06, "loss": -0.0345, "num_tokens": 14636097.0, "reward": 0.06753227114677429, "reward_std": 1.1556798219680786, "rewards/rollout_reward_func/mean": 0.06753227114677429, "rewards/rollout_reward_func/std": 1.1556798219680786, "sampling/importance_sampling_ratio/max": 1.1004877090454102, "sampling/importance_sampling_ratio/mean": 0.49553182721138, "sampling/importance_sampling_ratio/min": 0.00031784779275767505, "sampling/sampling_logp_difference/max": 1.432070255279541, "sampling/sampling_logp_difference/mean": 0.3656165599822998, "step": 1111, "step_time": 15.401684851996833 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.6702858209609985, "epoch": 0.00556, "grad_norm": 0.08336877077817917, "kl": 0.2855151779949665, "learning_rate": 7.999892855721627e-06, "loss": -0.0346, "step": 1112, "step_time": 6.274249430003692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.375, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.698637545108795, "epoch": 0.005565, "frac_reward_zero_std": 0.0, "grad_norm": 0.0816042423248291, "kl": 0.1597256325185299, "learning_rate": 7.999892656477339e-06, "loss": -0.0842, "num_tokens": 14665277.0, "reward": 0.1973561942577362, "reward_std": 1.2558640241622925, "rewards/rollout_reward_func/mean": 0.1973561942577362, "rewards/rollout_reward_func/std": 1.2558640241622925, "sampling/importance_sampling_ratio/max": 1.1508070230484009, "sampling/importance_sampling_ratio/mean": 0.5020132660865784, "sampling/importance_sampling_ratio/min": 2.2983090275374707e-07, "sampling/sampling_logp_difference/max": 2.171938419342041, "sampling/sampling_logp_difference/mean": 0.4670013189315796, "step": 1113, "step_time": 15.201598469007877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.69187194108963, "epoch": 0.00557, "grad_norm": 0.07502976804971695, "kl": 0.16211171820759773, "learning_rate": 7.99989245704797e-06, "loss": -0.0845, "step": 1114, "step_time": 6.196551865985384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.3125, "completions/mean_terminated_length": 6.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0103898644447327, "epoch": 0.005575, "frac_reward_zero_std": 0.0, "grad_norm": 0.06619849801063538, "kl": 0.13483176240697503, "learning_rate": 7.999892257433519e-06, "loss": -0.0981, "num_tokens": 14699801.0, "reward": -0.22663751244544983, "reward_std": 1.100429892539978, "rewards/rollout_reward_func/mean": -0.22663751244544983, "rewards/rollout_reward_func/std": 1.100429892539978, "sampling/importance_sampling_ratio/max": 1.1801891326904297, "sampling/importance_sampling_ratio/mean": 0.398269385099411, "sampling/importance_sampling_ratio/min": 1.590880401636241e-06, "sampling/sampling_logp_difference/max": 2.5507142543792725, "sampling/sampling_logp_difference/mean": 0.4833105206489563, "step": 1115, "step_time": 16.911537922002026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0030606985092163, "epoch": 0.00558, "grad_norm": 0.06314979493618011, "kl": 0.1344982571899891, "learning_rate": 7.999892057633988e-06, "loss": -0.0982, "step": 1116, "step_time": 7.382078085982357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4857729375362396, "epoch": 0.005585, "frac_reward_zero_std": 0.5, "grad_norm": 0.06944622099399567, "kl": 0.1713782250881195, "learning_rate": 7.999891857649375e-06, "loss": -0.0452, "num_tokens": 14720095.0, "reward": -0.2946946918964386, "reward_std": 0.3274137079715729, "rewards/rollout_reward_func/mean": -0.2946946918964386, "rewards/rollout_reward_func/std": 0.3274137079715729, "sampling/importance_sampling_ratio/max": 1.0311801433563232, "sampling/importance_sampling_ratio/mean": 0.7205515503883362, "sampling/importance_sampling_ratio/min": 8.225743658840656e-05, "sampling/sampling_logp_difference/max": 1.631927490234375, "sampling/sampling_logp_difference/mean": 0.19187669456005096, "step": 1117, "step_time": 11.94086845999118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4778988659381866, "epoch": 0.00559, "grad_norm": 0.06979450583457947, "kl": 0.17282376065850258, "learning_rate": 7.999891657479682e-06, "loss": -0.0451, "step": 1118, "step_time": 5.6636455270054284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5769533812999725, "epoch": 0.005595, "frac_reward_zero_std": 0.5, "grad_norm": 0.03088773973286152, "kl": 0.22301309183239937, "learning_rate": 7.99989145712491e-06, "loss": -0.0032, "num_tokens": 14741701.0, "reward": 0.45539698004722595, "reward_std": 1.502863883972168, "rewards/rollout_reward_func/mean": 0.45539698004722595, "rewards/rollout_reward_func/std": 1.502863883972168, "sampling/importance_sampling_ratio/max": 1.044696569442749, "sampling/importance_sampling_ratio/mean": 0.6074078679084778, "sampling/importance_sampling_ratio/min": 8.913407509680837e-05, "sampling/sampling_logp_difference/max": 1.4217480421066284, "sampling/sampling_logp_difference/mean": 0.24395954608917236, "step": 1119, "step_time": 12.22864376398502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5587618947029114, "epoch": 0.0056, "grad_norm": 0.03337191417813301, "kl": 0.23091566190123558, "learning_rate": 7.999891256585055e-06, "loss": -0.0033, "step": 1120, "step_time": 5.196405485010473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.347252458333969, "epoch": 0.005605, "frac_reward_zero_std": 0.0, "grad_norm": 0.058026786893606186, "kl": 0.17665855586528778, "learning_rate": 7.99989105586012e-06, "loss": -0.0907, "num_tokens": 14769125.0, "reward": 0.1007763147354126, "reward_std": 1.2908339500427246, "rewards/rollout_reward_func/mean": 0.1007763147354126, "rewards/rollout_reward_func/std": 1.2908339500427246, "sampling/importance_sampling_ratio/max": 1.085035800933838, "sampling/importance_sampling_ratio/mean": 0.4775379002094269, "sampling/importance_sampling_ratio/min": 0.0008558075060136616, "sampling/sampling_logp_difference/max": 1.5817577838897705, "sampling/sampling_logp_difference/mean": 0.33313053846359253, "step": 1121, "step_time": 12.939275427997927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.348419636487961, "epoch": 0.00561, "grad_norm": 0.05809192731976509, "kl": 0.17705165594816208, "learning_rate": 7.999890854950104e-06, "loss": -0.0909, "step": 1122, "step_time": 5.315454734998639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.48468607012182474, "epoch": 0.005615, "frac_reward_zero_std": 0.5, "grad_norm": 0.12135706841945648, "kl": 0.286104254424572, "learning_rate": 7.999890653855005e-06, "loss": -0.0427, "num_tokens": 14793963.0, "reward": 1.5894235372543335, "reward_std": 0.9140615463256836, "rewards/rollout_reward_func/mean": 1.5894235372543335, "rewards/rollout_reward_func/std": 0.9140615463256836, "sampling/importance_sampling_ratio/max": 1.189306616783142, "sampling/importance_sampling_ratio/mean": 0.9173704385757446, "sampling/importance_sampling_ratio/min": 0.0032829190604388714, "sampling/sampling_logp_difference/max": 0.9617239832878113, "sampling/sampling_logp_difference/mean": 0.08370031416416168, "step": 1123, "step_time": 13.291374324995559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4911760035902262, "epoch": 0.00562, "grad_norm": 0.11952734738588333, "kl": 0.288377583026886, "learning_rate": 7.999890452574829e-06, "loss": -0.0431, "step": 1124, "step_time": 6.522605065998505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.650475412607193, "epoch": 0.005625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19726136326789856, "kl": 0.6710335090756416, "learning_rate": 7.999890251109572e-06, "loss": -0.0908, "num_tokens": 14822450.0, "reward": 0.533220648765564, "reward_std": 1.3603863716125488, "rewards/rollout_reward_func/mean": 0.533220648765564, "rewards/rollout_reward_func/std": 1.3603863716125488, "sampling/importance_sampling_ratio/max": 1.1189446449279785, "sampling/importance_sampling_ratio/mean": 0.5699774026870728, "sampling/importance_sampling_ratio/min": 0.0009567803354002535, "sampling/sampling_logp_difference/max": 1.5431404113769531, "sampling/sampling_logp_difference/mean": 0.28906261920928955, "step": 1125, "step_time": 13.08599670001422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6606858968734741, "epoch": 0.00563, "grad_norm": 0.196797177195549, "kl": 0.6734522171318531, "learning_rate": 7.999890049459232e-06, "loss": -0.091, "step": 1126, "step_time": 6.144699406984728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9567207507789135, "epoch": 0.005635, "frac_reward_zero_std": 0.0, "grad_norm": 0.07466457039117813, "kl": 0.9095215648412704, "learning_rate": 7.999889847623811e-06, "loss": -0.0733, "num_tokens": 14840643.0, "reward": 1.5454820394515991, "reward_std": 0.6976494789123535, "rewards/rollout_reward_func/mean": 1.5454820394515991, "rewards/rollout_reward_func/std": 0.6976495981216431, "sampling/importance_sampling_ratio/max": 1.0756903886795044, "sampling/importance_sampling_ratio/mean": 0.8506767749786377, "sampling/importance_sampling_ratio/min": 0.00010258329712087288, "sampling/sampling_logp_difference/max": 1.4724382162094116, "sampling/sampling_logp_difference/mean": 0.21390047669410706, "step": 1127, "step_time": 6.224670151015744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9697154313325882, "epoch": 0.00564, "grad_norm": 0.07340172678232193, "kl": 0.88389827683568, "learning_rate": 7.999889645603313e-06, "loss": -0.0732, "step": 1128, "step_time": 3.372095413011266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5128304436802864, "epoch": 0.005645, "frac_reward_zero_std": 0.5, "grad_norm": 0.05975494161248207, "kl": 0.9039943590760231, "learning_rate": 7.999889443397732e-06, "loss": -0.0459, "num_tokens": 14856523.0, "reward": 1.9702601432800293, "reward_std": 0.07030946761369705, "rewards/rollout_reward_func/mean": 1.9702601432800293, "rewards/rollout_reward_func/std": 0.07030944526195526, "sampling/importance_sampling_ratio/max": 1.0295844078063965, "sampling/importance_sampling_ratio/mean": 0.8931471705436707, "sampling/importance_sampling_ratio/min": 0.003763898042961955, "sampling/sampling_logp_difference/max": 1.6999626159667969, "sampling/sampling_logp_difference/mean": 0.10914354771375656, "step": 1129, "step_time": 6.5116410390037345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.515337847173214, "epoch": 0.00565, "grad_norm": 0.05409712344408035, "kl": 0.8466920480132103, "learning_rate": 7.99988924100707e-06, "loss": -0.0459, "step": 1130, "step_time": 3.1555038480146322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 5.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8290312811732292, "epoch": 0.005655, "frac_reward_zero_std": 0.0, "grad_norm": 0.11147715896368027, "kl": 0.26816555112600327, "learning_rate": 7.99988903843133e-06, "loss": -0.0534, "num_tokens": 14884308.0, "reward": -0.3622508645057678, "reward_std": 1.113349437713623, "rewards/rollout_reward_func/mean": -0.3622508645057678, "rewards/rollout_reward_func/std": 1.1133495569229126, "sampling/importance_sampling_ratio/max": 1.1745866537094116, "sampling/importance_sampling_ratio/mean": 0.6012738943099976, "sampling/importance_sampling_ratio/min": 4.3240730440707864e-10, "sampling/sampling_logp_difference/max": 2.6484928131103516, "sampling/sampling_logp_difference/mean": 0.3978349268436432, "step": 1131, "step_time": 14.667368180002086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8574825450778008, "epoch": 0.00566, "grad_norm": 0.1214471161365509, "kl": 0.2679867446422577, "learning_rate": 7.999888835670505e-06, "loss": -0.0538, "step": 1132, "step_time": 6.2943322859937325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 10.9375, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.807203233242035, "epoch": 0.005665, "frac_reward_zero_std": 0.0, "grad_norm": 0.024831218644976616, "kl": 0.09944559074938297, "learning_rate": 7.9998886327246e-06, "loss": -0.0637, "num_tokens": 14920918.0, "reward": -0.3270576596260071, "reward_std": 0.8009615540504456, "rewards/rollout_reward_func/mean": -0.3270576596260071, "rewards/rollout_reward_func/std": 0.8009615540504456, "sampling/importance_sampling_ratio/max": 1.057564616203308, "sampling/importance_sampling_ratio/mean": 0.41138672828674316, "sampling/importance_sampling_ratio/min": 2.114244671247434e-05, "sampling/sampling_logp_difference/max": 1.8852081298828125, "sampling/sampling_logp_difference/mean": 0.362308531999588, "step": 1133, "step_time": 20.64539008999418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8093218207359314, "epoch": 0.00567, "grad_norm": 0.027140477672219276, "kl": 0.09861081279814243, "learning_rate": 7.999888429593618e-06, "loss": -0.0636, "step": 1134, "step_time": 9.357453059012187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 5.615385055541992, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.620162695646286, "epoch": 0.005675, "frac_reward_zero_std": 0.0, "grad_norm": 0.07262113690376282, "kl": 0.557612357661128, "learning_rate": 7.999888226277554e-06, "loss": -0.081, "num_tokens": 14946197.0, "reward": -0.12234395742416382, "reward_std": 0.8669506907463074, "rewards/rollout_reward_func/mean": -0.12234395742416382, "rewards/rollout_reward_func/std": 0.8669507503509521, "sampling/importance_sampling_ratio/max": 1.0577882528305054, "sampling/importance_sampling_ratio/mean": 0.6452212333679199, "sampling/importance_sampling_ratio/min": 9.557911835145205e-05, "sampling/sampling_logp_difference/max": 1.8321926593780518, "sampling/sampling_logp_difference/mean": 0.2939029633998871, "step": 1135, "step_time": 15.93306358801783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6289257109165192, "epoch": 0.00568, "grad_norm": 0.06741075217723846, "kl": 0.5057058893144131, "learning_rate": 7.999888022776408e-06, "loss": -0.0811, "step": 1136, "step_time": 6.8835049720073584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8654769696295261, "epoch": 0.005685, "frac_reward_zero_std": 0.0, "grad_norm": 0.14765934646129608, "kl": 0.19410809874534607, "learning_rate": 7.999887819090183e-06, "loss": -0.098, "num_tokens": 14970017.0, "reward": 0.8765120506286621, "reward_std": 1.2882939577102661, "rewards/rollout_reward_func/mean": 0.8765120506286621, "rewards/rollout_reward_func/std": 1.2882939577102661, "sampling/importance_sampling_ratio/max": 1.0784531831741333, "sampling/importance_sampling_ratio/mean": 0.7003092765808105, "sampling/importance_sampling_ratio/min": 6.058137660147622e-05, "sampling/sampling_logp_difference/max": 1.7762154340744019, "sampling/sampling_logp_difference/mean": 0.3280060589313507, "step": 1137, "step_time": 13.76266080600908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8730139657855034, "epoch": 0.00569, "grad_norm": 0.1472216546535492, "kl": 0.19450493529438972, "learning_rate": 7.999887615218877e-06, "loss": -0.0979, "step": 1138, "step_time": 5.919804269986344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 13.375, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.431069076061249, "epoch": 0.005695, "frac_reward_zero_std": 0.0, "grad_norm": 0.020280903205275536, "kl": 0.06396426539868116, "learning_rate": 7.99988741116249e-06, "loss": -0.0887, "num_tokens": 14998322.0, "reward": -0.3860618472099304, "reward_std": 1.165907859802246, "rewards/rollout_reward_func/mean": -0.3860618472099304, "rewards/rollout_reward_func/std": 1.165907859802246, "sampling/importance_sampling_ratio/max": 1.0511139631271362, "sampling/importance_sampling_ratio/mean": 0.19570764899253845, "sampling/importance_sampling_ratio/min": 8.771560260356637e-07, "sampling/sampling_logp_difference/max": 2.3633365631103516, "sampling/sampling_logp_difference/mean": 0.4816768169403076, "step": 1139, "step_time": 16.157870309005375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.43277770280838, "epoch": 0.0057, "grad_norm": 0.01996874250471592, "kl": 0.0633537033572793, "learning_rate": 7.999887206921024e-06, "loss": -0.0887, "step": 1140, "step_time": 6.150281362977694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 10.1875, "completions/mean_terminated_length": 4.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.134758111089468, "epoch": 0.005705, "frac_reward_zero_std": 0.0, "grad_norm": 0.042830582708120346, "kl": 0.16554559115320444, "learning_rate": 7.999887002494475e-06, "loss": -0.0985, "num_tokens": 15031312.0, "reward": 0.3622424304485321, "reward_std": 1.3134398460388184, "rewards/rollout_reward_func/mean": 0.3622424304485321, "rewards/rollout_reward_func/std": 1.3134398460388184, "sampling/importance_sampling_ratio/max": 1.036428451538086, "sampling/importance_sampling_ratio/mean": 0.46687760949134827, "sampling/importance_sampling_ratio/min": 2.805509211611934e-05, "sampling/sampling_logp_difference/max": 1.8126956224441528, "sampling/sampling_logp_difference/mean": 0.3654381036758423, "step": 1141, "step_time": 17.849137443990912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1340216575190425, "epoch": 0.00571, "grad_norm": 0.04066653922200203, "kl": 0.1660990547388792, "learning_rate": 7.999886797882845e-06, "loss": -0.0985, "step": 1142, "step_time": 7.998622770988732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 11.0625, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9519931077957153, "epoch": 0.005715, "frac_reward_zero_std": 0.0, "grad_norm": 0.06004892662167549, "kl": 0.09362080320715904, "learning_rate": 7.999886593086137e-06, "loss": -0.0518, "num_tokens": 15059697.0, "reward": -0.41213613748550415, "reward_std": 0.912296712398529, "rewards/rollout_reward_func/mean": -0.41213613748550415, "rewards/rollout_reward_func/std": 0.912296712398529, "sampling/importance_sampling_ratio/max": 1.1653603315353394, "sampling/importance_sampling_ratio/mean": 0.34740835428237915, "sampling/importance_sampling_ratio/min": 3.1253509291673254e-07, "sampling/sampling_logp_difference/max": 1.654813289642334, "sampling/sampling_logp_difference/mean": 0.4022636413574219, "step": 1143, "step_time": 16.22430437100411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9518295526504517, "epoch": 0.00572, "grad_norm": 0.05665633827447891, "kl": 0.09401181247085333, "learning_rate": 7.999886388104348e-06, "loss": -0.0518, "step": 1144, "step_time": 6.069841186021222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 4.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0047371685504913, "epoch": 0.005725, "frac_reward_zero_std": 0.0, "grad_norm": 0.18040433526039124, "kl": 0.2870841510593891, "learning_rate": 7.999886182937478e-06, "loss": -0.0924, "num_tokens": 15083474.0, "reward": 0.909967303276062, "reward_std": 1.2567098140716553, "rewards/rollout_reward_func/mean": 0.909967303276062, "rewards/rollout_reward_func/std": 1.2567099332809448, "sampling/importance_sampling_ratio/max": 1.091254711151123, "sampling/importance_sampling_ratio/mean": 0.5415252447128296, "sampling/importance_sampling_ratio/min": 1.3078714573566685e-06, "sampling/sampling_logp_difference/max": 1.8020904064178467, "sampling/sampling_logp_difference/mean": 0.3100695013999939, "step": 1145, "step_time": 15.072521567999502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9944875240325928, "epoch": 0.00573, "grad_norm": 0.1503063142299652, "kl": 0.29652615636587143, "learning_rate": 7.999885977585527e-06, "loss": -0.0931, "step": 1146, "step_time": 6.25889949798875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8179151713848114, "epoch": 0.005735, "frac_reward_zero_std": 0.5, "grad_norm": 0.0456530898809433, "kl": 0.19294457510113716, "learning_rate": 7.999885772048498e-06, "loss": -0.0462, "num_tokens": 15106320.0, "reward": 0.9356595873832703, "reward_std": 1.4139606952667236, "rewards/rollout_reward_func/mean": 0.9356595873832703, "rewards/rollout_reward_func/std": 1.4139606952667236, "sampling/importance_sampling_ratio/max": 1.0449470281600952, "sampling/importance_sampling_ratio/mean": 0.6711257100105286, "sampling/importance_sampling_ratio/min": 5.34575738129206e-05, "sampling/sampling_logp_difference/max": 1.8672473430633545, "sampling/sampling_logp_difference/mean": 0.25512853264808655, "step": 1147, "step_time": 12.788565415030462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8146189153194427, "epoch": 0.00574, "grad_norm": 0.05609589442610741, "kl": 0.19337118417024612, "learning_rate": 7.999885566326388e-06, "loss": -0.0461, "step": 1148, "step_time": 5.671823691009195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5304837822914124, "epoch": 0.005745, "frac_reward_zero_std": 0.0, "grad_norm": 0.34532323479652405, "kl": 1.228312469087541, "learning_rate": 7.999885360419195e-06, "loss": -0.0753, "num_tokens": 15135054.0, "reward": 0.19182102382183075, "reward_std": 1.1407296657562256, "rewards/rollout_reward_func/mean": 0.19182102382183075, "rewards/rollout_reward_func/std": 1.1407296657562256, "sampling/importance_sampling_ratio/max": 1.2259185314178467, "sampling/importance_sampling_ratio/mean": 0.3721786141395569, "sampling/importance_sampling_ratio/min": 0.0001004342848318629, "sampling/sampling_logp_difference/max": 1.6014513969421387, "sampling/sampling_logp_difference/mean": 0.3696865439414978, "step": 1149, "step_time": 14.68004906999704 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 2.5317171812057495, "epoch": 0.00575, "grad_norm": 0.28387176990509033, "kl": 1.1353892982006073, "learning_rate": 7.999885154326923e-06, "loss": -0.0772, "step": 1150, "step_time": 5.955282064969651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9390237517654896, "epoch": 0.005755, "frac_reward_zero_std": 0.0, "grad_norm": 0.023887978866696358, "kl": 0.30755824595689774, "learning_rate": 7.99988494804957e-06, "loss": -0.0656, "num_tokens": 15156553.0, "reward": 1.6054952144622803, "reward_std": 0.8835780620574951, "rewards/rollout_reward_func/mean": 1.6054952144622803, "rewards/rollout_reward_func/std": 0.8835781216621399, "sampling/importance_sampling_ratio/max": 1.0312137603759766, "sampling/importance_sampling_ratio/mean": 0.891045331954956, "sampling/importance_sampling_ratio/min": 3.1653416954213753e-07, "sampling/sampling_logp_difference/max": 2.50571608543396, "sampling/sampling_logp_difference/mean": 0.25457483530044556, "step": 1151, "step_time": 10.726939241023501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9422361310571432, "epoch": 0.00576, "grad_norm": 0.018075792118906975, "kl": 0.2900504730641842, "learning_rate": 7.99988474158714e-06, "loss": -0.0657, "step": 1152, "step_time": 5.27838452301512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.454545497894287, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.442111074924469, "epoch": 0.005765, "frac_reward_zero_std": 0.0, "grad_norm": 0.15604273974895477, "kl": 0.29551613703370094, "learning_rate": 7.999884534939625e-06, "loss": -0.0559, "num_tokens": 15187573.0, "reward": 0.6727524995803833, "reward_std": 1.2114152908325195, "rewards/rollout_reward_func/mean": 0.6727524995803833, "rewards/rollout_reward_func/std": 1.2114152908325195, "sampling/importance_sampling_ratio/max": 1.1108542680740356, "sampling/importance_sampling_ratio/mean": 0.532243549823761, "sampling/importance_sampling_ratio/min": 6.789688455910436e-08, "sampling/sampling_logp_difference/max": 1.9158082008361816, "sampling/sampling_logp_difference/mean": 0.4463048577308655, "step": 1153, "step_time": 14.16546480199031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012013730127364397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012013730127364397, "entropy": 2.438106596469879, "epoch": 0.00577, "grad_norm": 0.15860041975975037, "kl": 0.28486254066228867, "learning_rate": 7.999884328107032e-06, "loss": -0.0561, "step": 1154, "step_time": 6.4145124860078795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 5.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.762418210506439, "epoch": 0.005775, "frac_reward_zero_std": 0.0, "grad_norm": 0.15023614466190338, "kl": 0.14250089414417744, "learning_rate": 7.99988412108936e-06, "loss": -0.033, "num_tokens": 15224424.0, "reward": 0.25390368700027466, "reward_std": 1.0863914489746094, "rewards/rollout_reward_func/mean": 0.25390368700027466, "rewards/rollout_reward_func/std": 1.0863914489746094, "sampling/importance_sampling_ratio/max": 1.06190025806427, "sampling/importance_sampling_ratio/mean": 0.4285663664340973, "sampling/importance_sampling_ratio/min": 1.722583817809209e-07, "sampling/sampling_logp_difference/max": 2.1352930068969727, "sampling/sampling_logp_difference/mean": 0.4197536110877991, "step": 1155, "step_time": 18.82982986001298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7708321809768677, "epoch": 0.00578, "grad_norm": 0.13774436712265015, "kl": 0.1442779954522848, "learning_rate": 7.999883913886604e-06, "loss": -0.0333, "step": 1156, "step_time": 7.736405756993918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0884964764118195, "epoch": 0.005785, "frac_reward_zero_std": 0.5, "grad_norm": 0.08432621508836746, "kl": 0.40915418416261673, "learning_rate": 7.99988370649877e-06, "loss": -0.0399, "num_tokens": 15245236.0, "reward": 0.9155237078666687, "reward_std": 1.4476981163024902, "rewards/rollout_reward_func/mean": 0.9155237078666687, "rewards/rollout_reward_func/std": 1.4476982355117798, "sampling/importance_sampling_ratio/max": 1.042749047279358, "sampling/importance_sampling_ratio/mean": 0.7844417095184326, "sampling/importance_sampling_ratio/min": 3.4293792850803584e-05, "sampling/sampling_logp_difference/max": 1.9250743389129639, "sampling/sampling_logp_difference/mean": 0.2250603884458542, "step": 1157, "step_time": 8.969977023007232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0829725824296474, "epoch": 0.00579, "grad_norm": 0.0844072476029396, "kl": 0.40971530973911285, "learning_rate": 7.999883498925855e-06, "loss": -0.0398, "step": 1158, "step_time": 4.400487076985883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5812196340411901, "epoch": 0.005795, "frac_reward_zero_std": 0.5, "grad_norm": 0.0120915612205863, "kl": 0.3000512346625328, "learning_rate": 7.99988329116786e-06, "loss": -0.0289, "num_tokens": 15272105.0, "reward": 0.3753841519355774, "reward_std": 1.1349899768829346, "rewards/rollout_reward_func/mean": 0.3753841519355774, "rewards/rollout_reward_func/std": 1.1349899768829346, "sampling/importance_sampling_ratio/max": 1.0831210613250732, "sampling/importance_sampling_ratio/mean": 0.9633480906486511, "sampling/importance_sampling_ratio/min": 1.236309003616043e-06, "sampling/sampling_logp_difference/max": 1.965059518814087, "sampling/sampling_logp_difference/mean": 0.1618696004152298, "step": 1159, "step_time": 15.441336536998278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5860415864735842, "epoch": 0.0058, "grad_norm": 0.01524018682539463, "kl": 0.3069949969649315, "learning_rate": 7.999883083224785e-06, "loss": -0.0288, "step": 1160, "step_time": 7.842525183004909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7501935213804245, "epoch": 0.005805, "frac_reward_zero_std": 0.0, "grad_norm": 0.08371996879577637, "kl": 0.4813900515437126, "learning_rate": 7.999882875096631e-06, "loss": -0.0505, "num_tokens": 15298497.0, "reward": 0.7412593364715576, "reward_std": 1.3341437578201294, "rewards/rollout_reward_func/mean": 0.7412593364715576, "rewards/rollout_reward_func/std": 1.3341437578201294, "sampling/importance_sampling_ratio/max": 1.0605666637420654, "sampling/importance_sampling_ratio/mean": 0.853486180305481, "sampling/importance_sampling_ratio/min": 0.00015019552665762603, "sampling/sampling_logp_difference/max": 1.5905227661132812, "sampling/sampling_logp_difference/mean": 0.15560314059257507, "step": 1161, "step_time": 13.82783350799582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7526023425161839, "epoch": 0.00581, "grad_norm": 0.08447059988975525, "kl": 0.43375036120414734, "learning_rate": 7.999882666783394e-06, "loss": -0.0505, "step": 1162, "step_time": 6.781609147990821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.388033926486969, "epoch": 0.005815, "frac_reward_zero_std": 0.0, "grad_norm": 0.20119807124137878, "kl": 0.1880788952112198, "learning_rate": 7.999882458285079e-06, "loss": -0.0679, "num_tokens": 15327379.0, "reward": 1.0576337575912476, "reward_std": 1.23209810256958, "rewards/rollout_reward_func/mean": 1.0576337575912476, "rewards/rollout_reward_func/std": 1.23209810256958, "sampling/importance_sampling_ratio/max": 1.0730814933776855, "sampling/importance_sampling_ratio/mean": 0.7573174834251404, "sampling/importance_sampling_ratio/min": 0.000772965548094362, "sampling/sampling_logp_difference/max": 1.2435582876205444, "sampling/sampling_logp_difference/mean": 0.16812527179718018, "step": 1163, "step_time": 15.304178232021513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3949908316135406, "epoch": 0.00582, "grad_norm": 0.18987926840782166, "kl": 0.18881283327937126, "learning_rate": 7.999882249601682e-06, "loss": -0.0683, "step": 1164, "step_time": 6.83866803800629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.076923370361328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8039341568946838, "epoch": 0.005825, "frac_reward_zero_std": 0.0, "grad_norm": 0.06290958821773529, "kl": 0.2409096360206604, "learning_rate": 7.999882040733206e-06, "loss": -0.0706, "num_tokens": 15352111.0, "reward": 0.680671215057373, "reward_std": 1.38336181640625, "rewards/rollout_reward_func/mean": 0.680671215057373, "rewards/rollout_reward_func/std": 1.3833619356155396, "sampling/importance_sampling_ratio/max": 1.0134012699127197, "sampling/importance_sampling_ratio/mean": 0.5854477882385254, "sampling/importance_sampling_ratio/min": 5.20719765972899e-07, "sampling/sampling_logp_difference/max": 1.9137667417526245, "sampling/sampling_logp_difference/mean": 0.3576021194458008, "step": 1165, "step_time": 13.541804733016761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8133583068847656, "epoch": 0.00583, "grad_norm": 0.059849221259355545, "kl": 0.24757135659456253, "learning_rate": 7.99988183167965e-06, "loss": -0.0708, "step": 1166, "step_time": 6.885432103983476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.359767384827137, "epoch": 0.005835, "frac_reward_zero_std": 0.0, "grad_norm": 0.12750132381916046, "kl": 0.3483779765665531, "learning_rate": 7.999881622441012e-06, "loss": -0.0553, "num_tokens": 15375389.0, "reward": 1.8175129890441895, "reward_std": 0.09718747437000275, "rewards/rollout_reward_func/mean": 1.8175129890441895, "rewards/rollout_reward_func/std": 0.09718744456768036, "sampling/importance_sampling_ratio/max": 1.050399899482727, "sampling/importance_sampling_ratio/mean": 0.8894895315170288, "sampling/importance_sampling_ratio/min": 0.1377599835395813, "sampling/sampling_logp_difference/max": 1.0477303266525269, "sampling/sampling_logp_difference/mean": 0.06509844213724136, "step": 1167, "step_time": 9.858943597995676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36758301965892315, "epoch": 0.00584, "grad_norm": 0.12628306448459625, "kl": 0.3605704568326473, "learning_rate": 7.999881413017296e-06, "loss": -0.0557, "step": 1168, "step_time": 4.909613438998349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.333947829902172, "epoch": 0.005845, "frac_reward_zero_std": 0.5, "grad_norm": 0.0336838960647583, "kl": 0.28144408389925957, "learning_rate": 7.999881203408499e-06, "loss": -0.0313, "num_tokens": 15404099.0, "reward": 1.0158038139343262, "reward_std": 1.214043140411377, "rewards/rollout_reward_func/mean": 1.0158038139343262, "rewards/rollout_reward_func/std": 1.214043140411377, "sampling/importance_sampling_ratio/max": 1.0457645654678345, "sampling/importance_sampling_ratio/mean": 0.8254663944244385, "sampling/importance_sampling_ratio/min": 1.8509298342905822e-06, "sampling/sampling_logp_difference/max": 2.7221145629882812, "sampling/sampling_logp_difference/mean": 0.24640874564647675, "step": 1169, "step_time": 13.646702495985664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3355325162410736, "epoch": 0.00585, "grad_norm": 0.034332770854234695, "kl": 0.2807301990687847, "learning_rate": 7.999880993614621e-06, "loss": -0.0314, "step": 1170, "step_time": 6.926537133011152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7055572792887688, "epoch": 0.005855, "frac_reward_zero_std": 0.0, "grad_norm": 0.25518280267715454, "kl": 0.1843880582600832, "learning_rate": 7.999880783635665e-06, "loss": -0.0891, "num_tokens": 15430540.0, "reward": 0.7603164911270142, "reward_std": 1.3642590045928955, "rewards/rollout_reward_func/mean": 0.7603164911270142, "rewards/rollout_reward_func/std": 1.364259123802185, "sampling/importance_sampling_ratio/max": 1.0515769720077515, "sampling/importance_sampling_ratio/mean": 0.5640915036201477, "sampling/importance_sampling_ratio/min": 5.732836507377215e-05, "sampling/sampling_logp_difference/max": 1.9160411357879639, "sampling/sampling_logp_difference/mean": 0.2984291613101959, "step": 1171, "step_time": 12.881966341010411 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.6753151938319206, "epoch": 0.00586, "grad_norm": 0.16808897256851196, "kl": 0.18054455425590277, "learning_rate": 7.999880573471627e-06, "loss": -0.0902, "step": 1172, "step_time": 5.686616115010111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 10.4375, "completions/mean_terminated_length": 4.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.085802733898163, "epoch": 0.005865, "frac_reward_zero_std": 0.0, "grad_norm": 0.17557206749916077, "kl": 0.2685324130579829, "learning_rate": 7.99988036312251e-06, "loss": -0.054, "num_tokens": 15465419.0, "reward": -0.3368602693080902, "reward_std": 0.8695600628852844, "rewards/rollout_reward_func/mean": -0.3368602693080902, "rewards/rollout_reward_func/std": 0.8695600032806396, "sampling/importance_sampling_ratio/max": 1.0133363008499146, "sampling/importance_sampling_ratio/mean": 0.3215656876564026, "sampling/importance_sampling_ratio/min": 7.647927122889087e-05, "sampling/sampling_logp_difference/max": 1.8238708972930908, "sampling/sampling_logp_difference/mean": 0.40701836347579956, "step": 1173, "step_time": 18.23093627601338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0800986289978027, "epoch": 0.00587, "grad_norm": 0.1595555543899536, "kl": 0.2518467381596565, "learning_rate": 7.999880152588313e-06, "loss": -0.0537, "step": 1174, "step_time": 8.05624953500228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9304061606526375, "epoch": 0.005875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19420181214809418, "kl": 0.21552752144634724, "learning_rate": 7.999879941869035e-06, "loss": -0.0755, "num_tokens": 15495014.0, "reward": 0.1316555142402649, "reward_std": 1.2593352794647217, "rewards/rollout_reward_func/mean": 0.1316555142402649, "rewards/rollout_reward_func/std": 1.2593352794647217, "sampling/importance_sampling_ratio/max": 1.0896629095077515, "sampling/importance_sampling_ratio/mean": 0.6374326944351196, "sampling/importance_sampling_ratio/min": 0.0002328935224795714, "sampling/sampling_logp_difference/max": 1.442218542098999, "sampling/sampling_logp_difference/mean": 0.3147695064544678, "step": 1175, "step_time": 15.92486811101844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9451744109392166, "epoch": 0.00588, "grad_norm": 0.21729044616222382, "kl": 0.21388403698801994, "learning_rate": 7.999879730964677e-06, "loss": -0.0764, "step": 1176, "step_time": 6.363317373004975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.11458980664610863, "epoch": 0.005885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004622316046152264, "kl": 0.25212350860238075, "learning_rate": 7.99987951987524e-06, "loss": 0.0006, "num_tokens": 15512836.0, "reward": 1.5684185028076172, "reward_std": 0.34234702587127686, "rewards/rollout_reward_func/mean": 1.5684185028076172, "rewards/rollout_reward_func/std": 0.34234705567359924, "sampling/importance_sampling_ratio/max": 1.0260612964630127, "sampling/importance_sampling_ratio/mean": 1.0116900205612183, "sampling/importance_sampling_ratio/min": 0.9957129955291748, "sampling/sampling_logp_difference/max": 0.02851998619735241, "sampling/sampling_logp_difference/mean": 0.006236208137124777, "step": 1177, "step_time": 6.818790714984061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12284771725535393, "epoch": 0.00589, "grad_norm": 0.00048627424985170364, "kl": 0.2519334889948368, "learning_rate": 7.999879308600722e-06, "loss": 0.0006, "step": 1178, "step_time": 3.3872166779910913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3588568000122905, "epoch": 0.005895, "frac_reward_zero_std": 0.0, "grad_norm": 0.1375861018896103, "kl": 0.25429031252861023, "learning_rate": 7.999879097141123e-06, "loss": -0.065, "num_tokens": 15541201.0, "reward": 1.08818781375885, "reward_std": 1.2542325258255005, "rewards/rollout_reward_func/mean": 1.08818781375885, "rewards/rollout_reward_func/std": 1.25423264503479, "sampling/importance_sampling_ratio/max": 1.0371571779251099, "sampling/importance_sampling_ratio/mean": 0.7835978269577026, "sampling/importance_sampling_ratio/min": 4.1380374682375987e-07, "sampling/sampling_logp_difference/max": 2.7567739486694336, "sampling/sampling_logp_difference/mean": 0.23602944612503052, "step": 1179, "step_time": 14.640341057995101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3636628724634647, "epoch": 0.0059, "grad_norm": 0.11356958001852036, "kl": 0.2648134306073189, "learning_rate": 7.999878885496445e-06, "loss": -0.0656, "step": 1180, "step_time": 7.562406468990957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9752740561962128, "epoch": 0.005905, "frac_reward_zero_std": 0.0, "grad_norm": 0.24220424890518188, "kl": 0.4068039171397686, "learning_rate": 7.999878673666688e-06, "loss": -0.069, "num_tokens": 15569908.0, "reward": -0.1349620372056961, "reward_std": 1.17525315284729, "rewards/rollout_reward_func/mean": -0.1349620372056961, "rewards/rollout_reward_func/std": 1.17525315284729, "sampling/importance_sampling_ratio/max": 1.0327376127243042, "sampling/importance_sampling_ratio/mean": 0.47849589586257935, "sampling/importance_sampling_ratio/min": 0.0001445864763809368, "sampling/sampling_logp_difference/max": 1.4870697259902954, "sampling/sampling_logp_difference/mean": 0.33213406801223755, "step": 1181, "step_time": 15.039463963999879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.06960227340459824, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06960227340459824, "entropy": 2.062589317560196, "epoch": 0.00591, "grad_norm": 0.10812108963727951, "kl": 0.379198394715786, "learning_rate": 7.99987846165185e-06, "loss": -0.0707, "step": 1182, "step_time": 7.057648207992315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.257300227880478, "epoch": 0.005915, "frac_reward_zero_std": 0.0, "grad_norm": 0.2767360508441925, "kl": 0.4379630275070667, "learning_rate": 7.99987824945193e-06, "loss": -0.0457, "num_tokens": 15599263.0, "reward": 0.13443231582641602, "reward_std": 1.2571632862091064, "rewards/rollout_reward_func/mean": 0.13443231582641602, "rewards/rollout_reward_func/std": 1.257163405418396, "sampling/importance_sampling_ratio/max": 1.0237425565719604, "sampling/importance_sampling_ratio/mean": 0.40898412466049194, "sampling/importance_sampling_ratio/min": 0.0004274891398381442, "sampling/sampling_logp_difference/max": 1.6643418073654175, "sampling/sampling_logp_difference/mean": 0.36281001567840576, "step": 1183, "step_time": 15.121867006004322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 2.3096095621585846, "epoch": 0.00592, "grad_norm": 0.20865152776241302, "kl": 0.45628149062395096, "learning_rate": 7.999878037066934e-06, "loss": -0.0462, "step": 1184, "step_time": 7.012219599986565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.1019328236579895, "epoch": 0.005925, "frac_reward_zero_std": 0.0, "grad_norm": 0.14037033915519714, "kl": 0.1920167114585638, "learning_rate": 7.999877824496856e-06, "loss": -0.0722, "num_tokens": 15628466.0, "reward": -0.36443403363227844, "reward_std": 0.9825606346130371, "rewards/rollout_reward_func/mean": -0.36443403363227844, "rewards/rollout_reward_func/std": 0.9825606942176819, "sampling/importance_sampling_ratio/max": 1.011210322380066, "sampling/importance_sampling_ratio/mean": 0.33328866958618164, "sampling/importance_sampling_ratio/min": 4.577944139327883e-07, "sampling/sampling_logp_difference/max": 1.8550691604614258, "sampling/sampling_logp_difference/mean": 0.48615676164627075, "step": 1185, "step_time": 15.585533199002384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.106763392686844, "epoch": 0.00593, "grad_norm": 0.14163587987422943, "kl": 0.19656861573457718, "learning_rate": 7.999877611741699e-06, "loss": -0.0728, "step": 1186, "step_time": 6.841764111974044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 11.625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.9071902334690094, "epoch": 0.005935, "frac_reward_zero_std": 0.0, "grad_norm": 0.04662630707025528, "kl": 0.3547134976834059, "learning_rate": 7.99987739880146e-06, "loss": -0.0453, "num_tokens": 15663285.0, "reward": -0.1419600248336792, "reward_std": 1.0484843254089355, "rewards/rollout_reward_func/mean": -0.1419600248336792, "rewards/rollout_reward_func/std": 1.0484843254089355, "sampling/importance_sampling_ratio/max": 1.0361233949661255, "sampling/importance_sampling_ratio/mean": 0.26370251178741455, "sampling/importance_sampling_ratio/min": 4.01769417734954e-09, "sampling/sampling_logp_difference/max": 2.226609468460083, "sampling/sampling_logp_difference/mean": 0.5114789605140686, "step": 1187, "step_time": 17.706190160024562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9117263555526733, "epoch": 0.00594, "grad_norm": 0.04491043463349342, "kl": 0.372498270124197, "learning_rate": 7.999877185676144e-06, "loss": -0.0453, "step": 1188, "step_time": 7.656555532972561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.045171558856964, "epoch": 0.005945, "frac_reward_zero_std": 0.0, "grad_norm": 0.15976038575172424, "kl": 0.24986156821250916, "learning_rate": 7.999876972365746e-06, "loss": -0.0618, "num_tokens": 15689992.0, "reward": 0.7315411567687988, "reward_std": 1.3240783214569092, "rewards/rollout_reward_func/mean": 0.7315411567687988, "rewards/rollout_reward_func/std": 1.3240782022476196, "sampling/importance_sampling_ratio/max": 1.016036868095398, "sampling/importance_sampling_ratio/mean": 0.4907737374305725, "sampling/importance_sampling_ratio/min": 0.0001403315254719928, "sampling/sampling_logp_difference/max": 1.5870163440704346, "sampling/sampling_logp_difference/mean": 0.2891874313354492, "step": 1189, "step_time": 16.331469143988215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.042376935482025, "epoch": 0.00595, "grad_norm": 0.14456263184547424, "kl": 0.2559427246451378, "learning_rate": 7.999876758870269e-06, "loss": -0.0622, "step": 1190, "step_time": 6.882729610995739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.592632457613945, "epoch": 0.005955, "frac_reward_zero_std": 0.0, "grad_norm": 0.0392770953476429, "kl": 0.4280170351266861, "learning_rate": 7.999876545189713e-06, "loss": -0.0604, "num_tokens": 15717629.0, "reward": -0.620318591594696, "reward_std": 0.23860810697078705, "rewards/rollout_reward_func/mean": -0.620318591594696, "rewards/rollout_reward_func/std": 0.23860812187194824, "sampling/importance_sampling_ratio/max": 1.0305333137512207, "sampling/importance_sampling_ratio/mean": 0.5432791709899902, "sampling/importance_sampling_ratio/min": 0.00044220194104127586, "sampling/sampling_logp_difference/max": 1.773207664489746, "sampling/sampling_logp_difference/mean": 0.32741034030914307, "step": 1191, "step_time": 14.19306528699235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5798682570457458, "epoch": 0.00596, "grad_norm": 0.04170430079102516, "kl": 0.43893884867429733, "learning_rate": 7.999876331324075e-06, "loss": -0.0604, "step": 1192, "step_time": 7.077665545017226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1702000983059406, "epoch": 0.005965, "frac_reward_zero_std": 0.0, "grad_norm": 0.16131940484046936, "kl": 0.2709423005580902, "learning_rate": 7.999876117273359e-06, "loss": -0.0717, "num_tokens": 15740606.0, "reward": 1.5185086727142334, "reward_std": 1.074516773223877, "rewards/rollout_reward_func/mean": 1.5185086727142334, "rewards/rollout_reward_func/std": 1.074516773223877, "sampling/importance_sampling_ratio/max": 1.0551824569702148, "sampling/importance_sampling_ratio/mean": 0.823149561882019, "sampling/importance_sampling_ratio/min": 5.099208465253469e-06, "sampling/sampling_logp_difference/max": 2.1165499687194824, "sampling/sampling_logp_difference/mean": 0.2924388349056244, "step": 1193, "step_time": 9.90030042498256 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 1.1204970516264439, "epoch": 0.00597, "grad_norm": 0.06559480726718903, "kl": 0.2737504690885544, "learning_rate": 7.999875903037562e-06, "loss": -0.0725, "step": 1194, "step_time": 5.020858785021119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 6.545454978942871, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6187902688980103, "epoch": 0.005975, "frac_reward_zero_std": 0.0, "grad_norm": 0.05436981841921806, "kl": 0.24005858600139618, "learning_rate": 7.999875688616684e-06, "loss": -0.1172, "num_tokens": 15774884.0, "reward": 0.478854775428772, "reward_std": 1.2949670553207397, "rewards/rollout_reward_func/mean": 0.478854775428772, "rewards/rollout_reward_func/std": 1.2949670553207397, "sampling/importance_sampling_ratio/max": 1.0519133806228638, "sampling/importance_sampling_ratio/mean": 0.515487551689148, "sampling/importance_sampling_ratio/min": 2.0145192536347167e-07, "sampling/sampling_logp_difference/max": 1.9253779649734497, "sampling/sampling_logp_difference/mean": 0.4530874192714691, "step": 1195, "step_time": 17.333826422996935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.609399825334549, "epoch": 0.00598, "grad_norm": 0.04019003361463547, "kl": 0.2386842928826809, "learning_rate": 7.999875474010728e-06, "loss": -0.1175, "step": 1196, "step_time": 7.951612421005848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 6.090909481048584, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0181561410427094, "epoch": 0.005985, "frac_reward_zero_std": 0.0, "grad_norm": 0.029787227511405945, "kl": 0.29424968734383583, "learning_rate": 7.999875259219693e-06, "loss": -0.02, "num_tokens": 15807525.0, "reward": 0.5454361438751221, "reward_std": 1.366398811340332, "rewards/rollout_reward_func/mean": 0.5454361438751221, "rewards/rollout_reward_func/std": 1.366398811340332, "sampling/importance_sampling_ratio/max": 1.0481276512145996, "sampling/importance_sampling_ratio/mean": 0.5141754746437073, "sampling/importance_sampling_ratio/min": 1.0752508387668058e-05, "sampling/sampling_logp_difference/max": 1.830956220626831, "sampling/sampling_logp_difference/mean": 0.2790634334087372, "step": 1197, "step_time": 19.436654129982344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0155321061611176, "epoch": 0.00599, "grad_norm": 0.028991516679525375, "kl": 0.2944136094301939, "learning_rate": 7.999875044243576e-06, "loss": -0.02, "step": 1198, "step_time": 8.822127268009353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4478207118809223, "epoch": 0.005995, "frac_reward_zero_std": 0.5, "grad_norm": 0.004098448436707258, "kl": 0.2690822407603264, "learning_rate": 7.999874829082381e-06, "loss": -0.0361, "num_tokens": 15830367.0, "reward": 1.5479512214660645, "reward_std": 0.6484266519546509, "rewards/rollout_reward_func/mean": 1.5479512214660645, "rewards/rollout_reward_func/std": 0.6484266519546509, "sampling/importance_sampling_ratio/max": 1.0834261178970337, "sampling/importance_sampling_ratio/mean": 0.9215657711029053, "sampling/importance_sampling_ratio/min": 0.006336477119475603, "sampling/sampling_logp_difference/max": 0.8111088275909424, "sampling/sampling_logp_difference/mean": 0.07079695165157318, "step": 1199, "step_time": 11.628205087021342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4217846915125847, "epoch": 0.006, "grad_norm": 0.0037694904021918774, "kl": 0.27112725749611855, "learning_rate": 7.999874613736106e-06, "loss": -0.0361, "step": 1200, "step_time": 5.502378600998782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8830735497176647, "epoch": 0.006005, "frac_reward_zero_std": 0.0, "grad_norm": 0.03753498196601868, "kl": 0.3706761822104454, "learning_rate": 7.99987439820475e-06, "loss": -0.077, "num_tokens": 15853799.0, "reward": 1.6558846235275269, "reward_std": 0.6571345329284668, "rewards/rollout_reward_func/mean": 1.6558846235275269, "rewards/rollout_reward_func/std": 0.6571345329284668, "sampling/importance_sampling_ratio/max": 1.0588834285736084, "sampling/importance_sampling_ratio/mean": 0.8895785808563232, "sampling/importance_sampling_ratio/min": 9.500056876277085e-06, "sampling/sampling_logp_difference/max": 1.8372235298156738, "sampling/sampling_logp_difference/mean": 0.23356594145298004, "step": 1201, "step_time": 11.961980467982357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8779375180602074, "epoch": 0.00601, "grad_norm": 0.035835083574056625, "kl": 0.37872999534010887, "learning_rate": 7.999874182488315e-06, "loss": -0.077, "step": 1202, "step_time": 5.824396244002855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 6.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.891428142786026, "epoch": 0.006015, "frac_reward_zero_std": 0.5, "grad_norm": 0.010515335947275162, "kl": 0.17718975245952606, "learning_rate": 7.999873966586802e-06, "loss": 0.0002, "num_tokens": 15875833.0, "reward": 0.3996543288230896, "reward_std": 1.6613967418670654, "rewards/rollout_reward_func/mean": 0.3996543288230896, "rewards/rollout_reward_func/std": 1.661396861076355, "sampling/importance_sampling_ratio/max": 1.0223230123519897, "sampling/importance_sampling_ratio/mean": 0.5124446153640747, "sampling/importance_sampling_ratio/min": 1.4902788279869128e-05, "sampling/sampling_logp_difference/max": 1.9780694246292114, "sampling/sampling_logp_difference/mean": 0.2635919749736786, "step": 1203, "step_time": 14.234123707996332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8864942789077759, "epoch": 0.00602, "grad_norm": 0.009705672040581703, "kl": 0.1771555319428444, "learning_rate": 7.999873750500208e-06, "loss": 0.0001, "step": 1204, "step_time": 5.574073437004699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4288154542446136, "epoch": 0.006025, "frac_reward_zero_std": 0.5, "grad_norm": 0.022448034957051277, "kl": 0.2498524971306324, "learning_rate": 7.999873534228532e-06, "loss": -0.033, "num_tokens": 15900477.0, "reward": 0.7157838344573975, "reward_std": 1.4037567377090454, "rewards/rollout_reward_func/mean": 0.7157838344573975, "rewards/rollout_reward_func/std": 1.4037567377090454, "sampling/importance_sampling_ratio/max": 1.1587001085281372, "sampling/importance_sampling_ratio/mean": 0.7124764323234558, "sampling/importance_sampling_ratio/min": 0.0003354024956934154, "sampling/sampling_logp_difference/max": 1.485687255859375, "sampling/sampling_logp_difference/mean": 0.17396102845668793, "step": 1205, "step_time": 14.72842336098256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4188199937343597, "epoch": 0.00603, "grad_norm": 0.0193011537194252, "kl": 0.24833669513463974, "learning_rate": 7.99987331777178e-06, "loss": -0.033, "step": 1206, "step_time": 6.1670071039989125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6319253742694855, "epoch": 0.006035, "frac_reward_zero_std": 0.5, "grad_norm": 0.027450479567050934, "kl": 0.15837344899773598, "learning_rate": 7.999873101129946e-06, "loss": -0.0568, "num_tokens": 15924330.0, "reward": 1.1121989488601685, "reward_std": 1.3187592029571533, "rewards/rollout_reward_func/mean": 1.1121989488601685, "rewards/rollout_reward_func/std": 1.3187590837478638, "sampling/importance_sampling_ratio/max": 1.043071985244751, "sampling/importance_sampling_ratio/mean": 0.7000139951705933, "sampling/importance_sampling_ratio/min": 1.2214134585519787e-05, "sampling/sampling_logp_difference/max": 1.6573272943496704, "sampling/sampling_logp_difference/mean": 0.22295542061328888, "step": 1207, "step_time": 16.42865496801096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.629518747329712, "epoch": 0.00604, "grad_norm": 0.024854909628629684, "kl": 0.15803946182131767, "learning_rate": 7.999872884303033e-06, "loss": -0.0569, "step": 1208, "step_time": 5.52641792097711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.658526450395584, "epoch": 0.006045, "frac_reward_zero_std": 0.0, "grad_norm": 0.18393035233020782, "kl": 1.0250166282057762, "learning_rate": 7.99987266729104e-06, "loss": -0.0807, "num_tokens": 15957270.0, "reward": 1.2613019943237305, "reward_std": 1.1123114824295044, "rewards/rollout_reward_func/mean": 1.2613019943237305, "rewards/rollout_reward_func/std": 1.112311601638794, "sampling/importance_sampling_ratio/max": 1.0517053604125977, "sampling/importance_sampling_ratio/mean": 0.6781114339828491, "sampling/importance_sampling_ratio/min": 4.820089088752866e-05, "sampling/sampling_logp_difference/max": 1.7212955951690674, "sampling/sampling_logp_difference/mean": 0.24780350923538208, "step": 1209, "step_time": 17.547897455006023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6528874337673187, "epoch": 0.00605, "grad_norm": 0.2103106826543808, "kl": 0.9727629497647285, "learning_rate": 7.999872450093968e-06, "loss": -0.0816, "step": 1210, "step_time": 6.991821217990946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1367633286863565, "epoch": 0.006055, "frac_reward_zero_std": 0.0, "grad_norm": 0.19846732914447784, "kl": 1.0836482308804989, "learning_rate": 7.999872232711817e-06, "loss": -0.0741, "num_tokens": 15981693.0, "reward": 0.28537026047706604, "reward_std": 1.149306297302246, "rewards/rollout_reward_func/mean": 0.28537026047706604, "rewards/rollout_reward_func/std": 1.1493064165115356, "sampling/importance_sampling_ratio/max": 1.0429295301437378, "sampling/importance_sampling_ratio/mean": 0.8312879800796509, "sampling/importance_sampling_ratio/min": 1.8258713225804968e-06, "sampling/sampling_logp_difference/max": 1.706558346748352, "sampling/sampling_logp_difference/mean": 0.2619748115539551, "step": 1211, "step_time": 13.377092808979796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1342517398297787, "epoch": 0.00606, "grad_norm": 0.17067719995975494, "kl": 0.978788074105978, "learning_rate": 7.999872015144586e-06, "loss": -0.0748, "step": 1212, "step_time": 6.268650669997442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8064631819725037, "epoch": 0.006065, "frac_reward_zero_std": 0.0, "grad_norm": 0.1714819222688675, "kl": 0.6014525918290019, "learning_rate": 7.999871797392275e-06, "loss": -0.0925, "num_tokens": 16014825.0, "reward": 0.03551200032234192, "reward_std": 1.1841291189193726, "rewards/rollout_reward_func/mean": 0.03551200032234192, "rewards/rollout_reward_func/std": 1.1841291189193726, "sampling/importance_sampling_ratio/max": 1.0509614944458008, "sampling/importance_sampling_ratio/mean": 0.40043696761131287, "sampling/importance_sampling_ratio/min": 1.592534317751415e-05, "sampling/sampling_logp_difference/max": 1.62784743309021, "sampling/sampling_logp_difference/mean": 0.48886948823928833, "step": 1213, "step_time": 17.74130474498088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8103758096694946, "epoch": 0.00607, "grad_norm": 0.030771546065807343, "kl": 0.5312991328537464, "learning_rate": 7.999871579454884e-06, "loss": -0.0925, "step": 1214, "step_time": 7.932506135010044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.07648395374417305, "epoch": 0.006075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003278047952335328, "kl": 0.2638951390981674, "learning_rate": 7.999871361332416e-06, "loss": 0.0007, "num_tokens": 16032805.0, "reward": 0.4735277593135834, "reward_std": 1.4731453657150269, "rewards/rollout_reward_func/mean": 0.4735277593135834, "rewards/rollout_reward_func/std": 1.4731453657150269, "sampling/importance_sampling_ratio/max": 1.0588514804840088, "sampling/importance_sampling_ratio/mean": 1.0306768417358398, "sampling/importance_sampling_ratio/min": 1.0079625844955444, "sampling/sampling_logp_difference/max": 0.037739306688308716, "sampling/sampling_logp_difference/mean": 0.007815544493496418, "step": 1215, "step_time": 6.041122289010673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07965443655848503, "epoch": 0.00608, "grad_norm": 0.000374120834749192, "kl": 0.263506006449461, "learning_rate": 7.999871143024866e-06, "loss": 0.0007, "step": 1216, "step_time": 3.3009512679855106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.202490694820881, "epoch": 0.006085, "frac_reward_zero_std": 0.5, "grad_norm": 0.31802037358283997, "kl": 0.9784924611449242, "learning_rate": 7.999870924532236e-06, "loss": -0.0035, "num_tokens": 16056261.0, "reward": 1.1318793296813965, "reward_std": 1.265249490737915, "rewards/rollout_reward_func/mean": 1.1318793296813965, "rewards/rollout_reward_func/std": 1.265249490737915, "sampling/importance_sampling_ratio/max": 1.1041114330291748, "sampling/importance_sampling_ratio/mean": 0.8331881165504456, "sampling/importance_sampling_ratio/min": 3.3767395279937773e-07, "sampling/sampling_logp_difference/max": 1.8939255475997925, "sampling/sampling_logp_difference/mean": 0.29173287749290466, "step": 1217, "step_time": 12.309341251006117 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.2129439674317837, "epoch": 0.00609, "grad_norm": 0.3021293580532074, "kl": 0.7742540687322617, "learning_rate": 7.999870705854528e-06, "loss": -0.0053, "step": 1218, "step_time": 5.9447623119922355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 12.625, "completions/mean_terminated_length": 5.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.443666636943817, "epoch": 0.006095, "frac_reward_zero_std": 0.0, "grad_norm": 0.05221165716648102, "kl": 0.07904005143791437, "learning_rate": 7.99987048699174e-06, "loss": -0.0575, "num_tokens": 16091671.0, "reward": -0.5386420488357544, "reward_std": 0.7296705842018127, "rewards/rollout_reward_func/mean": -0.5386420488357544, "rewards/rollout_reward_func/std": 0.7296706438064575, "sampling/importance_sampling_ratio/max": 1.1823770999908447, "sampling/importance_sampling_ratio/mean": 0.2718035578727722, "sampling/importance_sampling_ratio/min": 3.047524660360068e-05, "sampling/sampling_logp_difference/max": 1.946650743484497, "sampling/sampling_logp_difference/mean": 0.46712541580200195, "step": 1219, "step_time": 21.33058882500336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.445243775844574, "epoch": 0.0061, "grad_norm": 0.05902174860239029, "kl": 0.07409610226750374, "learning_rate": 7.999870267943871e-06, "loss": -0.0573, "step": 1220, "step_time": 9.661830436001765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.008413314819336, "epoch": 0.006105, "frac_reward_zero_std": 0.0, "grad_norm": 0.1583373248577118, "kl": 0.15546788647770882, "learning_rate": 7.999870048710924e-06, "loss": -0.1031, "num_tokens": 16124585.0, "reward": 0.1622285544872284, "reward_std": 1.2094956636428833, "rewards/rollout_reward_func/mean": 0.1622285544872284, "rewards/rollout_reward_func/std": 1.2094956636428833, "sampling/importance_sampling_ratio/max": 1.1683725118637085, "sampling/importance_sampling_ratio/mean": 0.5173400044441223, "sampling/importance_sampling_ratio/min": 0.0012861596187576652, "sampling/sampling_logp_difference/max": 1.1972541809082031, "sampling/sampling_logp_difference/mean": 0.2857111096382141, "step": 1221, "step_time": 17.65860345699184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.0190420746803284, "epoch": 0.00611, "grad_norm": 0.14738711714744568, "kl": 0.16096597164869308, "learning_rate": 7.999869829292899e-06, "loss": -0.1034, "step": 1222, "step_time": 7.140906201995676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.9375, "completions/mean_terminated_length": 5.875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0841837525367737, "epoch": 0.006115, "frac_reward_zero_std": 0.0, "grad_norm": 0.05766427144408226, "kl": 0.13030976243317127, "learning_rate": 7.999869609689794e-06, "loss": -0.112, "num_tokens": 16156011.0, "reward": 0.11777277290821075, "reward_std": 1.3389204740524292, "rewards/rollout_reward_func/mean": 0.11777277290821075, "rewards/rollout_reward_func/std": 1.3389204740524292, "sampling/importance_sampling_ratio/max": 1.1517328023910522, "sampling/importance_sampling_ratio/mean": 0.3985866606235504, "sampling/importance_sampling_ratio/min": 1.2363344126242737e-07, "sampling/sampling_logp_difference/max": 2.08733868598938, "sampling/sampling_logp_difference/mean": 0.48838502168655396, "step": 1223, "step_time": 17.733007905000704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.086894989013672, "epoch": 0.00612, "grad_norm": 0.06128580495715141, "kl": 0.12921085581183434, "learning_rate": 7.999869389901607e-06, "loss": -0.1119, "step": 1224, "step_time": 6.856767757999478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9522295445203781, "epoch": 0.006125, "frac_reward_zero_std": 0.0, "grad_norm": 0.18881605565547943, "kl": 0.22331054508686066, "learning_rate": 7.999869169928343e-06, "loss": -0.0939, "num_tokens": 16182813.0, "reward": 0.6015863418579102, "reward_std": 1.3107019662857056, "rewards/rollout_reward_func/mean": 0.6015863418579102, "rewards/rollout_reward_func/std": 1.3107020854949951, "sampling/importance_sampling_ratio/max": 1.085248589515686, "sampling/importance_sampling_ratio/mean": 0.5746474266052246, "sampling/importance_sampling_ratio/min": 9.952005711966194e-06, "sampling/sampling_logp_difference/max": 1.722578763961792, "sampling/sampling_logp_difference/mean": 0.3203580975532532, "step": 1225, "step_time": 16.05656310399354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9362523257732391, "epoch": 0.00613, "grad_norm": 0.1848548799753189, "kl": 0.2286750227212906, "learning_rate": 7.999868949769999e-06, "loss": -0.0941, "step": 1226, "step_time": 6.3025222640135325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 5.0625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1501847933977842, "epoch": 0.006135, "frac_reward_zero_std": 0.0, "grad_norm": 0.07381987571716309, "kl": 0.3414461761713028, "learning_rate": 7.999868729426575e-06, "loss": -0.0596, "num_tokens": 16206834.0, "reward": -0.17297042906284332, "reward_std": 0.7785312533378601, "rewards/rollout_reward_func/mean": -0.17297042906284332, "rewards/rollout_reward_func/std": 0.7785312533378601, "sampling/importance_sampling_ratio/max": 1.111668586730957, "sampling/importance_sampling_ratio/mean": 0.838973879814148, "sampling/importance_sampling_ratio/min": 4.1986609176092315e-06, "sampling/sampling_logp_difference/max": 1.8883881568908691, "sampling/sampling_logp_difference/mean": 0.25095224380493164, "step": 1227, "step_time": 12.350101921023452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.145021304488182, "epoch": 0.00614, "grad_norm": 0.07471079379320145, "kl": 0.35306037962436676, "learning_rate": 7.999868508898073e-06, "loss": -0.0594, "step": 1228, "step_time": 6.324637092009652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1451107561588287, "epoch": 0.006145, "frac_reward_zero_std": 0.0, "grad_norm": 0.050913043320178986, "kl": 0.2555070873349905, "learning_rate": 7.99986828818449e-06, "loss": -0.0665, "num_tokens": 16234727.0, "reward": 0.12492096424102783, "reward_std": 1.358891487121582, "rewards/rollout_reward_func/mean": 0.12492096424102783, "rewards/rollout_reward_func/std": 1.358891487121582, "sampling/importance_sampling_ratio/max": 1.3332035541534424, "sampling/importance_sampling_ratio/mean": 0.5554823875427246, "sampling/importance_sampling_ratio/min": 0.0009922225726768374, "sampling/sampling_logp_difference/max": 1.5879969596862793, "sampling/sampling_logp_difference/mean": 0.3047609031200409, "step": 1229, "step_time": 14.179901661991607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1214390993118286, "epoch": 0.00615, "grad_norm": 0.051357053220272064, "kl": 0.2556133270263672, "learning_rate": 7.999868067285828e-06, "loss": -0.0666, "step": 1230, "step_time": 5.995272669999395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0441516041755676, "epoch": 0.006155, "frac_reward_zero_std": 0.0, "grad_norm": 0.042923200875520706, "kl": 0.17214587330818176, "learning_rate": 7.999867846202088e-06, "loss": -0.0651, "num_tokens": 16266470.0, "reward": -0.16625875234603882, "reward_std": 1.0051841735839844, "rewards/rollout_reward_func/mean": -0.16625875234603882, "rewards/rollout_reward_func/std": 1.0051841735839844, "sampling/importance_sampling_ratio/max": 1.1426950693130493, "sampling/importance_sampling_ratio/mean": 0.5406221747398376, "sampling/importance_sampling_ratio/min": 2.049858665031934e-07, "sampling/sampling_logp_difference/max": 2.2604246139526367, "sampling/sampling_logp_difference/mean": 0.4747144877910614, "step": 1231, "step_time": 17.342197331003263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0447311401367188, "epoch": 0.00616, "grad_norm": 0.04100585728883743, "kl": 0.17736204527318478, "learning_rate": 7.999867624933267e-06, "loss": -0.0651, "step": 1232, "step_time": 6.852775810009916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4598050899803638, "epoch": 0.006165, "frac_reward_zero_std": 0.0, "grad_norm": 0.03391989320516586, "kl": 0.3684256337583065, "learning_rate": 7.999867403479368e-06, "loss": -0.0703, "num_tokens": 16295091.0, "reward": 1.5232784748077393, "reward_std": 0.7766949534416199, "rewards/rollout_reward_func/mean": 1.5232784748077393, "rewards/rollout_reward_func/std": 0.7766949534416199, "sampling/importance_sampling_ratio/max": 1.0965207815170288, "sampling/importance_sampling_ratio/mean": 0.8389619588851929, "sampling/importance_sampling_ratio/min": 2.598401067643863e-07, "sampling/sampling_logp_difference/max": 1.9261195659637451, "sampling/sampling_logp_difference/mean": 0.37521007657051086, "step": 1233, "step_time": 14.427332812003442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4572352804243565, "epoch": 0.00617, "grad_norm": 0.03230263665318489, "kl": 0.3626953065395355, "learning_rate": 7.99986718184039e-06, "loss": -0.0703, "step": 1234, "step_time": 7.879128496977501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9714626371860504, "epoch": 0.006175, "frac_reward_zero_std": 0.0, "grad_norm": 0.10803970694541931, "kl": 0.3066450320184231, "learning_rate": 7.99986696001633e-06, "loss": -0.0942, "num_tokens": 16327623.0, "reward": 0.6866927146911621, "reward_std": 1.187622308731079, "rewards/rollout_reward_func/mean": 0.6866927146911621, "rewards/rollout_reward_func/std": 1.187622308731079, "sampling/importance_sampling_ratio/max": 1.1582589149475098, "sampling/importance_sampling_ratio/mean": 0.6270065307617188, "sampling/importance_sampling_ratio/min": 7.139128683775198e-07, "sampling/sampling_logp_difference/max": 1.6805989742279053, "sampling/sampling_logp_difference/mean": 0.34313374757766724, "step": 1235, "step_time": 16.10431212701951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9669525474309921, "epoch": 0.00618, "grad_norm": 0.10907507687807083, "kl": 0.3181342575699091, "learning_rate": 7.999866738007194e-06, "loss": -0.0946, "step": 1236, "step_time": 7.086107819006429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9130276367068291, "epoch": 0.006185, "frac_reward_zero_std": 0.5, "grad_norm": 0.29222145676612854, "kl": 0.20944395661354065, "learning_rate": 7.999866515812976e-06, "loss": 0.0002, "num_tokens": 16349812.0, "reward": 0.6326836943626404, "reward_std": 1.420367956161499, "rewards/rollout_reward_func/mean": 0.6326836943626404, "rewards/rollout_reward_func/std": 1.4203680753707886, "sampling/importance_sampling_ratio/max": 1.074278712272644, "sampling/importance_sampling_ratio/mean": 0.8068649768829346, "sampling/importance_sampling_ratio/min": 0.0011337855830788612, "sampling/sampling_logp_difference/max": 1.1371116638183594, "sampling/sampling_logp_difference/mean": 0.14672407507896423, "step": 1237, "step_time": 13.688795847003348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9311403557658195, "epoch": 0.00619, "grad_norm": 0.3236655294895172, "kl": 0.20876767486333847, "learning_rate": 7.999866293433682e-06, "loss": -0.0005, "step": 1238, "step_time": 5.691416853995179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 5.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.970582790672779, "epoch": 0.006195, "frac_reward_zero_std": 0.0, "grad_norm": 0.05607741326093674, "kl": 0.2751758396625519, "learning_rate": 7.999866070869305e-06, "loss": -0.0584, "num_tokens": 16375570.0, "reward": 0.592565655708313, "reward_std": 1.336269497871399, "rewards/rollout_reward_func/mean": 0.592565655708313, "rewards/rollout_reward_func/std": 1.336269497871399, "sampling/importance_sampling_ratio/max": 1.123537302017212, "sampling/importance_sampling_ratio/mean": 0.6728077530860901, "sampling/importance_sampling_ratio/min": 3.0087183404248208e-05, "sampling/sampling_logp_difference/max": 1.8166028261184692, "sampling/sampling_logp_difference/mean": 0.345661997795105, "step": 1239, "step_time": 14.158642763999524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9775620698928833, "epoch": 0.0062, "grad_norm": 0.05251169577240944, "kl": 0.26630691438913345, "learning_rate": 7.999865848119852e-06, "loss": -0.0586, "step": 1240, "step_time": 6.509919787000399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.30784461833536625, "epoch": 0.006205, "frac_reward_zero_std": 0.0, "grad_norm": 0.13180992007255554, "kl": 0.9876921102404594, "learning_rate": 7.999865625185319e-06, "loss": -0.0317, "num_tokens": 16397626.0, "reward": 1.8752727508544922, "reward_std": 0.05874830484390259, "rewards/rollout_reward_func/mean": 1.8752727508544922, "rewards/rollout_reward_func/std": 0.05874830484390259, "sampling/importance_sampling_ratio/max": 1.1545584201812744, "sampling/importance_sampling_ratio/mean": 0.9721554517745972, "sampling/importance_sampling_ratio/min": 0.008843629620969296, "sampling/sampling_logp_difference/max": 2.33510160446167, "sampling/sampling_logp_difference/mean": 0.07390843331813812, "step": 1241, "step_time": 9.74650113498501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31187916547060013, "epoch": 0.00621, "grad_norm": 0.11806581169366837, "kl": 0.8614007607102394, "learning_rate": 7.999865402065705e-06, "loss": -0.0323, "step": 1242, "step_time": 4.770757865000633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2059907019138336, "epoch": 0.006215, "frac_reward_zero_std": 0.0, "grad_norm": 0.2801832854747772, "kl": 1.2072114758193493, "learning_rate": 7.999865178761013e-06, "loss": -0.0295, "num_tokens": 16425293.0, "reward": -0.2344505488872528, "reward_std": 1.0710712671279907, "rewards/rollout_reward_func/mean": -0.2344505488872528, "rewards/rollout_reward_func/std": 1.0710713863372803, "sampling/importance_sampling_ratio/max": 1.2164483070373535, "sampling/importance_sampling_ratio/mean": 0.4149564206600189, "sampling/importance_sampling_ratio/min": 5.151184723217739e-06, "sampling/sampling_logp_difference/max": 1.6939359903335571, "sampling/sampling_logp_difference/mean": 0.39986252784729004, "step": 1243, "step_time": 15.443582707011956 }, { "clip_ratio/high_max": 0.043560607358813286, "clip_ratio/high_mean": 0.021780303679406643, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021780303679406643, "entropy": 2.2439107298851013, "epoch": 0.00622, "grad_norm": 0.24213559925556183, "kl": 0.9746368490159512, "learning_rate": 7.999864955271243e-06, "loss": -0.0312, "step": 1244, "step_time": 6.146408311993582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 4.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5718128979206085, "epoch": 0.006225, "frac_reward_zero_std": 0.0, "grad_norm": 0.04638805240392685, "kl": 0.1419915184378624, "learning_rate": 7.999864731596393e-06, "loss": -0.1045, "num_tokens": 16459859.0, "reward": 0.35633596777915955, "reward_std": 1.312288761138916, "rewards/rollout_reward_func/mean": 0.35633596777915955, "rewards/rollout_reward_func/std": 1.3122888803482056, "sampling/importance_sampling_ratio/max": 1.1367595195770264, "sampling/importance_sampling_ratio/mean": 0.5417003631591797, "sampling/importance_sampling_ratio/min": 2.394888269918738e-06, "sampling/sampling_logp_difference/max": 1.9953395128250122, "sampling/sampling_logp_difference/mean": 0.44079113006591797, "step": 1245, "step_time": 17.01456784797483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5716244280338287, "epoch": 0.00623, "grad_norm": 0.04036586731672287, "kl": 0.14327409863471985, "learning_rate": 7.999864507736462e-06, "loss": -0.1047, "step": 1246, "step_time": 6.983453481996548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.218910902738571, "epoch": 0.006235, "frac_reward_zero_std": 0.0, "grad_norm": 0.031236516311764717, "kl": 0.09563211165368557, "learning_rate": 7.999864283691454e-06, "loss": -0.1044, "num_tokens": 16491730.0, "reward": 0.3256269097328186, "reward_std": 1.336686611175537, "rewards/rollout_reward_func/mean": 0.3256269097328186, "rewards/rollout_reward_func/std": 1.336686611175537, "sampling/importance_sampling_ratio/max": 1.1549772024154663, "sampling/importance_sampling_ratio/mean": 0.5240375399589539, "sampling/importance_sampling_ratio/min": 1.0820442184922285e-05, "sampling/sampling_logp_difference/max": 1.7669382095336914, "sampling/sampling_logp_difference/mean": 0.34906935691833496, "step": 1247, "step_time": 17.002319681982044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2198040783405304, "epoch": 0.00624, "grad_norm": 0.03305784612894058, "kl": 0.0945392157882452, "learning_rate": 7.999864059461368e-06, "loss": -0.1045, "step": 1248, "step_time": 6.282053143979283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 3.9000000953674316, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.205018073320389, "epoch": 0.006245, "frac_reward_zero_std": 0.0, "grad_norm": 0.06870485097169876, "kl": 0.7810512520372868, "learning_rate": 7.999863835046201e-06, "loss": -0.0732, "num_tokens": 16522624.0, "reward": 0.6051104068756104, "reward_std": 1.2523224353790283, "rewards/rollout_reward_func/mean": 0.6051104068756104, "rewards/rollout_reward_func/std": 1.2523223161697388, "sampling/importance_sampling_ratio/max": 1.125138759613037, "sampling/importance_sampling_ratio/mean": 0.5507336854934692, "sampling/importance_sampling_ratio/min": 2.7449933881484867e-08, "sampling/sampling_logp_difference/max": 1.923828125, "sampling/sampling_logp_difference/mean": 0.4483862519264221, "step": 1249, "step_time": 17.11064283498854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.20480540394783, "epoch": 0.00625, "grad_norm": 0.06735482066869736, "kl": 0.6646341867744923, "learning_rate": 7.999863610445955e-06, "loss": -0.0735, "step": 1250, "step_time": 6.84432075600489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3998050428926945, "epoch": 0.006255, "frac_reward_zero_std": 0.0, "grad_norm": 0.05733745917677879, "kl": 0.3810390792787075, "learning_rate": 7.999863385660631e-06, "loss": -0.0722, "num_tokens": 16540731.0, "reward": 0.4173540472984314, "reward_std": 1.417479157447815, "rewards/rollout_reward_func/mean": 0.4173540472984314, "rewards/rollout_reward_func/std": 1.4174790382385254, "sampling/importance_sampling_ratio/max": 1.0948259830474854, "sampling/importance_sampling_ratio/mean": 0.8518527746200562, "sampling/importance_sampling_ratio/min": 4.349453206486942e-07, "sampling/sampling_logp_difference/max": 2.3405330181121826, "sampling/sampling_logp_difference/mean": 0.2846553921699524, "step": 1251, "step_time": 10.432537284985301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.407705120742321, "epoch": 0.00626, "grad_norm": 0.06308052688837051, "kl": 0.35502905026078224, "learning_rate": 7.999863160690227e-06, "loss": -0.0722, "step": 1252, "step_time": 5.256143654012703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6948683187365532, "epoch": 0.006265, "frac_reward_zero_std": 0.0, "grad_norm": 0.056744374334812164, "kl": 0.2589949108660221, "learning_rate": 7.999862935534743e-06, "loss": -0.0937, "num_tokens": 16564282.0, "reward": 1.0782208442687988, "reward_std": 1.3276456594467163, "rewards/rollout_reward_func/mean": 1.0782208442687988, "rewards/rollout_reward_func/std": 1.3276456594467163, "sampling/importance_sampling_ratio/max": 1.0916085243225098, "sampling/importance_sampling_ratio/mean": 0.6590225100517273, "sampling/importance_sampling_ratio/min": 1.1017419199177425e-09, "sampling/sampling_logp_difference/max": 2.039808750152588, "sampling/sampling_logp_difference/mean": 0.3252548277378082, "step": 1253, "step_time": 15.081548329006182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6955523490905762, "epoch": 0.00627, "grad_norm": 0.05773689225316048, "kl": 0.259990893304348, "learning_rate": 7.999862710194182e-06, "loss": -0.0937, "step": 1254, "step_time": 5.5220191389962565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.476192146539688, "epoch": 0.006275, "frac_reward_zero_std": 0.0, "grad_norm": 0.10023228824138641, "kl": 0.20417991653084755, "learning_rate": 7.999862484668541e-06, "loss": -0.0446, "num_tokens": 16592680.0, "reward": -0.3515487611293793, "reward_std": 1.0580105781555176, "rewards/rollout_reward_func/mean": -0.3515487611293793, "rewards/rollout_reward_func/std": 1.0580105781555176, "sampling/importance_sampling_ratio/max": 1.0635322332382202, "sampling/importance_sampling_ratio/mean": 0.4887548089027405, "sampling/importance_sampling_ratio/min": 0.00013546226546168327, "sampling/sampling_logp_difference/max": 1.8758949041366577, "sampling/sampling_logp_difference/mean": 0.35334891080856323, "step": 1255, "step_time": 16.797027734995936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.483735829591751, "epoch": 0.00628, "grad_norm": 0.10247652977705002, "kl": 0.1983471494168043, "learning_rate": 7.999862258957822e-06, "loss": -0.0447, "step": 1256, "step_time": 6.832654871017439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 4.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.307714432477951, "epoch": 0.006285, "frac_reward_zero_std": 0.0, "grad_norm": 0.0861162394285202, "kl": 0.16543561220169067, "learning_rate": 7.999862033062022e-06, "loss": -0.1038, "num_tokens": 16623184.0, "reward": 0.3650311529636383, "reward_std": 1.1416871547698975, "rewards/rollout_reward_func/mean": 0.3650311529636383, "rewards/rollout_reward_func/std": 1.141687273979187, "sampling/importance_sampling_ratio/max": 1.215846300125122, "sampling/importance_sampling_ratio/mean": 0.5423363447189331, "sampling/importance_sampling_ratio/min": 6.785594450775534e-05, "sampling/sampling_logp_difference/max": 2.2100942134857178, "sampling/sampling_logp_difference/mean": 0.3482847809791565, "step": 1257, "step_time": 16.32880745998409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.30512934923172, "epoch": 0.00629, "grad_norm": 0.08524779230356216, "kl": 0.1689901277422905, "learning_rate": 7.999861806981145e-06, "loss": -0.1039, "step": 1258, "step_time": 7.112532835031743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 12.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.324182868003845, "epoch": 0.006295, "frac_reward_zero_std": 0.0, "grad_norm": 0.22851864993572235, "kl": 0.05260895658284426, "learning_rate": 7.999861580715188e-06, "loss": -0.1028, "num_tokens": 16658481.0, "reward": 0.047415800392627716, "reward_std": 1.201683759689331, "rewards/rollout_reward_func/mean": 0.047415800392627716, "rewards/rollout_reward_func/std": 1.201683759689331, "sampling/importance_sampling_ratio/max": 1.1017621755599976, "sampling/importance_sampling_ratio/mean": 0.3069339990615845, "sampling/importance_sampling_ratio/min": 1.5882956176938023e-06, "sampling/sampling_logp_difference/max": 1.548727035522461, "sampling/sampling_logp_difference/mean": 0.46473798155784607, "step": 1259, "step_time": 20.158111297001597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3126603960990906, "epoch": 0.0063, "grad_norm": 0.2110041379928589, "kl": 0.05253946827724576, "learning_rate": 7.999861354264154e-06, "loss": -0.1034, "step": 1260, "step_time": 7.943789577984717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 4.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9568510055541992, "epoch": 0.006305, "frac_reward_zero_std": 0.5, "grad_norm": 0.09789317846298218, "kl": 0.1629990115761757, "learning_rate": 7.999861127628039e-06, "loss": -0.0232, "num_tokens": 16681183.0, "reward": 0.4634459912776947, "reward_std": 1.5032641887664795, "rewards/rollout_reward_func/mean": 0.4634459912776947, "rewards/rollout_reward_func/std": 1.5032641887664795, "sampling/importance_sampling_ratio/max": 1.046319603919983, "sampling/importance_sampling_ratio/mean": 0.6366068124771118, "sampling/importance_sampling_ratio/min": 2.7074200659171765e-08, "sampling/sampling_logp_difference/max": 2.002936601638794, "sampling/sampling_logp_difference/mean": 0.28218379616737366, "step": 1261, "step_time": 14.713761475984938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9506113529205322, "epoch": 0.00631, "grad_norm": 0.10253309458494186, "kl": 0.16422361508011818, "learning_rate": 7.999860900806846e-06, "loss": -0.0235, "step": 1262, "step_time": 5.574943579005776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 5.099999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5055788308382034, "epoch": 0.006315, "frac_reward_zero_std": 0.0, "grad_norm": 0.16780105233192444, "kl": 0.6994171962141991, "learning_rate": 7.999860673800573e-06, "loss": -0.0834, "num_tokens": 16713109.0, "reward": 0.2195669263601303, "reward_std": 1.274234652519226, "rewards/rollout_reward_func/mean": 0.2195669263601303, "rewards/rollout_reward_func/std": 1.2742347717285156, "sampling/importance_sampling_ratio/max": 1.0803176164627075, "sampling/importance_sampling_ratio/mean": 0.40662896633148193, "sampling/importance_sampling_ratio/min": 6.335536113510898e-07, "sampling/sampling_logp_difference/max": 1.8651138544082642, "sampling/sampling_logp_difference/mean": 0.4657757878303528, "step": 1263, "step_time": 16.408421757019823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4902508705854416, "epoch": 0.00632, "grad_norm": 0.1512409895658493, "kl": 0.635814618319273, "learning_rate": 7.999860446609223e-06, "loss": -0.084, "step": 1264, "step_time": 7.017792781989556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5959708988666534, "epoch": 0.006325, "frac_reward_zero_std": 0.0, "grad_norm": 0.10652483254671097, "kl": 0.16128677502274513, "learning_rate": 7.999860219232791e-06, "loss": -0.0882, "num_tokens": 16741055.0, "reward": 0.9617127776145935, "reward_std": 1.2358577251434326, "rewards/rollout_reward_func/mean": 0.9617127776145935, "rewards/rollout_reward_func/std": 1.2358579635620117, "sampling/importance_sampling_ratio/max": 1.193355679512024, "sampling/importance_sampling_ratio/mean": 0.71930992603302, "sampling/importance_sampling_ratio/min": 0.00017238616419490427, "sampling/sampling_logp_difference/max": 1.5961073637008667, "sampling/sampling_logp_difference/mean": 0.26482900977134705, "step": 1265, "step_time": 19.174649152031634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5890035331249237, "epoch": 0.00633, "grad_norm": 0.09202297031879425, "kl": 0.16330834478139877, "learning_rate": 7.999859991671283e-06, "loss": -0.0888, "step": 1266, "step_time": 7.888981683005113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2162566483020782, "epoch": 0.006335, "frac_reward_zero_std": 0.0, "grad_norm": 0.23957346379756927, "kl": 0.26203181967139244, "learning_rate": 7.999859763924696e-06, "loss": -0.0703, "num_tokens": 16775329.0, "reward": 0.19649377465248108, "reward_std": 1.2173367738723755, "rewards/rollout_reward_func/mean": 0.19649377465248108, "rewards/rollout_reward_func/std": 1.217336893081665, "sampling/importance_sampling_ratio/max": 1.2540754079818726, "sampling/importance_sampling_ratio/mean": 0.5665177702903748, "sampling/importance_sampling_ratio/min": 8.461931315650872e-07, "sampling/sampling_logp_difference/max": 1.8261299133300781, "sampling/sampling_logp_difference/mean": 0.39283981919288635, "step": 1267, "step_time": 17.37716438699863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1772505939006805, "epoch": 0.00634, "grad_norm": 0.23596659302711487, "kl": 0.2590913288295269, "learning_rate": 7.999859535993029e-06, "loss": -0.0715, "step": 1268, "step_time": 7.557205342003726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0625, "completions/mean_terminated_length": 8.100000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.676818311214447, "epoch": 0.006345, "frac_reward_zero_std": 0.0, "grad_norm": 0.01145158987492323, "kl": 0.14913414511829615, "learning_rate": 7.999859307876284e-06, "loss": -0.0824, "num_tokens": 16804718.0, "reward": -0.3544078469276428, "reward_std": 1.1500078439712524, "rewards/rollout_reward_func/mean": -0.3544078469276428, "rewards/rollout_reward_func/std": 1.150007963180542, "sampling/importance_sampling_ratio/max": 1.0811843872070312, "sampling/importance_sampling_ratio/mean": 0.2591894268989563, "sampling/importance_sampling_ratio/min": 1.391695604979759e-05, "sampling/sampling_logp_difference/max": 2.1134510040283203, "sampling/sampling_logp_difference/mean": 0.5173947811126709, "step": 1269, "step_time": 17.551085424987832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.669967830181122, "epoch": 0.00635, "grad_norm": 0.010520393960177898, "kl": 0.146512258797884, "learning_rate": 7.99985907957446e-06, "loss": -0.0824, "step": 1270, "step_time": 6.830898708009045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.116498440504074, "epoch": 0.006355, "frac_reward_zero_std": 0.0, "grad_norm": 0.10753462463617325, "kl": 0.2044383492320776, "learning_rate": 7.999858851087556e-06, "loss": -0.1038, "num_tokens": 16838335.0, "reward": 0.6713870763778687, "reward_std": 1.1782320737838745, "rewards/rollout_reward_func/mean": 0.6713870763778687, "rewards/rollout_reward_func/std": 1.1782320737838745, "sampling/importance_sampling_ratio/max": 1.0979996919631958, "sampling/importance_sampling_ratio/mean": 0.6340072154998779, "sampling/importance_sampling_ratio/min": 2.2131632704258664e-06, "sampling/sampling_logp_difference/max": 2.2375969886779785, "sampling/sampling_logp_difference/mean": 0.3850112557411194, "step": 1271, "step_time": 17.291596597991884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.105458527803421, "epoch": 0.00636, "grad_norm": 0.1050892099738121, "kl": 0.20534825883805752, "learning_rate": 7.999858622415574e-06, "loss": -0.1037, "step": 1272, "step_time": 7.81117520800035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 5.615385055541992, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6950062960386276, "epoch": 0.006365, "frac_reward_zero_std": 0.0, "grad_norm": 0.3529621958732605, "kl": 2.953927993774414, "learning_rate": 7.999858393558514e-06, "loss": -0.0542, "num_tokens": 16871919.0, "reward": -0.15056230127811432, "reward_std": 0.9262876510620117, "rewards/rollout_reward_func/mean": -0.15056230127811432, "rewards/rollout_reward_func/std": 0.9262876510620117, "sampling/importance_sampling_ratio/max": 1.0601491928100586, "sampling/importance_sampling_ratio/mean": 0.42163968086242676, "sampling/importance_sampling_ratio/min": 2.5164109729303163e-07, "sampling/sampling_logp_difference/max": 2.5034685134887695, "sampling/sampling_logp_difference/mean": 0.4905734360218048, "step": 1273, "step_time": 15.153441513000871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.707858622074127, "epoch": 0.00637, "grad_norm": 0.29041409492492676, "kl": 2.452945362776518, "learning_rate": 7.999858164516375e-06, "loss": -0.0564, "step": 1274, "step_time": 6.926284194996697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.746114619076252, "epoch": 0.006375, "frac_reward_zero_std": 0.5, "grad_norm": 0.0771087184548378, "kl": 0.23279071412980556, "learning_rate": 7.999857935289157e-06, "loss": -0.036, "num_tokens": 16895265.0, "reward": 1.4214094877243042, "reward_std": 1.0667729377746582, "rewards/rollout_reward_func/mean": 1.4214094877243042, "rewards/rollout_reward_func/std": 1.0667730569839478, "sampling/importance_sampling_ratio/max": 1.0592987537384033, "sampling/importance_sampling_ratio/mean": 0.8914855718612671, "sampling/importance_sampling_ratio/min": 0.001091447426006198, "sampling/sampling_logp_difference/max": 1.078258991241455, "sampling/sampling_logp_difference/mean": 0.11936028301715851, "step": 1275, "step_time": 13.760691356015741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7538558095693588, "epoch": 0.00638, "grad_norm": 0.06961841881275177, "kl": 0.2317180223762989, "learning_rate": 7.999857705876862e-06, "loss": -0.0362, "step": 1276, "step_time": 6.201367908011889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2620898745954037, "epoch": 0.006385, "frac_reward_zero_std": 0.0, "grad_norm": 0.0550839938223362, "kl": 0.2964754179120064, "learning_rate": 7.999857476279486e-06, "loss": -0.0349, "num_tokens": 16922977.0, "reward": 0.8149928450584412, "reward_std": 1.3396856784820557, "rewards/rollout_reward_func/mean": 0.8149928450584412, "rewards/rollout_reward_func/std": 1.3396856784820557, "sampling/importance_sampling_ratio/max": 1.0604639053344727, "sampling/importance_sampling_ratio/mean": 0.7287167310714722, "sampling/importance_sampling_ratio/min": 8.678096492076293e-05, "sampling/sampling_logp_difference/max": 1.4732609987258911, "sampling/sampling_logp_difference/mean": 0.21689169108867645, "step": 1277, "step_time": 14.056485795008484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.2692764718085527, "epoch": 0.00639, "grad_norm": 0.04446010664105415, "kl": 0.30982623621821404, "learning_rate": 7.999857246497032e-06, "loss": -0.035, "step": 1278, "step_time": 6.591453099026694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.828729808330536, "epoch": 0.006395, "frac_reward_zero_std": 0.0, "grad_norm": 0.16881099343299866, "kl": 0.20975497737526894, "learning_rate": 7.999857016529499e-06, "loss": -0.0542, "num_tokens": 16953254.0, "reward": -0.12234681844711304, "reward_std": 1.0976346731185913, "rewards/rollout_reward_func/mean": -0.12234681844711304, "rewards/rollout_reward_func/std": 1.0976346731185913, "sampling/importance_sampling_ratio/max": 1.0535086393356323, "sampling/importance_sampling_ratio/mean": 0.3180575966835022, "sampling/importance_sampling_ratio/min": 0.00019098813936579973, "sampling/sampling_logp_difference/max": 1.3190109729766846, "sampling/sampling_logp_difference/mean": 0.38951873779296875, "step": 1279, "step_time": 15.283306216995697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.836872935295105, "epoch": 0.0064, "grad_norm": 0.1650320589542389, "kl": 0.20218314416706562, "learning_rate": 7.999856786376888e-06, "loss": -0.0544, "step": 1280, "step_time": 6.2778594919946045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6505610346794128, "epoch": 0.006405, "frac_reward_zero_std": 0.0, "grad_norm": 0.27876755595207214, "kl": 2.0581919960677624, "learning_rate": 7.999856556039197e-06, "loss": -0.0596, "num_tokens": 16977237.0, "reward": 1.261540412902832, "reward_std": 1.1914722919464111, "rewards/rollout_reward_func/mean": 1.261540412902832, "rewards/rollout_reward_func/std": 1.1914722919464111, "sampling/importance_sampling_ratio/max": 1.0944249629974365, "sampling/importance_sampling_ratio/mean": 0.6572644710540771, "sampling/importance_sampling_ratio/min": 0.0009276475757360458, "sampling/sampling_logp_difference/max": 1.651572823524475, "sampling/sampling_logp_difference/mean": 0.28389617800712585, "step": 1281, "step_time": 13.685646024983726 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.6578312814235687, "epoch": 0.00641, "grad_norm": 0.18707701563835144, "kl": 1.615467183291912, "learning_rate": 7.99985632551643e-06, "loss": -0.0614, "step": 1282, "step_time": 6.812377718015341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 6.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.1889137029647827, "epoch": 0.006415, "frac_reward_zero_std": 0.0, "grad_norm": 0.08535996079444885, "kl": 0.23277081549167633, "learning_rate": 7.999856094808581e-06, "loss": -0.0769, "num_tokens": 17007534.0, "reward": 0.02334143966436386, "reward_std": 1.23631751537323, "rewards/rollout_reward_func/mean": 0.02334143966436386, "rewards/rollout_reward_func/std": 1.23631751537323, "sampling/importance_sampling_ratio/max": 1.0690752267837524, "sampling/importance_sampling_ratio/mean": 0.3355914354324341, "sampling/importance_sampling_ratio/min": 1.8120434106094763e-05, "sampling/sampling_logp_difference/max": 2.1318557262420654, "sampling/sampling_logp_difference/mean": 0.5040600299835205, "step": 1283, "step_time": 17.236562619014876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.190023899078369, "epoch": 0.00642, "grad_norm": 0.06377778202295303, "kl": 0.22970592603087425, "learning_rate": 7.999855863915657e-06, "loss": -0.0769, "step": 1284, "step_time": 9.000151222993736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.818181991577148, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1992609798908234, "epoch": 0.006425, "frac_reward_zero_std": 0.0, "grad_norm": 0.048266686499118805, "kl": 0.19870489835739136, "learning_rate": 7.999855632837652e-06, "loss": -0.0855, "num_tokens": 17037939.0, "reward": -0.40709781646728516, "reward_std": 0.8537864685058594, "rewards/rollout_reward_func/mean": -0.40709781646728516, "rewards/rollout_reward_func/std": 0.8537864089012146, "sampling/importance_sampling_ratio/max": 1.084391474723816, "sampling/importance_sampling_ratio/mean": 0.4865838289260864, "sampling/importance_sampling_ratio/min": 6.561218469869345e-05, "sampling/sampling_logp_difference/max": 1.5889685153961182, "sampling/sampling_logp_difference/mean": 0.3563501834869385, "step": 1285, "step_time": 15.476674742996693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2030811607837677, "epoch": 0.00643, "grad_norm": 0.054812368005514145, "kl": 0.19245600327849388, "learning_rate": 7.99985540157457e-06, "loss": -0.0855, "step": 1286, "step_time": 6.188046829018276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 5.18181848526001, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0055824518203735, "epoch": 0.006435, "frac_reward_zero_std": 0.0, "grad_norm": 0.055962707847356796, "kl": 0.16504884883761406, "learning_rate": 7.99985517012641e-06, "loss": -0.0676, "num_tokens": 17065102.0, "reward": 0.8236581087112427, "reward_std": 1.3124990463256836, "rewards/rollout_reward_func/mean": 0.8236581087112427, "rewards/rollout_reward_func/std": 1.3124990463256836, "sampling/importance_sampling_ratio/max": 1.085992693901062, "sampling/importance_sampling_ratio/mean": 0.5919585227966309, "sampling/importance_sampling_ratio/min": 1.831629197113216e-05, "sampling/sampling_logp_difference/max": 1.930221676826477, "sampling/sampling_logp_difference/mean": 0.2801823616027832, "step": 1287, "step_time": 18.68689920201723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0099897980690002, "epoch": 0.00644, "grad_norm": 0.058263540267944336, "kl": 0.1585083305835724, "learning_rate": 7.999854938493168e-06, "loss": -0.0675, "step": 1288, "step_time": 7.026448852993781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6380278170108795, "epoch": 0.006445, "frac_reward_zero_std": 0.0, "grad_norm": 0.037570636719465256, "kl": 0.204185638576746, "learning_rate": 7.99985470667485e-06, "loss": -0.0982, "num_tokens": 17091351.0, "reward": 1.0187523365020752, "reward_std": 1.2431950569152832, "rewards/rollout_reward_func/mean": 1.0187523365020752, "rewards/rollout_reward_func/std": 1.2431951761245728, "sampling/importance_sampling_ratio/max": 1.0875968933105469, "sampling/importance_sampling_ratio/mean": 0.7179087996482849, "sampling/importance_sampling_ratio/min": 5.8428409829502925e-05, "sampling/sampling_logp_difference/max": 2.3063578605651855, "sampling/sampling_logp_difference/mean": 0.27728965878486633, "step": 1289, "step_time": 16.699292853983934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.638232596218586, "epoch": 0.00645, "grad_norm": 0.038072340190410614, "kl": 0.2028723508119583, "learning_rate": 7.999854474671453e-06, "loss": -0.0981, "step": 1290, "step_time": 6.365856043994427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4096423014998436, "epoch": 0.006455, "frac_reward_zero_std": 0.0, "grad_norm": 0.174085333943367, "kl": 0.43338071182370186, "learning_rate": 7.999854242482977e-06, "loss": -0.071, "num_tokens": 17123299.0, "reward": 0.4842880666255951, "reward_std": 1.2806282043457031, "rewards/rollout_reward_func/mean": 0.4842880666255951, "rewards/rollout_reward_func/std": 1.2806282043457031, "sampling/importance_sampling_ratio/max": 1.1380561590194702, "sampling/importance_sampling_ratio/mean": 0.6191482543945312, "sampling/importance_sampling_ratio/min": 3.9462445755589215e-08, "sampling/sampling_logp_difference/max": 1.9432673454284668, "sampling/sampling_logp_difference/mean": 0.49278801679611206, "step": 1291, "step_time": 15.850280173006468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.412227213382721, "epoch": 0.00646, "grad_norm": 0.1795257180929184, "kl": 0.4171073827892542, "learning_rate": 7.999854010109423e-06, "loss": -0.0713, "step": 1292, "step_time": 7.259351694985526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.375, "completions/mean_terminated_length": 6.363636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.368223696947098, "epoch": 0.006465, "frac_reward_zero_std": 0.0, "grad_norm": 0.07289346307516098, "kl": 0.33397459238767624, "learning_rate": 7.999853777550791e-06, "loss": -0.0812, "num_tokens": 17151807.0, "reward": 0.31937074661254883, "reward_std": 1.3432915210723877, "rewards/rollout_reward_func/mean": 0.31937074661254883, "rewards/rollout_reward_func/std": 1.3432916402816772, "sampling/importance_sampling_ratio/max": 1.209493637084961, "sampling/importance_sampling_ratio/mean": 0.5410425662994385, "sampling/importance_sampling_ratio/min": 2.264940803797799e-06, "sampling/sampling_logp_difference/max": 2.046306848526001, "sampling/sampling_logp_difference/mean": 0.45003849267959595, "step": 1293, "step_time": 16.114217450012802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.379268705844879, "epoch": 0.00647, "grad_norm": 0.0716785192489624, "kl": 0.3185281977057457, "learning_rate": 7.99985354480708e-06, "loss": -0.0813, "step": 1294, "step_time": 6.870078591979109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.234984964132309, "epoch": 0.006475, "frac_reward_zero_std": 0.0, "grad_norm": 0.10280241072177887, "kl": 0.21834486350417137, "learning_rate": 7.999853311878291e-06, "loss": -0.0602, "num_tokens": 17178134.0, "reward": 0.31095564365386963, "reward_std": 1.4141749143600464, "rewards/rollout_reward_func/mean": 0.31095564365386963, "rewards/rollout_reward_func/std": 1.4141749143600464, "sampling/importance_sampling_ratio/max": 1.1266366243362427, "sampling/importance_sampling_ratio/mean": 0.6142553687095642, "sampling/importance_sampling_ratio/min": 1.5801326753717149e-06, "sampling/sampling_logp_difference/max": 1.8851890563964844, "sampling/sampling_logp_difference/mean": 0.391665518283844, "step": 1295, "step_time": 12.474374075973174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2311404049396515, "epoch": 0.00648, "grad_norm": 0.10901065915822983, "kl": 0.21459580585360527, "learning_rate": 7.999853078764424e-06, "loss": -0.0605, "step": 1296, "step_time": 6.291231158989831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.171651393175125, "epoch": 0.006485, "frac_reward_zero_std": 0.0, "grad_norm": 0.05802203342318535, "kl": 0.20281976275146008, "learning_rate": 7.999852845465477e-06, "loss": -0.0826, "num_tokens": 17202969.0, "reward": 0.7585210204124451, "reward_std": 1.1609606742858887, "rewards/rollout_reward_func/mean": 0.7585210204124451, "rewards/rollout_reward_func/std": 1.1609606742858887, "sampling/importance_sampling_ratio/max": 1.0921626091003418, "sampling/importance_sampling_ratio/mean": 0.6314762830734253, "sampling/importance_sampling_ratio/min": 3.096007361591546e-08, "sampling/sampling_logp_difference/max": 2.240518093109131, "sampling/sampling_logp_difference/mean": 0.3400873839855194, "step": 1297, "step_time": 15.670416548004141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.165187567472458, "epoch": 0.00649, "grad_norm": 0.058585088700056076, "kl": 0.20142179541289806, "learning_rate": 7.999852611981453e-06, "loss": -0.0826, "step": 1298, "step_time": 7.468635574987275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 6.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9167485535144806, "epoch": 0.006495, "frac_reward_zero_std": 0.0, "grad_norm": 0.11426135897636414, "kl": 0.10124712949618697, "learning_rate": 7.99985237831235e-06, "loss": -0.0693, "num_tokens": 17235987.0, "reward": -0.26657092571258545, "reward_std": 0.90366131067276, "rewards/rollout_reward_func/mean": -0.26657092571258545, "rewards/rollout_reward_func/std": 0.90366131067276, "sampling/importance_sampling_ratio/max": 1.1717438697814941, "sampling/importance_sampling_ratio/mean": 0.23340308666229248, "sampling/importance_sampling_ratio/min": 0.00023817621695343405, "sampling/sampling_logp_difference/max": 1.9319355487823486, "sampling/sampling_logp_difference/mean": 0.40878134965896606, "step": 1299, "step_time": 17.627567611009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9077468514442444, "epoch": 0.0065, "grad_norm": 0.12120862305164337, "kl": 0.10004162229597569, "learning_rate": 7.999852144458169e-06, "loss": -0.0696, "step": 1300, "step_time": 6.911646041000495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9173027984797955, "epoch": 0.006505, "frac_reward_zero_std": 0.5, "grad_norm": 0.07961493730545044, "kl": 0.1965775303542614, "learning_rate": 7.99985191041891e-06, "loss": -0.0426, "num_tokens": 17264997.0, "reward": 1.0813417434692383, "reward_std": 1.1765775680541992, "rewards/rollout_reward_func/mean": 1.0813417434692383, "rewards/rollout_reward_func/std": 1.1765776872634888, "sampling/importance_sampling_ratio/max": 1.114737868309021, "sampling/importance_sampling_ratio/mean": 0.8254325985908508, "sampling/importance_sampling_ratio/min": 0.00197750236839056, "sampling/sampling_logp_difference/max": 1.3444701433181763, "sampling/sampling_logp_difference/mean": 0.13785384595394135, "step": 1301, "step_time": 19.59952342302131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9152246490120888, "epoch": 0.00651, "grad_norm": 0.07939023524522781, "kl": 0.1967659592628479, "learning_rate": 7.999851676194571e-06, "loss": -0.0428, "step": 1302, "step_time": 8.430332601026748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1563289910554886, "epoch": 0.006515, "frac_reward_zero_std": 0.0, "grad_norm": 0.112195685505867, "kl": 0.5365705415606499, "learning_rate": 7.999851441785154e-06, "loss": -0.0942, "num_tokens": 17282670.0, "reward": 0.29446345567703247, "reward_std": 1.3020079135894775, "rewards/rollout_reward_func/mean": 0.29446345567703247, "rewards/rollout_reward_func/std": 1.3020079135894775, "sampling/importance_sampling_ratio/max": 1.0942292213439941, "sampling/importance_sampling_ratio/mean": 0.7868927717208862, "sampling/importance_sampling_ratio/min": 8.486674232699443e-06, "sampling/sampling_logp_difference/max": 2.052058696746826, "sampling/sampling_logp_difference/mean": 0.24099214375019073, "step": 1303, "step_time": 6.191482120993896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.14333076775074, "epoch": 0.00652, "grad_norm": 0.10188423842191696, "kl": 0.5549028366804123, "learning_rate": 7.99985120719066e-06, "loss": -0.0945, "step": 1304, "step_time": 3.29680669499794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 5.363636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9031375348567963, "epoch": 0.006525, "frac_reward_zero_std": 0.0, "grad_norm": 0.038024965673685074, "kl": 0.1915879286825657, "learning_rate": 7.999850972411087e-06, "loss": -0.1153, "num_tokens": 17316586.0, "reward": 0.6678591966629028, "reward_std": 1.1873995065689087, "rewards/rollout_reward_func/mean": 0.6678591966629028, "rewards/rollout_reward_func/std": 1.1873996257781982, "sampling/importance_sampling_ratio/max": 1.1680954694747925, "sampling/importance_sampling_ratio/mean": 0.5987265110015869, "sampling/importance_sampling_ratio/min": 6.152978585305391e-06, "sampling/sampling_logp_difference/max": 2.3110179901123047, "sampling/sampling_logp_difference/mean": 0.32825061678886414, "step": 1305, "step_time": 19.436172913978226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.897058218717575, "epoch": 0.00653, "grad_norm": 0.035764601081609726, "kl": 0.19206715561449528, "learning_rate": 7.999850737446436e-06, "loss": -0.1155, "step": 1306, "step_time": 7.8125004040048225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.314754232764244, "epoch": 0.006535, "frac_reward_zero_std": 0.5, "grad_norm": 0.18650448322296143, "kl": 0.22073009610176086, "learning_rate": 7.999850502296707e-06, "loss": -0.028, "num_tokens": 17337243.0, "reward": 1.1579509973526, "reward_std": 1.2901397943496704, "rewards/rollout_reward_func/mean": 1.1579509973526, "rewards/rollout_reward_func/std": 1.2901397943496704, "sampling/importance_sampling_ratio/max": 1.0484683513641357, "sampling/importance_sampling_ratio/mean": 0.6575326323509216, "sampling/importance_sampling_ratio/min": 0.0010073073208332062, "sampling/sampling_logp_difference/max": 1.5154893398284912, "sampling/sampling_logp_difference/mean": 0.20701958239078522, "step": 1307, "step_time": 12.300376509985654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2756456136703491, "epoch": 0.00654, "grad_norm": 0.19683891534805298, "kl": 0.21787075698375702, "learning_rate": 7.999850266961899e-06, "loss": -0.0289, "step": 1308, "step_time": 6.011832378004328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 5.099999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.376383498311043, "epoch": 0.006545, "frac_reward_zero_std": 0.0, "grad_norm": 0.08958703279495239, "kl": 0.2575030140578747, "learning_rate": 7.999850031442013e-06, "loss": -0.0742, "num_tokens": 17369041.0, "reward": 0.04815131425857544, "reward_std": 1.2504172325134277, "rewards/rollout_reward_func/mean": 0.04815131425857544, "rewards/rollout_reward_func/std": 1.2504172325134277, "sampling/importance_sampling_ratio/max": 1.0700846910476685, "sampling/importance_sampling_ratio/mean": 0.5295262336730957, "sampling/importance_sampling_ratio/min": 2.540396963013336e-06, "sampling/sampling_logp_difference/max": 1.7391481399536133, "sampling/sampling_logp_difference/mean": 0.4172624349594116, "step": 1309, "step_time": 16.084841624018736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.368901051580906, "epoch": 0.00655, "grad_norm": 0.09250356256961823, "kl": 0.26414334401488304, "learning_rate": 7.999849795737049e-06, "loss": -0.0739, "step": 1310, "step_time": 6.9685175490012625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5931014120578766, "epoch": 0.006555, "frac_reward_zero_std": 0.0, "grad_norm": 0.05621355026960373, "kl": 0.5891161449253559, "learning_rate": 7.999849559847007e-06, "loss": -0.0716, "num_tokens": 17395173.0, "reward": -0.474456250667572, "reward_std": 0.9319693446159363, "rewards/rollout_reward_func/mean": -0.474456250667572, "rewards/rollout_reward_func/std": 0.931969404220581, "sampling/importance_sampling_ratio/max": 1.0922677516937256, "sampling/importance_sampling_ratio/mean": 0.7089844942092896, "sampling/importance_sampling_ratio/min": 3.851333531201817e-05, "sampling/sampling_logp_difference/max": 1.686381459236145, "sampling/sampling_logp_difference/mean": 0.280332088470459, "step": 1311, "step_time": 15.548218652009382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5839031040668488, "epoch": 0.00656, "grad_norm": 0.057306259870529175, "kl": 0.6137681528925896, "learning_rate": 7.999849323771886e-06, "loss": -0.0717, "step": 1312, "step_time": 7.343878433006466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 5.18181848526001, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.153827115893364, "epoch": 0.006565, "frac_reward_zero_std": 0.0, "grad_norm": 0.07134890556335449, "kl": 0.2541605867445469, "learning_rate": 7.999849087511688e-06, "loss": -0.0866, "num_tokens": 17425523.0, "reward": 0.7125239372253418, "reward_std": 1.3249062299728394, "rewards/rollout_reward_func/mean": 0.7125239372253418, "rewards/rollout_reward_func/std": 1.3249062299728394, "sampling/importance_sampling_ratio/max": 1.03733491897583, "sampling/importance_sampling_ratio/mean": 0.5227338075637817, "sampling/importance_sampling_ratio/min": 3.683188697323203e-05, "sampling/sampling_logp_difference/max": 2.1461617946624756, "sampling/sampling_logp_difference/mean": 0.34052789211273193, "step": 1313, "step_time": 14.754917120008031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007352941203862429, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "entropy": 2.1681249737739563, "epoch": 0.00657, "grad_norm": 0.04446674510836601, "kl": 0.24888485483825207, "learning_rate": 7.99984885106641e-06, "loss": -0.0869, "step": 1314, "step_time": 6.338809671011404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.1991561632603407, "epoch": 0.006575, "frac_reward_zero_std": 0.5, "grad_norm": 0.21531729400157928, "kl": 0.24702192470431328, "learning_rate": 7.999848614436057e-06, "loss": -0.0045, "num_tokens": 17445293.0, "reward": 1.7070598602294922, "reward_std": 0.3451944589614868, "rewards/rollout_reward_func/mean": 1.7070598602294922, "rewards/rollout_reward_func/std": 0.3451944887638092, "sampling/importance_sampling_ratio/max": 1.080895185470581, "sampling/importance_sampling_ratio/mean": 0.9984647035598755, "sampling/importance_sampling_ratio/min": 0.5643932819366455, "sampling/sampling_logp_difference/max": 0.4989117383956909, "sampling/sampling_logp_difference/mean": 0.020343123003840446, "step": 1315, "step_time": 8.971905911006615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.21203982084989548, "epoch": 0.00658, "grad_norm": 0.11467011272907257, "kl": 0.24780362099409103, "learning_rate": 7.999848377620622e-06, "loss": -0.0051, "step": 1316, "step_time": 4.987127420987235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.081419825553894, "epoch": 0.006585, "frac_reward_zero_std": 0.0, "grad_norm": 0.10635054856538773, "kl": 0.4829624481499195, "learning_rate": 7.999848140620111e-06, "loss": -0.0482, "num_tokens": 17471170.0, "reward": 0.9786912798881531, "reward_std": 1.1800596714019775, "rewards/rollout_reward_func/mean": 0.9786912798881531, "rewards/rollout_reward_func/std": 1.1800596714019775, "sampling/importance_sampling_ratio/max": 1.343119502067566, "sampling/importance_sampling_ratio/mean": 0.773434042930603, "sampling/importance_sampling_ratio/min": 9.418008266948164e-05, "sampling/sampling_logp_difference/max": 1.7250474691390991, "sampling/sampling_logp_difference/mean": 0.2302304208278656, "step": 1317, "step_time": 16.339963897989946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0868645161390305, "epoch": 0.00659, "grad_norm": 0.11282024532556534, "kl": 0.4804922230541706, "learning_rate": 7.999847903434523e-06, "loss": -0.0483, "step": 1318, "step_time": 7.231194938998669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.297478139400482, "epoch": 0.006595, "frac_reward_zero_std": 0.0, "grad_norm": 0.027804018929600716, "kl": 0.23567609675228596, "learning_rate": 7.999847666063854e-06, "loss": -0.0897, "num_tokens": 17506014.0, "reward": 0.2546285390853882, "reward_std": 1.1743972301483154, "rewards/rollout_reward_func/mean": 0.2546285390853882, "rewards/rollout_reward_func/std": 1.174397349357605, "sampling/importance_sampling_ratio/max": 1.0747754573822021, "sampling/importance_sampling_ratio/mean": 0.5288852453231812, "sampling/importance_sampling_ratio/min": 1.5127616279642098e-05, "sampling/sampling_logp_difference/max": 1.6319270133972168, "sampling/sampling_logp_difference/mean": 0.35613638162612915, "step": 1319, "step_time": 18.763294682008564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.304018586874008, "epoch": 0.0066, "grad_norm": 0.030654264613986015, "kl": 0.20840279757976532, "learning_rate": 7.99984742850811e-06, "loss": -0.0898, "step": 1320, "step_time": 7.848057002993301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 4.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9195680022239685, "epoch": 0.006605, "frac_reward_zero_std": 0.0, "grad_norm": 0.149104043841362, "kl": 0.22253646701574326, "learning_rate": 7.999847190767286e-06, "loss": -0.0601, "num_tokens": 17535768.0, "reward": -0.47121191024780273, "reward_std": 0.9000124931335449, "rewards/rollout_reward_func/mean": -0.47121191024780273, "rewards/rollout_reward_func/std": 0.9000125527381897, "sampling/importance_sampling_ratio/max": 1.0483542680740356, "sampling/importance_sampling_ratio/mean": 0.40156376361846924, "sampling/importance_sampling_ratio/min": 1.4527034863931476e-06, "sampling/sampling_logp_difference/max": 1.849100947380066, "sampling/sampling_logp_difference/mean": 0.4743043780326843, "step": 1321, "step_time": 16.178812244004803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9887264370918274, "epoch": 0.00661, "grad_norm": 0.1534685343503952, "kl": 0.22187890857458115, "learning_rate": 7.999846952841384e-06, "loss": -0.061, "step": 1322, "step_time": 6.857739219980431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6670834124088287, "epoch": 0.006615, "frac_reward_zero_std": 0.0, "grad_norm": 0.09271110594272614, "kl": 0.1865390930324793, "learning_rate": 7.999846714730406e-06, "loss": -0.039, "num_tokens": 17568596.0, "reward": -0.08745788037776947, "reward_std": 0.980173647403717, "rewards/rollout_reward_func/mean": -0.08745788037776947, "rewards/rollout_reward_func/std": 0.9801737070083618, "sampling/importance_sampling_ratio/max": 1.0708835124969482, "sampling/importance_sampling_ratio/mean": 0.7637994289398193, "sampling/importance_sampling_ratio/min": 9.660695923230378e-07, "sampling/sampling_logp_difference/max": 1.8540478944778442, "sampling/sampling_logp_difference/mean": 0.2992248237133026, "step": 1323, "step_time": 15.157865642977413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6912704780697823, "epoch": 0.00662, "grad_norm": 0.11702446639537811, "kl": 0.1852660644799471, "learning_rate": 7.999846476434347e-06, "loss": -0.0397, "step": 1324, "step_time": 6.929535201998078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 4.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.944659411907196, "epoch": 0.006625, "frac_reward_zero_std": 0.0, "grad_norm": 0.05570755526423454, "kl": 0.5515646524727345, "learning_rate": 7.999846237953212e-06, "loss": -0.075, "num_tokens": 17593190.0, "reward": 0.633860170841217, "reward_std": 1.3297315835952759, "rewards/rollout_reward_func/mean": 0.633860170841217, "rewards/rollout_reward_func/std": 1.3297317028045654, "sampling/importance_sampling_ratio/max": 1.1157219409942627, "sampling/importance_sampling_ratio/mean": 0.5880967974662781, "sampling/importance_sampling_ratio/min": 7.080930686242937e-07, "sampling/sampling_logp_difference/max": 1.7430474758148193, "sampling/sampling_logp_difference/mean": 0.36805835366249084, "step": 1325, "step_time": 16.370985032001045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9523207545280457, "epoch": 0.00663, "grad_norm": 0.058586087077856064, "kl": 0.5500326193869114, "learning_rate": 7.999845999286998e-06, "loss": -0.075, "step": 1326, "step_time": 6.9332192959846 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 4.099999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.8114302158355713, "epoch": 0.006635, "frac_reward_zero_std": 0.0, "grad_norm": 0.06860049813985825, "kl": 0.31041284231469035, "learning_rate": 7.999845760435706e-06, "loss": -0.0782, "num_tokens": 17625666.0, "reward": -0.27013468742370605, "reward_std": 1.267714500427246, "rewards/rollout_reward_func/mean": -0.27013468742370605, "rewards/rollout_reward_func/std": 1.267714500427246, "sampling/importance_sampling_ratio/max": 1.0562344789505005, "sampling/importance_sampling_ratio/mean": 0.43590182065963745, "sampling/importance_sampling_ratio/min": 2.2626391427138515e-08, "sampling/sampling_logp_difference/max": 2.163844108581543, "sampling/sampling_logp_difference/mean": 0.5088568925857544, "step": 1327, "step_time": 17.85791636099748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.83369380235672, "epoch": 0.00664, "grad_norm": 0.06658769398927689, "kl": 0.3246287638321519, "learning_rate": 7.999845521399337e-06, "loss": -0.0786, "step": 1328, "step_time": 7.529031707003014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4392167776823044, "epoch": 0.006645, "frac_reward_zero_std": 0.5, "grad_norm": 0.0580592043697834, "kl": 0.24713202938437462, "learning_rate": 7.99984528217789e-06, "loss": -0.0343, "num_tokens": 17643629.0, "reward": 1.782395601272583, "reward_std": 0.6853758692741394, "rewards/rollout_reward_func/mean": 1.782395601272583, "rewards/rollout_reward_func/std": 0.6853759288787842, "sampling/importance_sampling_ratio/max": 1.0411492586135864, "sampling/importance_sampling_ratio/mean": 0.92295241355896, "sampling/importance_sampling_ratio/min": 0.01058290433138609, "sampling/sampling_logp_difference/max": 0.8600554466247559, "sampling/sampling_logp_difference/mean": 0.06396124511957169, "step": 1329, "step_time": 9.758580441994127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4542626105248928, "epoch": 0.00665, "grad_norm": 0.05520978942513466, "kl": 0.24612611159682274, "learning_rate": 7.999845042771365e-06, "loss": -0.0341, "step": 1330, "step_time": 4.858599424987915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 7.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7862757444381714, "epoch": 0.006655, "frac_reward_zero_std": 0.0, "grad_norm": 0.03423348814249039, "kl": 0.32193753495812416, "learning_rate": 7.99984480317976e-06, "loss": -0.0742, "num_tokens": 17665366.0, "reward": 0.702775239944458, "reward_std": 1.4993627071380615, "rewards/rollout_reward_func/mean": 0.702775239944458, "rewards/rollout_reward_func/std": 1.499362826347351, "sampling/importance_sampling_ratio/max": 1.0312567949295044, "sampling/importance_sampling_ratio/mean": 0.510682225227356, "sampling/importance_sampling_ratio/min": 6.092439264193672e-08, "sampling/sampling_logp_difference/max": 2.199026584625244, "sampling/sampling_logp_difference/mean": 0.5284592509269714, "step": 1331, "step_time": 14.278916682989802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.787290632724762, "epoch": 0.00666, "grad_norm": 0.02912178263068199, "kl": 0.2975638844072819, "learning_rate": 7.99984456340308e-06, "loss": -0.0743, "step": 1332, "step_time": 6.624471836985322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.828286290168762, "epoch": 0.006665, "frac_reward_zero_std": 0.0, "grad_norm": 0.11985079199075699, "kl": 0.09342948906123638, "learning_rate": 7.999844323441321e-06, "loss": -0.1048, "num_tokens": 17698418.0, "reward": -0.022553324699401855, "reward_std": 1.1044330596923828, "rewards/rollout_reward_func/mean": -0.022553324699401855, "rewards/rollout_reward_func/std": 1.1044331789016724, "sampling/importance_sampling_ratio/max": 1.0862611532211304, "sampling/importance_sampling_ratio/mean": 0.39740321040153503, "sampling/importance_sampling_ratio/min": 7.253254921124608e-07, "sampling/sampling_logp_difference/max": 1.743070125579834, "sampling/sampling_logp_difference/mean": 0.4130955934524536, "step": 1333, "step_time": 17.818739355003345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8292612433433533, "epoch": 0.00667, "grad_norm": 0.11748238652944565, "kl": 0.09399369731545448, "learning_rate": 7.999844083294484e-06, "loss": -0.105, "step": 1334, "step_time": 6.928244636001182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.338573943823576, "epoch": 0.006675, "frac_reward_zero_std": 0.0, "grad_norm": 0.240078866481781, "kl": 0.38610414043068886, "learning_rate": 7.99984384296257e-06, "loss": -0.0738, "num_tokens": 17716489.0, "reward": 1.6110639572143555, "reward_std": 0.9766883254051208, "rewards/rollout_reward_func/mean": 1.6110639572143555, "rewards/rollout_reward_func/std": 0.9766883850097656, "sampling/importance_sampling_ratio/max": 1.0258510112762451, "sampling/importance_sampling_ratio/mean": 0.7372833490371704, "sampling/importance_sampling_ratio/min": 8.582728696637787e-07, "sampling/sampling_logp_difference/max": 1.9809246063232422, "sampling/sampling_logp_difference/mean": 0.2966993749141693, "step": 1335, "step_time": 9.863340803000028 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 1.3110096529126167, "epoch": 0.00668, "grad_norm": 0.2085789442062378, "kl": 0.39783161133527756, "learning_rate": 7.999843602445575e-06, "loss": -0.0744, "step": 1336, "step_time": 5.128451778989984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.625, "completions/mean_terminated_length": 6.000000476837158, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.6865764260292053, "epoch": 0.006685, "frac_reward_zero_std": 0.0, "grad_norm": 0.24482738971710205, "kl": 0.11929506529122591, "learning_rate": 7.999843361743504e-06, "loss": -0.0338, "num_tokens": 17751025.0, "reward": -0.3030086159706116, "reward_std": 0.9207515120506287, "rewards/rollout_reward_func/mean": -0.3030086159706116, "rewards/rollout_reward_func/std": 0.9207515120506287, "sampling/importance_sampling_ratio/max": 0.8781278133392334, "sampling/importance_sampling_ratio/mean": 0.10683619230985641, "sampling/importance_sampling_ratio/min": 2.385211246291874e-07, "sampling/sampling_logp_difference/max": 1.8878902196884155, "sampling/sampling_logp_difference/mean": 0.5331518650054932, "step": 1337, "step_time": 18.461264181969455 }, { "clip_ratio/high_max": 0.07500000018626451, "clip_ratio/high_mean": 0.03750000009313226, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03750000009313226, "entropy": 3.6598448157310486, "epoch": 0.00669, "grad_norm": 0.12427155673503876, "kl": 0.10304322466254234, "learning_rate": 7.999843120856359e-06, "loss": -0.0356, "step": 1338, "step_time": 7.221834310010308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 5.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.080152874812484, "epoch": 0.006695, "frac_reward_zero_std": 0.0, "grad_norm": 0.10218437016010284, "kl": 0.5847746431827545, "learning_rate": 7.999842879784131e-06, "loss": -0.0796, "num_tokens": 17778681.0, "reward": 0.8278708457946777, "reward_std": 1.332484245300293, "rewards/rollout_reward_func/mean": 0.8278708457946777, "rewards/rollout_reward_func/std": 1.332484245300293, "sampling/importance_sampling_ratio/max": 1.064875602722168, "sampling/importance_sampling_ratio/mean": 0.656886637210846, "sampling/importance_sampling_ratio/min": 1.5598905633851246e-07, "sampling/sampling_logp_difference/max": 2.196051597595215, "sampling/sampling_logp_difference/mean": 0.44100984930992126, "step": 1339, "step_time": 16.57942340100999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.065569080412388, "epoch": 0.0067, "grad_norm": 0.1049494594335556, "kl": 0.6098682545125484, "learning_rate": 7.999842638526828e-06, "loss": -0.0793, "step": 1340, "step_time": 7.060913486988284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.494986981153488, "epoch": 0.006705, "frac_reward_zero_std": 0.0, "grad_norm": 0.10759598016738892, "kl": 0.22028033807873726, "learning_rate": 7.999842397084445e-06, "loss": -0.0696, "num_tokens": 17808509.0, "reward": -0.06630590558052063, "reward_std": 1.1228530406951904, "rewards/rollout_reward_func/mean": -0.06630590558052063, "rewards/rollout_reward_func/std": 1.1228530406951904, "sampling/importance_sampling_ratio/max": 1.0579118728637695, "sampling/importance_sampling_ratio/mean": 0.5015223026275635, "sampling/importance_sampling_ratio/min": 8.299568889924558e-07, "sampling/sampling_logp_difference/max": 2.0723161697387695, "sampling/sampling_logp_difference/mean": 0.39745867252349854, "step": 1341, "step_time": 14.94763320099446 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.46552973985672, "epoch": 0.00671, "grad_norm": 0.11729452759027481, "kl": 0.2184775285422802, "learning_rate": 7.999842155456986e-06, "loss": -0.0695, "step": 1342, "step_time": 7.148986693980987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 4.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9875843226909637, "epoch": 0.006715, "frac_reward_zero_std": 0.0, "grad_norm": 0.030300935730338097, "kl": 0.2206433340907097, "learning_rate": 7.99984191364445e-06, "loss": -0.0714, "num_tokens": 17844265.0, "reward": 0.16570690274238586, "reward_std": 1.112540602684021, "rewards/rollout_reward_func/mean": 0.16570690274238586, "rewards/rollout_reward_func/std": 1.112540602684021, "sampling/importance_sampling_ratio/max": 1.2310292720794678, "sampling/importance_sampling_ratio/mean": 0.5922708511352539, "sampling/importance_sampling_ratio/min": 2.728477011260111e-05, "sampling/sampling_logp_difference/max": 2.4174065589904785, "sampling/sampling_logp_difference/mean": 0.34961336851119995, "step": 1343, "step_time": 19.171430605973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9905830323696136, "epoch": 0.00672, "grad_norm": 0.028677303344011307, "kl": 0.22102881968021393, "learning_rate": 7.999841671646833e-06, "loss": -0.0713, "step": 1344, "step_time": 7.992430690996116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7664123997092247, "epoch": 0.006725, "frac_reward_zero_std": 0.0, "grad_norm": 0.131189227104187, "kl": 0.714882668107748, "learning_rate": 7.999841429464142e-06, "loss": -0.0769, "num_tokens": 17865364.0, "reward": 0.719342052936554, "reward_std": 1.3913631439208984, "rewards/rollout_reward_func/mean": 0.719342052936554, "rewards/rollout_reward_func/std": 1.3913631439208984, "sampling/importance_sampling_ratio/max": 1.1168622970581055, "sampling/importance_sampling_ratio/mean": 0.6632947325706482, "sampling/importance_sampling_ratio/min": 1.1713659660017584e-06, "sampling/sampling_logp_difference/max": 2.1541452407836914, "sampling/sampling_logp_difference/mean": 0.32833367586135864, "step": 1345, "step_time": 12.732826152001508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7678108513355255, "epoch": 0.00673, "grad_norm": 0.12677007913589478, "kl": 0.6481539830565453, "learning_rate": 7.999841187096373e-06, "loss": -0.0773, "step": 1346, "step_time": 5.360915096011013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 5.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8883287012577057, "epoch": 0.006735, "frac_reward_zero_std": 0.0, "grad_norm": 0.05120697245001793, "kl": 0.21029169484972954, "learning_rate": 7.999840944543524e-06, "loss": -0.095, "num_tokens": 17897330.0, "reward": 0.3067620098590851, "reward_std": 1.1879197359085083, "rewards/rollout_reward_func/mean": 0.3067620098590851, "rewards/rollout_reward_func/std": 1.1879197359085083, "sampling/importance_sampling_ratio/max": 1.1180061101913452, "sampling/importance_sampling_ratio/mean": 0.6511277556419373, "sampling/importance_sampling_ratio/min": 0.0004493810993153602, "sampling/sampling_logp_difference/max": 1.8705244064331055, "sampling/sampling_logp_difference/mean": 0.32329392433166504, "step": 1347, "step_time": 18.107890899991617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8841806054115295, "epoch": 0.00674, "grad_norm": 0.03539058566093445, "kl": 0.2112856935709715, "learning_rate": 7.9998407018056e-06, "loss": -0.0951, "step": 1348, "step_time": 8.193183076014975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.571783073246479, "epoch": 0.006745, "frac_reward_zero_std": 0.0, "grad_norm": 0.09250695258378983, "kl": 0.37799597159028053, "learning_rate": 7.999840458882597e-06, "loss": -0.0746, "num_tokens": 17925343.0, "reward": 0.3222734034061432, "reward_std": 0.9523821473121643, "rewards/rollout_reward_func/mean": 0.3222734034061432, "rewards/rollout_reward_func/std": 0.9523820877075195, "sampling/importance_sampling_ratio/max": 1.0567506551742554, "sampling/importance_sampling_ratio/mean": 0.6448124647140503, "sampling/importance_sampling_ratio/min": 3.4816504921764135e-05, "sampling/sampling_logp_difference/max": 2.2627272605895996, "sampling/sampling_logp_difference/mean": 0.2732219994068146, "step": 1349, "step_time": 16.22597212801338 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.5511172264814377, "epoch": 0.00675, "grad_norm": 0.06525716185569763, "kl": 0.3827589228749275, "learning_rate": 7.999840215774515e-06, "loss": -0.0748, "step": 1350, "step_time": 6.49862094897253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.600000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6228247527033091, "epoch": 0.006755, "frac_reward_zero_std": 0.0, "grad_norm": 0.02392483316361904, "kl": 0.16692401096224785, "learning_rate": 7.999839972481357e-06, "loss": -0.073, "num_tokens": 17944064.0, "reward": 1.2013286352157593, "reward_std": 1.099095106124878, "rewards/rollout_reward_func/mean": 1.2013286352157593, "rewards/rollout_reward_func/std": 1.0990952253341675, "sampling/importance_sampling_ratio/max": 1.0269488096237183, "sampling/importance_sampling_ratio/mean": 0.8927499055862427, "sampling/importance_sampling_ratio/min": 0.0030738883651793003, "sampling/sampling_logp_difference/max": 1.0029563903808594, "sampling/sampling_logp_difference/mean": 0.10881608724594116, "step": 1351, "step_time": 9.730022522984655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6228189039975405, "epoch": 0.00676, "grad_norm": 0.024154679849743843, "kl": 0.1675935685634613, "learning_rate": 7.99983972900312e-06, "loss": -0.073, "step": 1352, "step_time": 5.458217656007037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2808560132980347, "epoch": 0.006765, "frac_reward_zero_std": 0.0, "grad_norm": 0.2709345519542694, "kl": 0.2347213588654995, "learning_rate": 7.999839485339807e-06, "loss": -0.104, "num_tokens": 17978874.0, "reward": 0.5646815299987793, "reward_std": 1.2377185821533203, "rewards/rollout_reward_func/mean": 0.5646815299987793, "rewards/rollout_reward_func/std": 1.2377185821533203, "sampling/importance_sampling_ratio/max": 1.1894463300704956, "sampling/importance_sampling_ratio/mean": 0.6282301545143127, "sampling/importance_sampling_ratio/min": 2.1295469196047634e-05, "sampling/sampling_logp_difference/max": 1.4418749809265137, "sampling/sampling_logp_difference/mean": 0.37033596634864807, "step": 1353, "step_time": 18.63448512100149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2716978192329407, "epoch": 0.00677, "grad_norm": 0.20336177945137024, "kl": 0.23626411333680153, "learning_rate": 7.999839241491415e-06, "loss": -0.1057, "step": 1354, "step_time": 6.9200838030083105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.074180483818054, "epoch": 0.006775, "frac_reward_zero_std": 0.0, "grad_norm": 0.04131563380360603, "kl": 0.18146351538598537, "learning_rate": 7.999838997457947e-06, "loss": -0.0897, "num_tokens": 18007545.0, "reward": -0.30966198444366455, "reward_std": 1.1889564990997314, "rewards/rollout_reward_func/mean": -0.30966198444366455, "rewards/rollout_reward_func/std": 1.1889564990997314, "sampling/importance_sampling_ratio/max": 1.0674813985824585, "sampling/importance_sampling_ratio/mean": 0.32174786925315857, "sampling/importance_sampling_ratio/min": 6.017350528964016e-08, "sampling/sampling_logp_difference/max": 2.1870670318603516, "sampling/sampling_logp_difference/mean": 0.5043544769287109, "step": 1355, "step_time": 15.141085085982922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0717700123786926, "epoch": 0.00678, "grad_norm": 0.045516662299633026, "kl": 0.180947445333004, "learning_rate": 7.999838753239401e-06, "loss": -0.0896, "step": 1356, "step_time": 6.363659216003725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0315035581588745, "epoch": 0.006785, "frac_reward_zero_std": 0.5, "grad_norm": 0.07897519320249557, "kl": 0.25018611550331116, "learning_rate": 7.999838508835776e-06, "loss": -0.039, "num_tokens": 18033036.0, "reward": 1.2560405731201172, "reward_std": 1.149194359779358, "rewards/rollout_reward_func/mean": 1.2560405731201172, "rewards/rollout_reward_func/std": 1.149194359779358, "sampling/importance_sampling_ratio/max": 1.1765480041503906, "sampling/importance_sampling_ratio/mean": 0.7594804763793945, "sampling/importance_sampling_ratio/min": 0.0008454967173747718, "sampling/sampling_logp_difference/max": 1.0955379009246826, "sampling/sampling_logp_difference/mean": 0.1419430375099182, "step": 1357, "step_time": 16.649391974016908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.026388643309474, "epoch": 0.00679, "grad_norm": 0.06950822472572327, "kl": 0.2478979378938675, "learning_rate": 7.999838264247076e-06, "loss": -0.0394, "step": 1358, "step_time": 7.037046619996545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4933217763900757, "epoch": 0.006795, "frac_reward_zero_std": 0.0, "grad_norm": 0.15894664824008942, "kl": 0.5371606852859259, "learning_rate": 7.999838019473297e-06, "loss": -0.055, "num_tokens": 18065602.0, "reward": -0.2614745795726776, "reward_std": 1.0529357194900513, "rewards/rollout_reward_func/mean": -0.2614745795726776, "rewards/rollout_reward_func/std": 1.0529358386993408, "sampling/importance_sampling_ratio/max": 1.157448410987854, "sampling/importance_sampling_ratio/mean": 0.42165127396583557, "sampling/importance_sampling_ratio/min": 3.560933328117244e-05, "sampling/sampling_logp_difference/max": 1.8886401653289795, "sampling/sampling_logp_difference/mean": 0.4220159649848938, "step": 1359, "step_time": 17.652276548018563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.476569890975952, "epoch": 0.0068, "grad_norm": 0.13412441313266754, "kl": 0.46637412533164024, "learning_rate": 7.99983777451444e-06, "loss": -0.0553, "step": 1360, "step_time": 6.94686357997125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 6.083333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2680190801620483, "epoch": 0.006805, "frac_reward_zero_std": 0.0, "grad_norm": 0.3602352738380432, "kl": 2.1211394481360912, "learning_rate": 7.999837529370507e-06, "loss": -0.074, "num_tokens": 18096206.0, "reward": 0.7032792568206787, "reward_std": 1.2778210639953613, "rewards/rollout_reward_func/mean": 0.7032792568206787, "rewards/rollout_reward_func/std": 1.2778210639953613, "sampling/importance_sampling_ratio/max": 1.0436632633209229, "sampling/importance_sampling_ratio/mean": 0.5736443996429443, "sampling/importance_sampling_ratio/min": 2.4152075184247224e-06, "sampling/sampling_logp_difference/max": 1.965389370918274, "sampling/sampling_logp_difference/mean": 0.3943885564804077, "step": 1361, "step_time": 16.833149989004596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.270521581172943, "epoch": 0.00681, "grad_norm": 0.28499430418014526, "kl": 1.6875194134190679, "learning_rate": 7.999837284041495e-06, "loss": -0.0771, "step": 1362, "step_time": 6.865006555992295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.1875, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.053442656993866, "epoch": 0.006815, "frac_reward_zero_std": 0.0, "grad_norm": 0.20198582112789154, "kl": 0.13045807741582394, "learning_rate": 7.999837038527408e-06, "loss": -0.0605, "num_tokens": 18131193.0, "reward": -0.38799989223480225, "reward_std": 0.9329397082328796, "rewards/rollout_reward_func/mean": -0.38799989223480225, "rewards/rollout_reward_func/std": 0.9329397678375244, "sampling/importance_sampling_ratio/max": 1.0968372821807861, "sampling/importance_sampling_ratio/mean": 0.3564281463623047, "sampling/importance_sampling_ratio/min": 4.786893015307214e-08, "sampling/sampling_logp_difference/max": 1.8124864101409912, "sampling/sampling_logp_difference/mean": 0.46121615171432495, "step": 1363, "step_time": 19.59197779498936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.064710855484009, "epoch": 0.00682, "grad_norm": 0.19297951459884644, "kl": 0.1317305639386177, "learning_rate": 7.99983679282824e-06, "loss": -0.061, "step": 1364, "step_time": 6.933439120999537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.822967529296875, "epoch": 0.006825, "frac_reward_zero_std": 0.5, "grad_norm": 0.16610002517700195, "kl": 0.1441338062286377, "learning_rate": 7.999836546943998e-06, "loss": -0.0512, "num_tokens": 18155869.0, "reward": 1.0549439191818237, "reward_std": 1.2244292497634888, "rewards/rollout_reward_func/mean": 1.0549439191818237, "rewards/rollout_reward_func/std": 1.2244292497634888, "sampling/importance_sampling_ratio/max": 1.0272789001464844, "sampling/importance_sampling_ratio/mean": 0.6836214065551758, "sampling/importance_sampling_ratio/min": 6.087035444579669e-07, "sampling/sampling_logp_difference/max": 1.9324908256530762, "sampling/sampling_logp_difference/mean": 0.272090882062912, "step": 1365, "step_time": 15.643159080995247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8194702565670013, "epoch": 0.00683, "grad_norm": 0.16470515727996826, "kl": 0.1433651428669691, "learning_rate": 7.999836300874676e-06, "loss": -0.0515, "step": 1366, "step_time": 5.970798138994724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9833142161369324, "epoch": 0.006835, "frac_reward_zero_std": 0.0, "grad_norm": 0.20506654679775238, "kl": 0.1451786644756794, "learning_rate": 7.999836054620279e-06, "loss": -0.0847, "num_tokens": 18186418.0, "reward": -0.08558693528175354, "reward_std": 1.1691884994506836, "rewards/rollout_reward_func/mean": -0.08558693528175354, "rewards/rollout_reward_func/std": 1.1691884994506836, "sampling/importance_sampling_ratio/max": 1.0360037088394165, "sampling/importance_sampling_ratio/mean": 0.44716352224349976, "sampling/importance_sampling_ratio/min": 4.866074210241322e-10, "sampling/sampling_logp_difference/max": 2.3418989181518555, "sampling/sampling_logp_difference/mean": 0.5673446655273438, "step": 1367, "step_time": 17.167982387007214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9878461956977844, "epoch": 0.00684, "grad_norm": 0.19662052392959595, "kl": 0.1447743922472, "learning_rate": 7.999835808180803e-06, "loss": -0.085, "step": 1368, "step_time": 7.541606262995629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.010033994913101, "epoch": 0.006845, "frac_reward_zero_std": 0.0, "grad_norm": 0.12524068355560303, "kl": 0.4400589242577553, "learning_rate": 7.99983556155625e-06, "loss": -0.0956, "num_tokens": 18209603.0, "reward": 0.9213575124740601, "reward_std": 1.3170382976531982, "rewards/rollout_reward_func/mean": 0.9213575124740601, "rewards/rollout_reward_func/std": 1.3170382976531982, "sampling/importance_sampling_ratio/max": 1.195794939994812, "sampling/importance_sampling_ratio/mean": 0.6633983254432678, "sampling/importance_sampling_ratio/min": 1.7675091612545657e-06, "sampling/sampling_logp_difference/max": 1.920708179473877, "sampling/sampling_logp_difference/mean": 0.3192882537841797, "step": 1369, "step_time": 14.756081499013817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0067873001098633, "epoch": 0.00685, "grad_norm": 0.12476561963558197, "kl": 0.4400915652513504, "learning_rate": 7.99983531474662e-06, "loss": -0.0956, "step": 1370, "step_time": 5.632068316990626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5601446479558945, "epoch": 0.006855, "frac_reward_zero_std": 0.0, "grad_norm": 0.24482084810733795, "kl": 0.8984644711017609, "learning_rate": 7.999835067751912e-06, "loss": -0.0733, "num_tokens": 18237368.0, "reward": 1.057657241821289, "reward_std": 1.2497751712799072, "rewards/rollout_reward_func/mean": 1.057657241821289, "rewards/rollout_reward_func/std": 1.2497751712799072, "sampling/importance_sampling_ratio/max": 1.05018150806427, "sampling/importance_sampling_ratio/mean": 0.6217069625854492, "sampling/importance_sampling_ratio/min": 7.041199978630175e-07, "sampling/sampling_logp_difference/max": 2.0352721214294434, "sampling/sampling_logp_difference/mean": 0.3972274661064148, "step": 1371, "step_time": 17.621010944989393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.5644701439887285, "epoch": 0.00686, "grad_norm": 0.16277633607387543, "kl": 0.80373465269804, "learning_rate": 7.999834820572128e-06, "loss": -0.0741, "step": 1372, "step_time": 8.248178622990963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9572428464889526, "epoch": 0.006865, "frac_reward_zero_std": 0.0, "grad_norm": 0.06444841623306274, "kl": 0.4856326412409544, "learning_rate": 7.999834573207266e-06, "loss": -0.0756, "num_tokens": 18270224.0, "reward": 0.6923489570617676, "reward_std": 1.2642385959625244, "rewards/rollout_reward_func/mean": 0.6923489570617676, "rewards/rollout_reward_func/std": 1.2642385959625244, "sampling/importance_sampling_ratio/max": 1.099092960357666, "sampling/importance_sampling_ratio/mean": 0.6504306197166443, "sampling/importance_sampling_ratio/min": 1.2531180004771159e-07, "sampling/sampling_logp_difference/max": 2.6676995754241943, "sampling/sampling_logp_difference/mean": 0.40372759103775024, "step": 1373, "step_time": 17.47754591400735 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.9614920914173126, "epoch": 0.00687, "grad_norm": 0.05122916027903557, "kl": 0.4284590035676956, "learning_rate": 7.999834325657326e-06, "loss": -0.0758, "step": 1374, "step_time": 6.918433950006147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.369463711977005, "epoch": 0.006875, "frac_reward_zero_std": 0.0, "grad_norm": 0.08085360378026962, "kl": 0.4015170969069004, "learning_rate": 7.999834077922309e-06, "loss": -0.079, "num_tokens": 18298117.0, "reward": -0.09692218154668808, "reward_std": 0.8956209421157837, "rewards/rollout_reward_func/mean": -0.09692218154668808, "rewards/rollout_reward_func/std": 0.8956209421157837, "sampling/importance_sampling_ratio/max": 1.0571943521499634, "sampling/importance_sampling_ratio/mean": 0.7482897043228149, "sampling/importance_sampling_ratio/min": 4.739751489069022e-07, "sampling/sampling_logp_difference/max": 2.233332633972168, "sampling/sampling_logp_difference/mean": 0.2559569180011749, "step": 1375, "step_time": 15.391532979003387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3878367692232132, "epoch": 0.00688, "grad_norm": 0.07555007934570312, "kl": 0.35776349157094955, "learning_rate": 7.999833830002214e-06, "loss": -0.0794, "step": 1376, "step_time": 7.111936382993008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 4.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.328248765319586, "epoch": 0.006885, "frac_reward_zero_std": 0.0, "grad_norm": 0.055176667869091034, "kl": 0.22514118999242783, "learning_rate": 7.999833581897045e-06, "loss": -0.1063, "num_tokens": 18331347.0, "reward": 0.4518990218639374, "reward_std": 1.1718072891235352, "rewards/rollout_reward_func/mean": 0.4518990218639374, "rewards/rollout_reward_func/std": 1.1718072891235352, "sampling/importance_sampling_ratio/max": 1.2089753150939941, "sampling/importance_sampling_ratio/mean": 0.5466164350509644, "sampling/importance_sampling_ratio/min": 6.1354530771495774e-06, "sampling/sampling_logp_difference/max": 1.9900233745574951, "sampling/sampling_logp_difference/mean": 0.4373987913131714, "step": 1377, "step_time": 17.041824657993857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3289203234016895, "epoch": 0.00689, "grad_norm": 0.056327614933252335, "kl": 0.21650074049830437, "learning_rate": 7.999833333606797e-06, "loss": -0.1063, "step": 1378, "step_time": 6.964056466997135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.166358917951584, "epoch": 0.006895, "frac_reward_zero_std": 0.0, "grad_norm": 0.07761669158935547, "kl": 0.17675810493528843, "learning_rate": 7.999833085131471e-06, "loss": -0.1019, "num_tokens": 18363822.0, "reward": 0.3033963739871979, "reward_std": 1.2684636116027832, "rewards/rollout_reward_func/mean": 0.3033963739871979, "rewards/rollout_reward_func/std": 1.2684636116027832, "sampling/importance_sampling_ratio/max": 1.1884419918060303, "sampling/importance_sampling_ratio/mean": 0.4779936373233795, "sampling/importance_sampling_ratio/min": 3.474361420785499e-08, "sampling/sampling_logp_difference/max": 2.2205967903137207, "sampling/sampling_logp_difference/mean": 0.5102303624153137, "step": 1379, "step_time": 16.726153106981656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1771362125873566, "epoch": 0.0069, "grad_norm": 0.08082396537065506, "kl": 0.1738332398235798, "learning_rate": 7.999832836471068e-06, "loss": -0.1017, "step": 1380, "step_time": 6.891714967016014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4396848492324352, "epoch": 0.006905, "frac_reward_zero_std": 0.0, "grad_norm": 0.12106131762266159, "kl": 0.2878483086824417, "learning_rate": 7.999832587625589e-06, "loss": -0.0856, "num_tokens": 18383418.0, "reward": 1.4085874557495117, "reward_std": 1.124900221824646, "rewards/rollout_reward_func/mean": 1.4085874557495117, "rewards/rollout_reward_func/std": 1.1249003410339355, "sampling/importance_sampling_ratio/max": 1.1277021169662476, "sampling/importance_sampling_ratio/mean": 0.786071240901947, "sampling/importance_sampling_ratio/min": 1.3879175639885943e-06, "sampling/sampling_logp_difference/max": 1.5814666748046875, "sampling/sampling_logp_difference/mean": 0.31338614225387573, "step": 1381, "step_time": 11.092873619010788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4450354985892773, "epoch": 0.00691, "grad_norm": 0.10945619642734528, "kl": 0.28882045298814774, "learning_rate": 7.99983233859503e-06, "loss": -0.0857, "step": 1382, "step_time": 5.095957842000644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2131920754909515, "epoch": 0.006915, "frac_reward_zero_std": 0.5, "grad_norm": 0.013851255178451538, "kl": 0.26285067573189735, "learning_rate": 7.999832089379398e-06, "loss": -0.0555, "num_tokens": 18408086.0, "reward": 0.9701064825057983, "reward_std": 1.3929002285003662, "rewards/rollout_reward_func/mean": 0.9701064825057983, "rewards/rollout_reward_func/std": 1.3929003477096558, "sampling/importance_sampling_ratio/max": 1.0721229314804077, "sampling/importance_sampling_ratio/mean": 0.7085354328155518, "sampling/importance_sampling_ratio/min": 6.184337070225254e-11, "sampling/sampling_logp_difference/max": 2.1796140670776367, "sampling/sampling_logp_difference/mean": 0.3578079640865326, "step": 1383, "step_time": 16.113560668978607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.203631281852722, "epoch": 0.00692, "grad_norm": 0.014815696515142918, "kl": 0.2671169713139534, "learning_rate": 7.999831839978686e-06, "loss": -0.0555, "step": 1384, "step_time": 7.342316980008036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4560585618019104, "epoch": 0.006925, "frac_reward_zero_std": 0.0, "grad_norm": 0.05452530458569527, "kl": 0.4550937619060278, "learning_rate": 7.999831590392898e-06, "loss": -0.092, "num_tokens": 18441362.0, "reward": 0.15878000855445862, "reward_std": 1.298377513885498, "rewards/rollout_reward_func/mean": 0.15878000855445862, "rewards/rollout_reward_func/std": 1.298377513885498, "sampling/importance_sampling_ratio/max": 1.0488239526748657, "sampling/importance_sampling_ratio/mean": 0.45760637521743774, "sampling/importance_sampling_ratio/min": 9.284755651606247e-05, "sampling/sampling_logp_difference/max": 1.7510758638381958, "sampling/sampling_logp_difference/mean": 0.36750274896621704, "step": 1385, "step_time": 19.67650207101542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4531275033950806, "epoch": 0.00693, "grad_norm": 0.0563616119325161, "kl": 0.40216026455163956, "learning_rate": 7.999831340622032e-06, "loss": -0.0921, "step": 1386, "step_time": 7.562454156999593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.455667570233345, "epoch": 0.006935, "frac_reward_zero_std": 0.0, "grad_norm": 0.2149190753698349, "kl": 0.5309103652834892, "learning_rate": 7.99983109066609e-06, "loss": -0.0121, "num_tokens": 18472777.0, "reward": 0.07614171504974365, "reward_std": 1.1393311023712158, "rewards/rollout_reward_func/mean": 0.07614171504974365, "rewards/rollout_reward_func/std": 1.1393312215805054, "sampling/importance_sampling_ratio/max": 1.1029937267303467, "sampling/importance_sampling_ratio/mean": 0.6968955993652344, "sampling/importance_sampling_ratio/min": 2.455883259244729e-05, "sampling/sampling_logp_difference/max": 1.6376152038574219, "sampling/sampling_logp_difference/mean": 0.23057466745376587, "step": 1387, "step_time": 15.972875380990445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.456946462392807, "epoch": 0.00694, "grad_norm": 0.20760026574134827, "kl": 0.5024054497480392, "learning_rate": 7.99983084052507e-06, "loss": -0.0128, "step": 1388, "step_time": 7.392313233023742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0228012800216675, "epoch": 0.006945, "frac_reward_zero_std": 0.0, "grad_norm": 0.17030024528503418, "kl": 0.3792194463312626, "learning_rate": 7.999830590198974e-06, "loss": -0.0198, "num_tokens": 18504896.0, "reward": -0.24552634358406067, "reward_std": 0.8201720118522644, "rewards/rollout_reward_func/mean": -0.24552634358406067, "rewards/rollout_reward_func/std": 0.8201720118522644, "sampling/importance_sampling_ratio/max": 1.156201720237732, "sampling/importance_sampling_ratio/mean": 0.653622567653656, "sampling/importance_sampling_ratio/min": 8.024040653253905e-06, "sampling/sampling_logp_difference/max": 2.1498115062713623, "sampling/sampling_logp_difference/mean": 0.3058586120605469, "step": 1389, "step_time": 14.251064886018867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0341133177280426, "epoch": 0.00695, "grad_norm": 0.154511496424675, "kl": 0.3841978833079338, "learning_rate": 7.999830339687801e-06, "loss": -0.0207, "step": 1390, "step_time": 6.888390357984463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0558114163577557, "epoch": 0.006955, "frac_reward_zero_std": 0.0, "grad_norm": 0.04503186419606209, "kl": 0.26731136068701744, "learning_rate": 7.99983008899155e-06, "loss": -0.0726, "num_tokens": 18538299.0, "reward": 0.7945674657821655, "reward_std": 1.1696147918701172, "rewards/rollout_reward_func/mean": 0.7945674657821655, "rewards/rollout_reward_func/std": 1.1696149110794067, "sampling/importance_sampling_ratio/max": 1.4739958047866821, "sampling/importance_sampling_ratio/mean": 0.6867108345031738, "sampling/importance_sampling_ratio/min": 8.175288002121306e-08, "sampling/sampling_logp_difference/max": 1.9747428894042969, "sampling/sampling_logp_difference/mean": 0.44636017084121704, "step": 1391, "step_time": 16.91111254702264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.063531920313835, "epoch": 0.00696, "grad_norm": 0.04944785311818123, "kl": 0.26623788848519325, "learning_rate": 7.999829838110223e-06, "loss": -0.0724, "step": 1392, "step_time": 6.9319888829923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.707305669784546, "epoch": 0.006965, "frac_reward_zero_std": 0.0, "grad_norm": 0.07447847723960876, "kl": 0.4723961278796196, "learning_rate": 7.999829587043818e-06, "loss": -0.0511, "num_tokens": 18560553.0, "reward": 0.6444873809814453, "reward_std": 1.4991424083709717, "rewards/rollout_reward_func/mean": 0.6444873809814453, "rewards/rollout_reward_func/std": 1.4991422891616821, "sampling/importance_sampling_ratio/max": 1.1003804206848145, "sampling/importance_sampling_ratio/mean": 0.40735194087028503, "sampling/importance_sampling_ratio/min": 6.0306092564133e-08, "sampling/sampling_logp_difference/max": 2.092458963394165, "sampling/sampling_logp_difference/mean": 0.4996272921562195, "step": 1393, "step_time": 13.939963430995704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7094348669052124, "epoch": 0.00697, "grad_norm": 0.07773245126008987, "kl": 0.48031984455883503, "learning_rate": 7.999829335792336e-06, "loss": -0.051, "step": 1394, "step_time": 5.803548946016235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 6.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6535226106643677, "epoch": 0.006975, "frac_reward_zero_std": 0.5, "grad_norm": 0.19432878494262695, "kl": 0.14397036656737328, "learning_rate": 7.999829084355778e-06, "loss": -0.0295, "num_tokens": 18581985.0, "reward": 1.0187397003173828, "reward_std": 1.3570867776870728, "rewards/rollout_reward_func/mean": 1.0187397003173828, "rewards/rollout_reward_func/std": 1.3570867776870728, "sampling/importance_sampling_ratio/max": 1.0550012588500977, "sampling/importance_sampling_ratio/mean": 0.6277254223823547, "sampling/importance_sampling_ratio/min": 0.00010695956007111818, "sampling/sampling_logp_difference/max": 1.9355517625808716, "sampling/sampling_logp_difference/mean": 0.21827104687690735, "step": 1395, "step_time": 13.535231037007179 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.6517040431499481, "epoch": 0.00698, "grad_norm": 0.045007478445768356, "kl": 0.14362671971321106, "learning_rate": 7.999828832734144e-06, "loss": -0.03, "step": 1396, "step_time": 5.861694286984857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.833333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.0530163049697876, "epoch": 0.006985, "frac_reward_zero_std": 0.0, "grad_norm": 0.057566020637750626, "kl": 0.2535227444022894, "learning_rate": 7.99982858092743e-06, "loss": -0.1051, "num_tokens": 18614043.0, "reward": 0.36668503284454346, "reward_std": 1.3118976354599, "rewards/rollout_reward_func/mean": 0.36668503284454346, "rewards/rollout_reward_func/std": 1.3118977546691895, "sampling/importance_sampling_ratio/max": 1.117834210395813, "sampling/importance_sampling_ratio/mean": 0.48826420307159424, "sampling/importance_sampling_ratio/min": 1.755231096467469e-05, "sampling/sampling_logp_difference/max": 2.1265223026275635, "sampling/sampling_logp_difference/mean": 0.48742544651031494, "step": 1397, "step_time": 15.744029459005105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.047208845615387, "epoch": 0.00699, "grad_norm": 0.0531027726829052, "kl": 0.23760711774230003, "learning_rate": 7.999828328935642e-06, "loss": -0.1053, "step": 1398, "step_time": 7.6164874189998955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.976477026939392, "epoch": 0.006995, "frac_reward_zero_std": 0.0, "grad_norm": 0.035474665462970734, "kl": 0.1810198500752449, "learning_rate": 7.999828076758775e-06, "loss": -0.1077, "num_tokens": 18646761.0, "reward": 0.8401550650596619, "reward_std": 1.2585171461105347, "rewards/rollout_reward_func/mean": 0.8401550650596619, "rewards/rollout_reward_func/std": 1.2585171461105347, "sampling/importance_sampling_ratio/max": 1.086146354675293, "sampling/importance_sampling_ratio/mean": 0.6546054482460022, "sampling/importance_sampling_ratio/min": 5.420862635219237e-06, "sampling/sampling_logp_difference/max": 2.273050308227539, "sampling/sampling_logp_difference/mean": 0.3153703212738037, "step": 1399, "step_time": 17.11392052398878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9784556031227112, "epoch": 0.007, "grad_norm": 0.03485412895679474, "kl": 0.18064086139202118, "learning_rate": 7.999827824396833e-06, "loss": -0.1078, "step": 1400, "step_time": 6.821997179999016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.444196969270706, "epoch": 0.007005, "frac_reward_zero_std": 0.0, "grad_norm": 0.14136211574077606, "kl": 0.9586338317021728, "learning_rate": 7.999827571849813e-06, "loss": -0.1037, "num_tokens": 18679115.0, "reward": 0.5132105350494385, "reward_std": 1.3147938251495361, "rewards/rollout_reward_func/mean": 0.5132105350494385, "rewards/rollout_reward_func/std": 1.3147939443588257, "sampling/importance_sampling_ratio/max": 1.1106940507888794, "sampling/importance_sampling_ratio/mean": 0.5712136030197144, "sampling/importance_sampling_ratio/min": 1.0402285965938063e-07, "sampling/sampling_logp_difference/max": 2.375680446624756, "sampling/sampling_logp_difference/mean": 0.47856590151786804, "step": 1401, "step_time": 16.223258969010203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4403601363301277, "epoch": 0.00701, "grad_norm": 0.13143697381019592, "kl": 0.9157658573240042, "learning_rate": 7.999827319117716e-06, "loss": -0.1041, "step": 1402, "step_time": 6.982832883004448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3763617351651192, "epoch": 0.007015, "frac_reward_zero_std": 0.0, "grad_norm": 0.09903562068939209, "kl": 0.5126289427280426, "learning_rate": 7.999827066200542e-06, "loss": -0.0781, "num_tokens": 18701943.0, "reward": 1.1883219480514526, "reward_std": 1.2348264455795288, "rewards/rollout_reward_func/mean": 1.1883219480514526, "rewards/rollout_reward_func/std": 1.2348264455795288, "sampling/importance_sampling_ratio/max": 1.0843724012374878, "sampling/importance_sampling_ratio/mean": 0.7830926179885864, "sampling/importance_sampling_ratio/min": 6.586464587599039e-05, "sampling/sampling_logp_difference/max": 1.8372374773025513, "sampling/sampling_logp_difference/mean": 0.25669053196907043, "step": 1403, "step_time": 13.276824996995856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3782019019126892, "epoch": 0.00702, "grad_norm": 0.10720507055521011, "kl": 0.5333787277340889, "learning_rate": 7.999826813098292e-06, "loss": -0.0782, "step": 1404, "step_time": 6.829136260013911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.5625, "completions/mean_terminated_length": 4.5625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9201991707086563, "epoch": 0.007025, "frac_reward_zero_std": 0.0, "grad_norm": 0.28544339537620544, "kl": 0.6289511397480965, "learning_rate": 7.999826559810965e-06, "loss": -0.0731, "num_tokens": 18719685.0, "reward": 1.5809712409973145, "reward_std": 0.41465967893600464, "rewards/rollout_reward_func/mean": 1.5809712409973145, "rewards/rollout_reward_func/std": 0.41465964913368225, "sampling/importance_sampling_ratio/max": 1.0813654661178589, "sampling/importance_sampling_ratio/mean": 0.817307710647583, "sampling/importance_sampling_ratio/min": 0.00018455767713021487, "sampling/sampling_logp_difference/max": 1.6734964847564697, "sampling/sampling_logp_difference/mean": 0.1878596693277359, "step": 1405, "step_time": 6.170253897988005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8932553604245186, "epoch": 0.00703, "grad_norm": 0.2392733246088028, "kl": 0.6388145014643669, "learning_rate": 7.999826306338563e-06, "loss": -0.0739, "step": 1406, "step_time": 3.3573231319896877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.064397931098938, "epoch": 0.007035, "frac_reward_zero_std": 0.0, "grad_norm": 0.07639311254024506, "kl": 0.21153869479894638, "learning_rate": 7.999826052681082e-06, "loss": -0.087, "num_tokens": 18754818.0, "reward": 0.33969709277153015, "reward_std": 1.1685659885406494, "rewards/rollout_reward_func/mean": 0.33969709277153015, "rewards/rollout_reward_func/std": 1.1685659885406494, "sampling/importance_sampling_ratio/max": 1.1860524415969849, "sampling/importance_sampling_ratio/mean": 0.5744030475616455, "sampling/importance_sampling_ratio/min": 0.00031254126224666834, "sampling/sampling_logp_difference/max": 1.350191354751587, "sampling/sampling_logp_difference/mean": 0.30811697244644165, "step": 1407, "step_time": 16.52639617798559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0590625405311584, "epoch": 0.00704, "grad_norm": 0.07625075429677963, "kl": 0.21473834291100502, "learning_rate": 7.999825798838525e-06, "loss": -0.0871, "step": 1408, "step_time": 7.665666412998689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 5.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3283899575471878, "epoch": 0.007045, "frac_reward_zero_std": 0.0, "grad_norm": 0.19626165926456451, "kl": 0.26778510957956314, "learning_rate": 7.999825544810891e-06, "loss": -0.0823, "num_tokens": 18780542.0, "reward": 0.1780903935432434, "reward_std": 1.5054900646209717, "rewards/rollout_reward_func/mean": 0.1780903935432434, "rewards/rollout_reward_func/std": 1.5054900646209717, "sampling/importance_sampling_ratio/max": 1.1122440099716187, "sampling/importance_sampling_ratio/mean": 0.7122453451156616, "sampling/importance_sampling_ratio/min": 0.0024111263919621706, "sampling/sampling_logp_difference/max": 1.3146741390228271, "sampling/sampling_logp_difference/mean": 0.20184436440467834, "step": 1409, "step_time": 13.539135879997048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2893444299697876, "epoch": 0.00705, "grad_norm": 0.14861556887626648, "kl": 0.27989163249731064, "learning_rate": 7.99982529059818e-06, "loss": -0.0833, "step": 1410, "step_time": 6.723566260989173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5803809463977814, "epoch": 0.007055, "frac_reward_zero_std": 0.5, "grad_norm": 0.013084790669381618, "kl": 0.31385577097535133, "learning_rate": 7.999825036200393e-06, "loss": -0.039, "num_tokens": 18798024.0, "reward": 1.763964056968689, "reward_std": 0.7384424805641174, "rewards/rollout_reward_func/mean": 1.763964056968689, "rewards/rollout_reward_func/std": 0.7384424805641174, "sampling/importance_sampling_ratio/max": 1.0497245788574219, "sampling/importance_sampling_ratio/mean": 0.966522753238678, "sampling/importance_sampling_ratio/min": 5.6243584367621224e-06, "sampling/sampling_logp_difference/max": 1.9980171918869019, "sampling/sampling_logp_difference/mean": 0.14341996610164642, "step": 1411, "step_time": 6.097834446991328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5756939351558685, "epoch": 0.00706, "grad_norm": 0.014409626834094524, "kl": 0.3275342807173729, "learning_rate": 7.99982478161753e-06, "loss": -0.039, "step": 1412, "step_time": 3.2870806459977757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4704919755458832, "epoch": 0.007065, "frac_reward_zero_std": 0.0, "grad_norm": 0.05835827440023422, "kl": 0.24385948479175568, "learning_rate": 7.999824526849587e-06, "loss": -0.0783, "num_tokens": 18832516.0, "reward": 0.6450879573822021, "reward_std": 1.1969976425170898, "rewards/rollout_reward_func/mean": 0.6450879573822021, "rewards/rollout_reward_func/std": 1.1969976425170898, "sampling/importance_sampling_ratio/max": 1.1929041147232056, "sampling/importance_sampling_ratio/mean": 0.7448025941848755, "sampling/importance_sampling_ratio/min": 0.001226550666615367, "sampling/sampling_logp_difference/max": 1.2758033275604248, "sampling/sampling_logp_difference/mean": 0.22980371117591858, "step": 1413, "step_time": 16.549893180010258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4606383591890335, "epoch": 0.00707, "grad_norm": 0.04910695552825928, "kl": 0.24507893808186054, "learning_rate": 7.999824271896571e-06, "loss": -0.0784, "step": 1414, "step_time": 7.681171743970481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.483066663146019, "epoch": 0.007075, "frac_reward_zero_std": 0.5, "grad_norm": 0.09232092648744583, "kl": 0.2143610157072544, "learning_rate": 7.999824016758477e-06, "loss": -0.0505, "num_tokens": 18858796.0, "reward": 0.9972181916236877, "reward_std": 1.2638452053070068, "rewards/rollout_reward_func/mean": 0.9972181916236877, "rewards/rollout_reward_func/std": 1.2638452053070068, "sampling/importance_sampling_ratio/max": 1.090333342552185, "sampling/importance_sampling_ratio/mean": 0.7569371461868286, "sampling/importance_sampling_ratio/min": 8.611366730804093e-09, "sampling/sampling_logp_difference/max": 1.8422491550445557, "sampling/sampling_logp_difference/mean": 0.25584539771080017, "step": 1415, "step_time": 15.976035323998076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4756975509226322, "epoch": 0.00708, "grad_norm": 0.09518308192491531, "kl": 0.21452771872282028, "learning_rate": 7.999823761435306e-06, "loss": -0.0502, "step": 1416, "step_time": 6.415255748986965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.733333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0455108135938644, "epoch": 0.007085, "frac_reward_zero_std": 0.5, "grad_norm": 0.09292315691709518, "kl": 0.2976651303470135, "learning_rate": 7.999823505927059e-06, "loss": -0.0435, "num_tokens": 18885131.0, "reward": 1.0174555778503418, "reward_std": 1.2270077466964722, "rewards/rollout_reward_func/mean": 1.0174555778503418, "rewards/rollout_reward_func/std": 1.2270077466964722, "sampling/importance_sampling_ratio/max": 1.289631724357605, "sampling/importance_sampling_ratio/mean": 0.8487774729728699, "sampling/importance_sampling_ratio/min": 4.1975137719418854e-05, "sampling/sampling_logp_difference/max": 1.661656141281128, "sampling/sampling_logp_difference/mean": 0.2104242742061615, "step": 1417, "step_time": 15.380228762995102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0418659821152687, "epoch": 0.00709, "grad_norm": 0.0754985585808754, "kl": 0.3000359144061804, "learning_rate": 7.999823250233735e-06, "loss": -0.0438, "step": 1418, "step_time": 6.706800111016491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2294997479766607, "epoch": 0.007095, "frac_reward_zero_std": 0.0, "grad_norm": 0.12522995471954346, "kl": 0.3175138384103775, "learning_rate": 7.999822994355336e-06, "loss": -0.031, "num_tokens": 18904433.0, "reward": 0.347084641456604, "reward_std": 1.2851014137268066, "rewards/rollout_reward_func/mean": 0.347084641456604, "rewards/rollout_reward_func/std": 1.2851014137268066, "sampling/importance_sampling_ratio/max": 1.036934494972229, "sampling/importance_sampling_ratio/mean": 0.8042929172515869, "sampling/importance_sampling_ratio/min": 4.575670391204767e-05, "sampling/sampling_logp_difference/max": 1.8108813762664795, "sampling/sampling_logp_difference/mean": 0.24821346998214722, "step": 1419, "step_time": 9.846493637014646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.219998300075531, "epoch": 0.0071, "grad_norm": 0.1213374063372612, "kl": 0.32821862772107124, "learning_rate": 7.999822738291858e-06, "loss": -0.0314, "step": 1420, "step_time": 4.931678961991565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 5.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7340913647785783, "epoch": 0.007105, "frac_reward_zero_std": 0.0, "grad_norm": 0.04056743159890175, "kl": 0.7153232432901859, "learning_rate": 7.999822482043305e-06, "loss": -0.0935, "num_tokens": 18927142.0, "reward": 1.2763417959213257, "reward_std": 1.1993474960327148, "rewards/rollout_reward_func/mean": 1.2763417959213257, "rewards/rollout_reward_func/std": 1.1993474960327148, "sampling/importance_sampling_ratio/max": 1.032945156097412, "sampling/importance_sampling_ratio/mean": 0.7004033327102661, "sampling/importance_sampling_ratio/min": 3.5988767876915517e-07, "sampling/sampling_logp_difference/max": 2.1925973892211914, "sampling/sampling_logp_difference/mean": 0.34480729699134827, "step": 1421, "step_time": 13.649612073990284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7340892739593983, "epoch": 0.00711, "grad_norm": 0.04438788443803787, "kl": 0.7466057538986206, "learning_rate": 7.999822225609675e-06, "loss": -0.0934, "step": 1422, "step_time": 6.261328816006426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5902931690216064, "epoch": 0.007115, "frac_reward_zero_std": 0.0, "grad_norm": 0.038496166467666626, "kl": 0.4305649623274803, "learning_rate": 7.999821968990969e-06, "loss": -0.035, "num_tokens": 18957455.0, "reward": 0.08519315719604492, "reward_std": 1.3490315675735474, "rewards/rollout_reward_func/mean": 0.08519315719604492, "rewards/rollout_reward_func/std": 1.3490315675735474, "sampling/importance_sampling_ratio/max": 1.087819218635559, "sampling/importance_sampling_ratio/mean": 0.40982985496520996, "sampling/importance_sampling_ratio/min": 5.418604359874735e-06, "sampling/sampling_logp_difference/max": 2.420435905456543, "sampling/sampling_logp_difference/mean": 0.41878563165664673, "step": 1423, "step_time": 15.610839507018682 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.5882633924484253, "epoch": 0.00712, "grad_norm": 0.038369208574295044, "kl": 0.42915529757738113, "learning_rate": 7.999821712187186e-06, "loss": -0.035, "step": 1424, "step_time": 6.220910918011214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.22419461980462074, "epoch": 0.007125, "frac_reward_zero_std": 0.5, "grad_norm": 0.12585929036140442, "kl": 0.28100766986608505, "learning_rate": 7.999821455198328e-06, "loss": 0.0067, "num_tokens": 18976285.0, "reward": 1.7570854425430298, "reward_std": 0.30458125472068787, "rewards/rollout_reward_func/mean": 1.7570854425430298, "rewards/rollout_reward_func/std": 0.30458128452301025, "sampling/importance_sampling_ratio/max": 1.0545376539230347, "sampling/importance_sampling_ratio/mean": 0.965168297290802, "sampling/importance_sampling_ratio/min": 0.4751747250556946, "sampling/sampling_logp_difference/max": 0.7352477312088013, "sampling/sampling_logp_difference/mean": 0.0243691336363554, "step": 1425, "step_time": 10.129203362987028 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.20526872482150793, "epoch": 0.00713, "grad_norm": 0.07801421731710434, "kl": 0.27931516245007515, "learning_rate": 7.99982119802439e-06, "loss": 0.0063, "step": 1426, "step_time": 5.173449507012265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.363636493682861, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1137519478797913, "epoch": 0.007135, "frac_reward_zero_std": 0.0, "grad_norm": 0.10803596675395966, "kl": 0.23379965126514435, "learning_rate": 7.99982094066538e-06, "loss": -0.0927, "num_tokens": 19011435.0, "reward": 0.3406534194946289, "reward_std": 1.1410608291625977, "rewards/rollout_reward_func/mean": 0.3406534194946289, "rewards/rollout_reward_func/std": 1.1410609483718872, "sampling/importance_sampling_ratio/max": 1.086643099784851, "sampling/importance_sampling_ratio/mean": 0.592056155204773, "sampling/importance_sampling_ratio/min": 4.8382503337052185e-08, "sampling/sampling_logp_difference/max": 2.1857528686523438, "sampling/sampling_logp_difference/mean": 0.36646977066993713, "step": 1427, "step_time": 17.745628548014793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1045845448970795, "epoch": 0.00714, "grad_norm": 0.10701961815357208, "kl": 0.24214541167020798, "learning_rate": 7.99982068312129e-06, "loss": -0.093, "step": 1428, "step_time": 7.960202507994836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7127768509089947, "epoch": 0.007145, "frac_reward_zero_std": 0.0, "grad_norm": 0.3314371705055237, "kl": 0.785788968205452, "learning_rate": 7.999820425392124e-06, "loss": -0.0032, "num_tokens": 19041653.0, "reward": 1.121766209602356, "reward_std": 1.0358320474624634, "rewards/rollout_reward_func/mean": 1.121766209602356, "rewards/rollout_reward_func/std": 1.0358320474624634, "sampling/importance_sampling_ratio/max": 1.0869284868240356, "sampling/importance_sampling_ratio/mean": 0.8283393383026123, "sampling/importance_sampling_ratio/min": 0.0014789080014452338, "sampling/sampling_logp_difference/max": 1.6493942737579346, "sampling/sampling_logp_difference/mean": 0.1360115259885788, "step": 1429, "step_time": 13.910818814008962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7237799614667892, "epoch": 0.00715, "grad_norm": 0.37571224570274353, "kl": 0.7674850523471832, "learning_rate": 7.999820167477884e-06, "loss": -0.0041, "step": 1430, "step_time": 6.415927813999588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.14537265338003635, "epoch": 0.007155, "frac_reward_zero_std": 0.5, "grad_norm": 0.09931555390357971, "kl": 0.24900054186582565, "learning_rate": 7.999819909378565e-06, "loss": 0.003, "num_tokens": 19059182.0, "reward": 1.4784655570983887, "reward_std": 0.7059835195541382, "rewards/rollout_reward_func/mean": 1.4784655570983887, "rewards/rollout_reward_func/std": 0.705983579158783, "sampling/importance_sampling_ratio/max": 1.0997583866119385, "sampling/importance_sampling_ratio/mean": 1.029574990272522, "sampling/importance_sampling_ratio/min": 0.9802578091621399, "sampling/sampling_logp_difference/max": 0.10882772505283356, "sampling/sampling_logp_difference/mean": 0.008931975811719894, "step": 1431, "step_time": 6.203922547007096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15818042121827602, "epoch": 0.00716, "grad_norm": 0.10695767402648926, "kl": 0.24789997935295105, "learning_rate": 7.99981965109417e-06, "loss": 0.0029, "step": 1432, "step_time": 3.401469017000636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8406313639134169, "epoch": 0.007165, "frac_reward_zero_std": 0.5, "grad_norm": 0.13078050315380096, "kl": 0.9373122602701187, "learning_rate": 7.9998193926247e-06, "loss": -0.0301, "num_tokens": 19082212.0, "reward": 0.5438340902328491, "reward_std": 1.0953973531723022, "rewards/rollout_reward_func/mean": 0.5438340902328491, "rewards/rollout_reward_func/std": 1.0953973531723022, "sampling/importance_sampling_ratio/max": 1.0347152948379517, "sampling/importance_sampling_ratio/mean": 0.8946754336357117, "sampling/importance_sampling_ratio/min": 4.031700200357591e-07, "sampling/sampling_logp_difference/max": 1.8486683368682861, "sampling/sampling_logp_difference/mean": 0.1855592429637909, "step": 1433, "step_time": 12.205727929991554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.846281947568059, "epoch": 0.00717, "grad_norm": 0.11486435681581497, "kl": 0.8533507660031319, "learning_rate": 7.999819133970152e-06, "loss": -0.0306, "step": 1434, "step_time": 5.649146562005626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.37605060636997223, "epoch": 0.007175, "frac_reward_zero_std": 0.0, "grad_norm": 0.310674250125885, "kl": 0.26506225392222404, "learning_rate": 7.999818875130531e-06, "loss": -0.0182, "num_tokens": 19103638.0, "reward": 1.371695637702942, "reward_std": 0.8567004799842834, "rewards/rollout_reward_func/mean": 1.371695637702942, "rewards/rollout_reward_func/std": 0.856700599193573, "sampling/importance_sampling_ratio/max": 1.1003497838974, "sampling/importance_sampling_ratio/mean": 0.9288300275802612, "sampling/importance_sampling_ratio/min": 0.5120342373847961, "sampling/sampling_logp_difference/max": 0.3838844895362854, "sampling/sampling_logp_difference/mean": 0.0395892858505249, "step": 1435, "step_time": 9.496568614980788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4002584032714367, "epoch": 0.00718, "grad_norm": 0.2932412922382355, "kl": 0.2624587342143059, "learning_rate": 7.999818616105831e-06, "loss": -0.02, "step": 1436, "step_time": 4.974347114999546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 5.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8185700476169586, "epoch": 0.007185, "frac_reward_zero_std": 0.0, "grad_norm": 0.0683021992444992, "kl": 1.0756178386509418, "learning_rate": 7.999818356896055e-06, "loss": -0.0984, "num_tokens": 19137217.0, "reward": 0.16632705926895142, "reward_std": 1.1296789646148682, "rewards/rollout_reward_func/mean": 0.16632705926895142, "rewards/rollout_reward_func/std": 1.1296790838241577, "sampling/importance_sampling_ratio/max": 1.1004503965377808, "sampling/importance_sampling_ratio/mean": 0.46636873483657837, "sampling/importance_sampling_ratio/min": 1.3465299986137325e-08, "sampling/sampling_logp_difference/max": 2.049755573272705, "sampling/sampling_logp_difference/mean": 0.5044118762016296, "step": 1437, "step_time": 16.819849343999522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.828131005167961, "epoch": 0.00719, "grad_norm": 0.06750534474849701, "kl": 1.0780407674610615, "learning_rate": 7.999818097501202e-06, "loss": -0.0984, "step": 1438, "step_time": 7.189277429002686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.08740652166306973, "epoch": 0.007195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003027777711395174, "kl": 0.22556427121162415, "learning_rate": 7.999817837921274e-06, "loss": 0.0006, "num_tokens": 19152589.0, "reward": 2.0, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 2.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0158048868179321, "sampling/importance_sampling_ratio/mean": 1.0111007690429688, "sampling/importance_sampling_ratio/min": 1.0059406757354736, "sampling/sampling_logp_difference/max": 0.009997047483921051, "sampling/sampling_logp_difference/mean": 0.0028402390889823437, "step": 1439, "step_time": 6.02512824999576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08950452879071236, "epoch": 0.0072, "grad_norm": 0.00030491023790091276, "kl": 0.22539202868938446, "learning_rate": 7.999817578156271e-06, "loss": 0.0006, "step": 1440, "step_time": 2.923103313005413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2320061028003693, "epoch": 0.007205, "frac_reward_zero_std": 0.0, "grad_norm": 0.14300429821014404, "kl": 0.2433238998055458, "learning_rate": 7.99981731820619e-06, "loss": -0.0921, "num_tokens": 19181220.0, "reward": 0.8343297839164734, "reward_std": 1.3054507970809937, "rewards/rollout_reward_func/mean": 0.8343297839164734, "rewards/rollout_reward_func/std": 1.3054507970809937, "sampling/importance_sampling_ratio/max": 1.1001924276351929, "sampling/importance_sampling_ratio/mean": 0.681982159614563, "sampling/importance_sampling_ratio/min": 1.2336273584878654e-06, "sampling/sampling_logp_difference/max": 1.9194592237472534, "sampling/sampling_logp_difference/mean": 0.3982720971107483, "step": 1441, "step_time": 13.664909936996992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2332948744297028, "epoch": 0.00721, "grad_norm": 0.15413200855255127, "kl": 0.24231760203838348, "learning_rate": 7.999817058071033e-06, "loss": -0.0925, "step": 1442, "step_time": 6.302765447006095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.572334237396717, "epoch": 0.007215, "frac_reward_zero_std": 0.0, "grad_norm": 0.11078332364559174, "kl": 0.6181844845414162, "learning_rate": 7.9998167977508e-06, "loss": -0.0877, "num_tokens": 19202166.0, "reward": 0.8386475443840027, "reward_std": 1.51698899269104, "rewards/rollout_reward_func/mean": 0.8386475443840027, "rewards/rollout_reward_func/std": 1.51698899269104, "sampling/importance_sampling_ratio/max": 1.0535911321640015, "sampling/importance_sampling_ratio/mean": 0.679814875125885, "sampling/importance_sampling_ratio/min": 2.2520704078488052e-05, "sampling/sampling_logp_difference/max": 2.030712127685547, "sampling/sampling_logp_difference/mean": 0.32496780157089233, "step": 1443, "step_time": 11.542827817014768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5826972797513008, "epoch": 0.00722, "grad_norm": 0.10035646706819534, "kl": 0.6328193508088589, "learning_rate": 7.99981653724549e-06, "loss": -0.0878, "step": 1444, "step_time": 5.92633999598911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.341328889131546, "epoch": 0.007225, "frac_reward_zero_std": 0.5, "grad_norm": 0.2773815989494324, "kl": 0.31543276831507683, "learning_rate": 7.999816276555105e-06, "loss": -0.0145, "num_tokens": 19220758.0, "reward": 1.1323604583740234, "reward_std": 1.2746375799179077, "rewards/rollout_reward_func/mean": 1.1323604583740234, "rewards/rollout_reward_func/std": 1.2746376991271973, "sampling/importance_sampling_ratio/max": 1.0195473432540894, "sampling/importance_sampling_ratio/mean": 0.580145537853241, "sampling/importance_sampling_ratio/min": 6.269131347380608e-08, "sampling/sampling_logp_difference/max": 2.311483383178711, "sampling/sampling_logp_difference/mean": 0.42683348059654236, "step": 1445, "step_time": 7.139884564996464 }, { "clip_ratio/high_max": 0.04545454680919647, "clip_ratio/high_mean": 0.022727273404598236, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022727273404598236, "entropy": 2.3121635615825653, "epoch": 0.00723, "grad_norm": 0.16311144828796387, "kl": 0.30940569192171097, "learning_rate": 7.999816015679643e-06, "loss": -0.0155, "step": 1446, "step_time": 3.3375286890222924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.423421621322632, "epoch": 0.007235, "frac_reward_zero_std": 0.0, "grad_norm": 0.12717081606388092, "kl": 0.2555876187980175, "learning_rate": 7.999815754619105e-06, "loss": -0.0629, "num_tokens": 19254219.0, "reward": 0.23854884505271912, "reward_std": 1.184119701385498, "rewards/rollout_reward_func/mean": 0.23854884505271912, "rewards/rollout_reward_func/std": 1.1841195821762085, "sampling/importance_sampling_ratio/max": 1.1604924201965332, "sampling/importance_sampling_ratio/mean": 0.4679987132549286, "sampling/importance_sampling_ratio/min": 4.7565148975081684e-07, "sampling/sampling_logp_difference/max": 1.7976579666137695, "sampling/sampling_logp_difference/mean": 0.3775421380996704, "step": 1447, "step_time": 18.891930049008806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4198839366436005, "epoch": 0.00724, "grad_norm": 0.1228310838341713, "kl": 0.25376753509044647, "learning_rate": 7.99981549337349e-06, "loss": -0.0628, "step": 1448, "step_time": 7.665516857989132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 6.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1473318338394165, "epoch": 0.007245, "frac_reward_zero_std": 0.0, "grad_norm": 0.07487091422080994, "kl": 0.5713435336947441, "learning_rate": 7.999815231942802e-06, "loss": -0.0966, "num_tokens": 19285872.0, "reward": 0.22086402773857117, "reward_std": 1.2388592958450317, "rewards/rollout_reward_func/mean": 0.22086402773857117, "rewards/rollout_reward_func/std": 1.2388591766357422, "sampling/importance_sampling_ratio/max": 1.2043370008468628, "sampling/importance_sampling_ratio/mean": 0.39077097177505493, "sampling/importance_sampling_ratio/min": 2.6268135115969926e-06, "sampling/sampling_logp_difference/max": 2.112440347671509, "sampling/sampling_logp_difference/mean": 0.5836032629013062, "step": 1449, "step_time": 15.295669718005229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1352908313274384, "epoch": 0.00725, "grad_norm": 0.07756389677524567, "kl": 0.5696869008243084, "learning_rate": 7.999814970327035e-06, "loss": -0.0966, "step": 1450, "step_time": 7.3987452239962295 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 4.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.646317094564438, "epoch": 0.007255, "frac_reward_zero_std": 0.0, "grad_norm": 0.19306012988090515, "kl": 0.27857656218111515, "learning_rate": 7.999814708526193e-06, "loss": -0.08, "num_tokens": 19311935.0, "reward": 0.8928926587104797, "reward_std": 1.2386674880981445, "rewards/rollout_reward_func/mean": 0.8928926587104797, "rewards/rollout_reward_func/std": 1.2386674880981445, "sampling/importance_sampling_ratio/max": 1.0648905038833618, "sampling/importance_sampling_ratio/mean": 0.47413766384124756, "sampling/importance_sampling_ratio/min": 1.9225390701649303e-07, "sampling/sampling_logp_difference/max": 2.1639649868011475, "sampling/sampling_logp_difference/mean": 0.4475133419036865, "step": 1451, "step_time": 14.495212664973224 }, { "clip_ratio/high_max": 0.08522727340459824, "clip_ratio/high_mean": 0.04261363670229912, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04261363670229912, "entropy": 2.6214818954467773, "epoch": 0.00726, "grad_norm": 0.16703559458255768, "kl": 0.25410640612244606, "learning_rate": 7.999814446540275e-06, "loss": -0.0806, "step": 1452, "step_time": 6.666014394984813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5847051106393337, "epoch": 0.007265, "frac_reward_zero_std": 0.0, "grad_norm": 0.18957702815532684, "kl": 0.3166162818670273, "learning_rate": 7.999814184369282e-06, "loss": -0.0662, "num_tokens": 19333297.0, "reward": 0.7149906158447266, "reward_std": 1.4617078304290771, "rewards/rollout_reward_func/mean": 0.7149906158447266, "rewards/rollout_reward_func/std": 1.4617079496383667, "sampling/importance_sampling_ratio/max": 1.223185420036316, "sampling/importance_sampling_ratio/mean": 0.7639106512069702, "sampling/importance_sampling_ratio/min": 6.356657991091197e-07, "sampling/sampling_logp_difference/max": 2.0617008209228516, "sampling/sampling_logp_difference/mean": 0.3979518413543701, "step": 1453, "step_time": 9.984867766004754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5742236711084843, "epoch": 0.00727, "grad_norm": 0.1784231960773468, "kl": 0.32719289511442184, "learning_rate": 7.999813922013212e-06, "loss": -0.0661, "step": 1454, "step_time": 5.1125924740190385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3702537417411804, "epoch": 0.007275, "frac_reward_zero_std": 0.0, "grad_norm": 0.057751692831516266, "kl": 0.4640843700617552, "learning_rate": 7.999813659472065e-06, "loss": -0.0643, "num_tokens": 19368765.0, "reward": -0.1289878487586975, "reward_std": 0.742163896560669, "rewards/rollout_reward_func/mean": -0.1289878487586975, "rewards/rollout_reward_func/std": 0.742163896560669, "sampling/importance_sampling_ratio/max": 1.0961655378341675, "sampling/importance_sampling_ratio/mean": 0.5061389207839966, "sampling/importance_sampling_ratio/min": 6.221721093879751e-08, "sampling/sampling_logp_difference/max": 2.671501636505127, "sampling/sampling_logp_difference/mean": 0.4007500410079956, "step": 1455, "step_time": 18.2803477270063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.375440627336502, "epoch": 0.00728, "grad_norm": 0.05496335029602051, "kl": 0.4469052515923977, "learning_rate": 7.999813396745843e-06, "loss": -0.0643, "step": 1456, "step_time": 7.8483592820120975 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7428742311894894, "epoch": 0.007285, "frac_reward_zero_std": 0.0, "grad_norm": 0.13825972378253937, "kl": 0.7608034014701843, "learning_rate": 7.999813133834544e-06, "loss": -0.0105, "num_tokens": 19388909.0, "reward": 0.4907039999961853, "reward_std": 1.434077501296997, "rewards/rollout_reward_func/mean": 0.4907039999961853, "rewards/rollout_reward_func/std": 1.434077501296997, "sampling/importance_sampling_ratio/max": 1.1243911981582642, "sampling/importance_sampling_ratio/mean": 0.7891937494277954, "sampling/importance_sampling_ratio/min": 3.411303239886365e-08, "sampling/sampling_logp_difference/max": 2.0204086303710938, "sampling/sampling_logp_difference/mean": 0.32579952478408813, "step": 1457, "step_time": 10.942760472986265 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.7384979873895645, "epoch": 0.00729, "grad_norm": 0.1328229159116745, "kl": 0.7572666704654694, "learning_rate": 7.999812870738171e-06, "loss": -0.0106, "step": 1458, "step_time": 5.395622450989322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 4.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6574837155640125, "epoch": 0.007295, "frac_reward_zero_std": 0.0, "grad_norm": 0.03686262667179108, "kl": 0.39539423771202564, "learning_rate": 7.99981260745672e-06, "loss": -0.0619, "num_tokens": 19419809.0, "reward": 0.5429925322532654, "reward_std": 1.2966623306274414, "rewards/rollout_reward_func/mean": 0.5429925322532654, "rewards/rollout_reward_func/std": 1.2966623306274414, "sampling/importance_sampling_ratio/max": 1.1903189420700073, "sampling/importance_sampling_ratio/mean": 0.6719770431518555, "sampling/importance_sampling_ratio/min": 1.3746564491157187e-06, "sampling/sampling_logp_difference/max": 2.3439230918884277, "sampling/sampling_logp_difference/mean": 0.4112212657928467, "step": 1459, "step_time": 14.172450098005356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6525713987648487, "epoch": 0.0073, "grad_norm": 0.037157732993364334, "kl": 0.3825281746685505, "learning_rate": 7.999812343990194e-06, "loss": -0.062, "step": 1460, "step_time": 6.878995399994892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 5.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6314996778964996, "epoch": 0.007305, "frac_reward_zero_std": 0.0, "grad_norm": 0.08230588585138321, "kl": 0.4999273717403412, "learning_rate": 7.999812080338592e-06, "loss": -0.0933, "num_tokens": 19444038.0, "reward": 0.9796775579452515, "reward_std": 1.1937294006347656, "rewards/rollout_reward_func/mean": 0.9796775579452515, "rewards/rollout_reward_func/std": 1.1937295198440552, "sampling/importance_sampling_ratio/max": 1.1102776527404785, "sampling/importance_sampling_ratio/mean": 0.7267751097679138, "sampling/importance_sampling_ratio/min": 3.4582313674036413e-05, "sampling/sampling_logp_difference/max": 1.7593348026275635, "sampling/sampling_logp_difference/mean": 0.2859486937522888, "step": 1461, "step_time": 12.196562840996194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.63313527405262, "epoch": 0.00731, "grad_norm": 0.08680163323879242, "kl": 0.49560462310910225, "learning_rate": 7.999811816501915e-06, "loss": -0.0931, "step": 1462, "step_time": 5.8365514899924165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 6.615385055541992, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3512189388275146, "epoch": 0.007315, "frac_reward_zero_std": 0.0, "grad_norm": 0.5567861795425415, "kl": 0.243495412170887, "learning_rate": 7.999811552480161e-06, "loss": -0.0232, "num_tokens": 19470939.0, "reward": -0.1229388415813446, "reward_std": 1.164912462234497, "rewards/rollout_reward_func/mean": -0.1229388415813446, "rewards/rollout_reward_func/std": 1.164912462234497, "sampling/importance_sampling_ratio/max": 1.1386057138442993, "sampling/importance_sampling_ratio/mean": 0.5228478908538818, "sampling/importance_sampling_ratio/min": 6.547735392814502e-05, "sampling/sampling_logp_difference/max": 2.0372467041015625, "sampling/sampling_logp_difference/mean": 0.34728607535362244, "step": 1463, "step_time": 14.152338676009094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.05625000037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05625000037252903, "entropy": 2.368689000606537, "epoch": 0.00732, "grad_norm": 0.19233563542366028, "kl": 0.2438411582261324, "learning_rate": 7.999811288273333e-06, "loss": -0.027, "step": 1464, "step_time": 6.853755475996877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 5.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8088655769824982, "epoch": 0.007325, "frac_reward_zero_std": 0.0, "grad_norm": 0.10770764946937561, "kl": 0.1702505499124527, "learning_rate": 7.999811023881427e-06, "loss": -0.0406, "num_tokens": 19494271.0, "reward": -0.6649453639984131, "reward_std": 0.7594596147537231, "rewards/rollout_reward_func/mean": -0.6649453639984131, "rewards/rollout_reward_func/std": 0.7594596147537231, "sampling/importance_sampling_ratio/max": 1.1857293844223022, "sampling/importance_sampling_ratio/mean": 0.620519757270813, "sampling/importance_sampling_ratio/min": 0.0001342163304798305, "sampling/sampling_logp_difference/max": 1.5802438259124756, "sampling/sampling_logp_difference/mean": 0.24430593848228455, "step": 1465, "step_time": 13.675867824014858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.019444444682449102, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019444444682449102, "entropy": 1.819427728652954, "epoch": 0.00733, "grad_norm": 0.10532443970441818, "kl": 0.179043710231781, "learning_rate": 7.999810759304447e-06, "loss": -0.0408, "step": 1466, "step_time": 6.5150562320050085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7368223667144775, "epoch": 0.007335, "frac_reward_zero_std": 0.0, "grad_norm": 0.19186140596866608, "kl": 0.121115499176085, "learning_rate": 7.99981049454239e-06, "loss": -0.1049, "num_tokens": 19532009.0, "reward": 0.15233123302459717, "reward_std": 1.2708113193511963, "rewards/rollout_reward_func/mean": 0.15233123302459717, "rewards/rollout_reward_func/std": 1.2708113193511963, "sampling/importance_sampling_ratio/max": 1.3268101215362549, "sampling/importance_sampling_ratio/mean": 0.4881369471549988, "sampling/importance_sampling_ratio/min": 7.467244722647592e-05, "sampling/sampling_logp_difference/max": 1.6800119876861572, "sampling/sampling_logp_difference/mean": 0.3911798596382141, "step": 1467, "step_time": 20.439891798014287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.746295928955078, "epoch": 0.00734, "grad_norm": 0.20284320414066315, "kl": 0.11755872145295143, "learning_rate": 7.999810229595256e-06, "loss": -0.1058, "step": 1468, "step_time": 7.812137156986864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4261662364006042, "epoch": 0.007345, "frac_reward_zero_std": 0.0, "grad_norm": 0.3234354555606842, "kl": 0.799496628344059, "learning_rate": 7.99980996446305e-06, "loss": -0.0858, "num_tokens": 19559481.0, "reward": 1.0712182521820068, "reward_std": 1.2180025577545166, "rewards/rollout_reward_func/mean": 1.0712182521820068, "rewards/rollout_reward_func/std": 1.2180025577545166, "sampling/importance_sampling_ratio/max": 1.0732386112213135, "sampling/importance_sampling_ratio/mean": 0.5725064873695374, "sampling/importance_sampling_ratio/min": 5.018555953029136e-07, "sampling/sampling_logp_difference/max": 1.7957109212875366, "sampling/sampling_logp_difference/mean": 0.4153805375099182, "step": 1469, "step_time": 18.203313782985788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4151430130004883, "epoch": 0.00735, "grad_norm": 0.2952388823032379, "kl": 0.7655444517731667, "learning_rate": 7.999809699145766e-06, "loss": -0.0872, "step": 1470, "step_time": 7.996463374991436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0835091024637222, "epoch": 0.007355, "frac_reward_zero_std": 0.0, "grad_norm": 0.16799893975257874, "kl": 0.17340787407010794, "learning_rate": 7.999809433643407e-06, "loss": -0.1064, "num_tokens": 19595602.0, "reward": 0.23690351843833923, "reward_std": 1.2377036809921265, "rewards/rollout_reward_func/mean": 0.23690351843833923, "rewards/rollout_reward_func/std": 1.237703800201416, "sampling/importance_sampling_ratio/max": 1.1459059715270996, "sampling/importance_sampling_ratio/mean": 0.4889192581176758, "sampling/importance_sampling_ratio/min": 0.00021522394672501832, "sampling/sampling_logp_difference/max": 1.4015530347824097, "sampling/sampling_logp_difference/mean": 0.3187318444252014, "step": 1471, "step_time": 17.62863341999764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "entropy": 2.064443051815033, "epoch": 0.00736, "grad_norm": 0.08897202461957932, "kl": 0.17763910256326199, "learning_rate": 7.99980916795597e-06, "loss": -0.1073, "step": 1472, "step_time": 7.017731157990056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0199004411697388, "epoch": 0.007365, "frac_reward_zero_std": 0.0, "grad_norm": 0.05994289740920067, "kl": 0.36400217935442924, "learning_rate": 7.99980890208346e-06, "loss": -0.056, "num_tokens": 19620884.0, "reward": 0.48938047885894775, "reward_std": 1.2979671955108643, "rewards/rollout_reward_func/mean": 0.48938047885894775, "rewards/rollout_reward_func/std": 1.2979673147201538, "sampling/importance_sampling_ratio/max": 1.1180747747421265, "sampling/importance_sampling_ratio/mean": 0.6006075143814087, "sampling/importance_sampling_ratio/min": 6.716320058330894e-05, "sampling/sampling_logp_difference/max": 1.6966980695724487, "sampling/sampling_logp_difference/mean": 0.3284192979335785, "step": 1473, "step_time": 18.16127961401071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0268724858760834, "epoch": 0.00737, "grad_norm": 0.056530945003032684, "kl": 0.3611858580261469, "learning_rate": 7.999808636025872e-06, "loss": -0.0559, "step": 1474, "step_time": 6.978979193008854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.020966947078705, "epoch": 0.007375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2605935037136078, "kl": 0.10959331505000591, "learning_rate": 7.99980836978321e-06, "loss": -0.1046, "num_tokens": 19649485.0, "reward": 0.31238171458244324, "reward_std": 1.2112339735031128, "rewards/rollout_reward_func/mean": 0.31238171458244324, "rewards/rollout_reward_func/std": 1.2112339735031128, "sampling/importance_sampling_ratio/max": 1.1236004829406738, "sampling/importance_sampling_ratio/mean": 0.4482538104057312, "sampling/importance_sampling_ratio/min": 4.1980021592280536e-07, "sampling/sampling_logp_difference/max": 2.0047669410705566, "sampling/sampling_logp_difference/mean": 0.4771813750267029, "step": 1475, "step_time": 16.087976473019808 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "entropy": 3.017954856157303, "epoch": 0.00738, "grad_norm": 0.09029316157102585, "kl": 0.11547713167965412, "learning_rate": 7.999808103355471e-06, "loss": -0.1062, "step": 1476, "step_time": 7.728687172973878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7785131931304932, "epoch": 0.007385, "frac_reward_zero_std": 0.0, "grad_norm": 0.03934640809893608, "kl": 0.1924713458865881, "learning_rate": 7.999807836742657e-06, "loss": -0.0852, "num_tokens": 19670203.0, "reward": 1.0690737962722778, "reward_std": 1.2532157897949219, "rewards/rollout_reward_func/mean": 1.0690737962722778, "rewards/rollout_reward_func/std": 1.2532157897949219, "sampling/importance_sampling_ratio/max": 1.077163577079773, "sampling/importance_sampling_ratio/mean": 0.7140989303588867, "sampling/importance_sampling_ratio/min": 3.1747545108373743e-06, "sampling/sampling_logp_difference/max": 1.7706583738327026, "sampling/sampling_logp_difference/mean": 0.27938979864120483, "step": 1477, "step_time": 11.417364959983388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7776436507701874, "epoch": 0.00739, "grad_norm": 0.0356522835791111, "kl": 0.20266283862292767, "learning_rate": 7.999807569944769e-06, "loss": -0.0853, "step": 1478, "step_time": 5.696292330001597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4129472374916077, "epoch": 0.007395, "frac_reward_zero_std": 0.0, "grad_norm": 0.08994831889867783, "kl": 0.33194117434322834, "learning_rate": 7.999807302961803e-06, "loss": -0.0633, "num_tokens": 19704577.0, "reward": 0.17812339961528778, "reward_std": 1.2196074724197388, "rewards/rollout_reward_func/mean": 0.17812339961528778, "rewards/rollout_reward_func/std": 1.2196074724197388, "sampling/importance_sampling_ratio/max": 1.0515475273132324, "sampling/importance_sampling_ratio/mean": 0.4930156469345093, "sampling/importance_sampling_ratio/min": 1.891650072138873e-07, "sampling/sampling_logp_difference/max": 1.8421483039855957, "sampling/sampling_logp_difference/mean": 0.4080522656440735, "step": 1479, "step_time": 17.879282591995434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.415932297706604, "epoch": 0.0074, "grad_norm": 0.09978650510311127, "kl": 0.3491690903902054, "learning_rate": 7.999807035793762e-06, "loss": -0.0634, "step": 1480, "step_time": 8.867195304002962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5565518289804459, "epoch": 0.007405, "frac_reward_zero_std": 0.0, "grad_norm": 0.2570553719997406, "kl": 0.2430429719388485, "learning_rate": 7.999806768440647e-06, "loss": -0.086, "num_tokens": 19740208.0, "reward": 0.6440104246139526, "reward_std": 1.1607284545898438, "rewards/rollout_reward_func/mean": 0.6440104246139526, "rewards/rollout_reward_func/std": 1.1607284545898438, "sampling/importance_sampling_ratio/max": 1.0604628324508667, "sampling/importance_sampling_ratio/mean": 0.7127262949943542, "sampling/importance_sampling_ratio/min": 0.0002067083987640217, "sampling/sampling_logp_difference/max": 1.2923054695129395, "sampling/sampling_logp_difference/mean": 0.2551776170730591, "step": 1481, "step_time": 18.75202894800168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.564446335658431, "epoch": 0.00741, "grad_norm": 0.24328583478927612, "kl": 0.24508366361260414, "learning_rate": 7.999806500902455e-06, "loss": -0.0866, "step": 1482, "step_time": 7.905793133017141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0689151883125305, "epoch": 0.007415, "frac_reward_zero_std": 0.0, "grad_norm": 0.015691328793764114, "kl": 0.21566129848361015, "learning_rate": 7.999806233179187e-06, "loss": -0.0847, "num_tokens": 19772550.0, "reward": 0.6118212342262268, "reward_std": 1.1856136322021484, "rewards/rollout_reward_func/mean": 0.6118212342262268, "rewards/rollout_reward_func/std": 1.185613751411438, "sampling/importance_sampling_ratio/max": 1.0471073389053345, "sampling/importance_sampling_ratio/mean": 0.711058497428894, "sampling/importance_sampling_ratio/min": 3.032740778508014e-07, "sampling/sampling_logp_difference/max": 1.7269352674484253, "sampling/sampling_logp_difference/mean": 0.39079219102859497, "step": 1483, "step_time": 18.23462312700576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0720179975032806, "epoch": 0.00742, "grad_norm": 0.016159087419509888, "kl": 0.21533046662807465, "learning_rate": 7.999805965270845e-06, "loss": -0.0848, "step": 1484, "step_time": 8.230581537995022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 5.909090995788574, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4484206438064575, "epoch": 0.007425, "frac_reward_zero_std": 0.0, "grad_norm": 0.1650974303483963, "kl": 0.3862779997289181, "learning_rate": 7.999805697177426e-06, "loss": -0.0654, "num_tokens": 19803904.0, "reward": 0.4360363185405731, "reward_std": 1.2869503498077393, "rewards/rollout_reward_func/mean": 0.4360363185405731, "rewards/rollout_reward_func/std": 1.2869503498077393, "sampling/importance_sampling_ratio/max": 1.0517771244049072, "sampling/importance_sampling_ratio/mean": 0.4872197210788727, "sampling/importance_sampling_ratio/min": 4.992011236026883e-05, "sampling/sampling_logp_difference/max": 1.7154618501663208, "sampling/sampling_logp_difference/mean": 0.3685925602912903, "step": 1485, "step_time": 16.807130702014547 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 2.4535944163799286, "epoch": 0.00743, "grad_norm": 0.14133666455745697, "kl": 0.3854820504784584, "learning_rate": 7.999805428898932e-06, "loss": -0.066, "step": 1486, "step_time": 7.718240827001864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.866666793823242, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.049189817160368, "epoch": 0.007435, "frac_reward_zero_std": 0.0, "grad_norm": 0.1334032565355301, "kl": 0.8264877833425999, "learning_rate": 7.999805160435365e-06, "loss": -0.0568, "num_tokens": 19833029.0, "reward": 1.1082444190979004, "reward_std": 1.0784350633621216, "rewards/rollout_reward_func/mean": 1.1082444190979004, "rewards/rollout_reward_func/std": 1.0784350633621216, "sampling/importance_sampling_ratio/max": 1.0987235307693481, "sampling/importance_sampling_ratio/mean": 0.7923523187637329, "sampling/importance_sampling_ratio/min": 1.107168145608739e-06, "sampling/sampling_logp_difference/max": 1.9954626560211182, "sampling/sampling_logp_difference/mean": 0.23284727334976196, "step": 1487, "step_time": 13.574998336000135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0540140625089407, "epoch": 0.00744, "grad_norm": 0.15010780096054077, "kl": 0.7223892323672771, "learning_rate": 7.99980489178672e-06, "loss": -0.0575, "step": 1488, "step_time": 6.26707155200711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8042061552405357, "epoch": 0.007445, "frac_reward_zero_std": 0.0, "grad_norm": 0.03711480647325516, "kl": 0.25897064432501793, "learning_rate": 7.999804622952999e-06, "loss": -0.0444, "num_tokens": 19865003.0, "reward": 0.7248730659484863, "reward_std": 1.4134773015975952, "rewards/rollout_reward_func/mean": 0.7248730659484863, "rewards/rollout_reward_func/std": 1.4134773015975952, "sampling/importance_sampling_ratio/max": 1.083322525024414, "sampling/importance_sampling_ratio/mean": 0.9025859832763672, "sampling/importance_sampling_ratio/min": 8.606236951891333e-05, "sampling/sampling_logp_difference/max": 1.523558259010315, "sampling/sampling_logp_difference/mean": 0.170466810464859, "step": 1489, "step_time": 15.421552034997148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8158394657075405, "epoch": 0.00745, "grad_norm": 0.0484720841050148, "kl": 0.2585720419883728, "learning_rate": 7.999804353934203e-06, "loss": -0.0446, "step": 1490, "step_time": 6.843496743982541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2205045074224472, "epoch": 0.007455, "frac_reward_zero_std": 0.0, "grad_norm": 0.055855054408311844, "kl": 0.18095367774367332, "learning_rate": 7.999804084730332e-06, "loss": -0.0536, "num_tokens": 19894931.0, "reward": 0.23912042379379272, "reward_std": 1.1832518577575684, "rewards/rollout_reward_func/mean": 0.23912042379379272, "rewards/rollout_reward_func/std": 1.1832518577575684, "sampling/importance_sampling_ratio/max": 1.0734310150146484, "sampling/importance_sampling_ratio/mean": 0.7913995385169983, "sampling/importance_sampling_ratio/min": 0.0004754334222525358, "sampling/sampling_logp_difference/max": 1.1900403499603271, "sampling/sampling_logp_difference/mean": 0.17263148725032806, "step": 1491, "step_time": 16.31510887799959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2262866124510765, "epoch": 0.00746, "grad_norm": 0.06631192564964294, "kl": 0.1805138811469078, "learning_rate": 7.999803815341386e-06, "loss": -0.0535, "step": 1492, "step_time": 6.8953364539920585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7823536582291126, "epoch": 0.007465, "frac_reward_zero_std": 0.5, "grad_norm": 0.013470995239913464, "kl": 0.20979464426636696, "learning_rate": 7.999803545767366e-06, "loss": -0.0566, "num_tokens": 19919477.0, "reward": 1.2495291233062744, "reward_std": 0.9646222591400146, "rewards/rollout_reward_func/mean": 1.2495291233062744, "rewards/rollout_reward_func/std": 0.9646223783493042, "sampling/importance_sampling_ratio/max": 1.0667752027511597, "sampling/importance_sampling_ratio/mean": 0.8329833745956421, "sampling/importance_sampling_ratio/min": 0.002576948842033744, "sampling/sampling_logp_difference/max": 1.02189040184021, "sampling/sampling_logp_difference/mean": 0.1265021413564682, "step": 1493, "step_time": 15.152621484987321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7859179824590683, "epoch": 0.00747, "grad_norm": 0.014444909989833832, "kl": 0.20918740704655647, "learning_rate": 7.999803276008267e-06, "loss": -0.0565, "step": 1494, "step_time": 6.339687685991521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0841589756309986, "epoch": 0.007475, "frac_reward_zero_std": 0.5, "grad_norm": 0.041502706706523895, "kl": 0.26556512527167797, "learning_rate": 7.999803006064094e-06, "loss": -0.0316, "num_tokens": 19944725.0, "reward": 0.818926215171814, "reward_std": 1.2510221004486084, "rewards/rollout_reward_func/mean": 0.818926215171814, "rewards/rollout_reward_func/std": 1.2510221004486084, "sampling/importance_sampling_ratio/max": 1.0714305639266968, "sampling/importance_sampling_ratio/mean": 0.7378710508346558, "sampling/importance_sampling_ratio/min": 1.1046748795706662e-06, "sampling/sampling_logp_difference/max": 2.024229049682617, "sampling/sampling_logp_difference/mean": 0.19926807284355164, "step": 1495, "step_time": 13.959415461999015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.1160076651722193, "epoch": 0.00748, "grad_norm": 0.023223161697387695, "kl": 0.2657831907272339, "learning_rate": 7.999802735934847e-06, "loss": -0.0317, "step": 1496, "step_time": 6.354327018998447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2100292891263962, "epoch": 0.007485, "frac_reward_zero_std": 0.0, "grad_norm": 0.04110812768340111, "kl": 0.3319787345826626, "learning_rate": 7.999802465620524e-06, "loss": -0.0857, "num_tokens": 19970070.0, "reward": 1.3838471174240112, "reward_std": 0.9771910309791565, "rewards/rollout_reward_func/mean": 1.3838471174240112, "rewards/rollout_reward_func/std": 0.9771910905838013, "sampling/importance_sampling_ratio/max": 1.1288585662841797, "sampling/importance_sampling_ratio/mean": 0.7293139696121216, "sampling/importance_sampling_ratio/min": 0.0027051223441958427, "sampling/sampling_logp_difference/max": 1.586754322052002, "sampling/sampling_logp_difference/mean": 0.17945614457130432, "step": 1497, "step_time": 17.00191973200708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2125456631183624, "epoch": 0.00749, "grad_norm": 0.042144473642110825, "kl": 0.32639745622873306, "learning_rate": 7.999802195121125e-06, "loss": -0.0856, "step": 1498, "step_time": 7.245824763987912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7487117052078247, "epoch": 0.007495, "frac_reward_zero_std": 0.0, "grad_norm": 0.014528167434036732, "kl": 0.21244237199425697, "learning_rate": 7.999801924436653e-06, "loss": -0.0349, "num_tokens": 19997568.0, "reward": 0.5322491526603699, "reward_std": 1.3872334957122803, "rewards/rollout_reward_func/mean": 0.5322491526603699, "rewards/rollout_reward_func/std": 1.3872336149215698, "sampling/importance_sampling_ratio/max": 1.0749433040618896, "sampling/importance_sampling_ratio/mean": 0.6687562465667725, "sampling/importance_sampling_ratio/min": 7.764564361423254e-05, "sampling/sampling_logp_difference/max": 1.5105202198028564, "sampling/sampling_logp_difference/mean": 0.2277374565601349, "step": 1499, "step_time": 14.316150993996416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7550337314605713, "epoch": 0.0075, "grad_norm": 0.01614874228835106, "kl": 0.2127131223678589, "learning_rate": 7.999801653567104e-06, "loss": -0.0349, "step": 1500, "step_time": 6.326808317011455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1832325272262096, "epoch": 0.007505, "frac_reward_zero_std": 0.0, "grad_norm": 0.031240135431289673, "kl": 0.5202484913170338, "learning_rate": 7.99980138251248e-06, "loss": -0.0918, "num_tokens": 20020351.0, "reward": 1.2382967472076416, "reward_std": 1.190963864326477, "rewards/rollout_reward_func/mean": 1.2382967472076416, "rewards/rollout_reward_func/std": 1.190963864326477, "sampling/importance_sampling_ratio/max": 1.1148204803466797, "sampling/importance_sampling_ratio/mean": 0.7795323729515076, "sampling/importance_sampling_ratio/min": 0.00032069129520095885, "sampling/sampling_logp_difference/max": 1.5385191440582275, "sampling/sampling_logp_difference/mean": 0.21958981454372406, "step": 1501, "step_time": 13.448393061989918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1868649534881115, "epoch": 0.00751, "grad_norm": 0.030062658712267876, "kl": 0.5460498705506325, "learning_rate": 7.99980111127278e-06, "loss": -0.0918, "step": 1502, "step_time": 6.251524215011159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8985204845666885, "epoch": 0.007515, "frac_reward_zero_std": 0.0, "grad_norm": 0.1609419733285904, "kl": 0.3399246856570244, "learning_rate": 7.999800839848006e-06, "loss": -0.0647, "num_tokens": 20047993.0, "reward": 1.1140892505645752, "reward_std": 1.1862561702728271, "rewards/rollout_reward_func/mean": 1.1140892505645752, "rewards/rollout_reward_func/std": 1.1862561702728271, "sampling/importance_sampling_ratio/max": 1.0758602619171143, "sampling/importance_sampling_ratio/mean": 0.8764278888702393, "sampling/importance_sampling_ratio/min": 2.2418208800445427e-07, "sampling/sampling_logp_difference/max": 1.8129104375839233, "sampling/sampling_logp_difference/mean": 0.2324294149875641, "step": 1503, "step_time": 12.039600883013918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9270583167672157, "epoch": 0.00752, "grad_norm": 0.13274532556533813, "kl": 0.3449060246348381, "learning_rate": 7.999800568238156e-06, "loss": -0.0657, "step": 1504, "step_time": 5.895356286986498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 5.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7078895270824432, "epoch": 0.007525, "frac_reward_zero_std": 0.0, "grad_norm": 0.05269431322813034, "kl": 0.3863151781260967, "learning_rate": 7.999800296443233e-06, "loss": -0.0575, "num_tokens": 20082366.0, "reward": 0.8903161287307739, "reward_std": 1.2422467470169067, "rewards/rollout_reward_func/mean": 0.8903161287307739, "rewards/rollout_reward_func/std": 1.2422468662261963, "sampling/importance_sampling_ratio/max": 1.0965102910995483, "sampling/importance_sampling_ratio/mean": 0.5483286380767822, "sampling/importance_sampling_ratio/min": 0.00011807636474259198, "sampling/sampling_logp_difference/max": 1.948000192642212, "sampling/sampling_logp_difference/mean": 0.3095080852508545, "step": 1505, "step_time": 18.25788128100976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7219111323356628, "epoch": 0.00753, "grad_norm": 0.05569199100136757, "kl": 0.3867430053651333, "learning_rate": 7.999800024463232e-06, "loss": -0.0577, "step": 1506, "step_time": 7.759575076997862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8205993548035622, "epoch": 0.007535, "frac_reward_zero_std": 0.0, "grad_norm": 0.038005247712135315, "kl": 0.23146803304553032, "learning_rate": 7.999799752298156e-06, "loss": -0.0948, "num_tokens": 20112442.0, "reward": 0.5989412069320679, "reward_std": 1.4388431310653687, "rewards/rollout_reward_func/mean": 0.5989412069320679, "rewards/rollout_reward_func/std": 1.4388432502746582, "sampling/importance_sampling_ratio/max": 1.07081139087677, "sampling/importance_sampling_ratio/mean": 0.6222844123840332, "sampling/importance_sampling_ratio/min": 6.369489710777998e-06, "sampling/sampling_logp_difference/max": 1.764236569404602, "sampling/sampling_logp_difference/mean": 0.3395959436893463, "step": 1507, "step_time": 20.569913891988108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8193331360816956, "epoch": 0.00754, "grad_norm": 0.037401556968688965, "kl": 0.22814658656716347, "learning_rate": 7.999799479948006e-06, "loss": -0.0949, "step": 1508, "step_time": 9.789754541998263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6061278283596039, "epoch": 0.007545, "frac_reward_zero_std": 0.0, "grad_norm": 0.22420448064804077, "kl": 0.4675753340125084, "learning_rate": 7.999799207412781e-06, "loss": -0.038, "num_tokens": 20137483.0, "reward": 1.0131440162658691, "reward_std": 1.1974310874938965, "rewards/rollout_reward_func/mean": 1.0131440162658691, "rewards/rollout_reward_func/std": 1.1974310874938965, "sampling/importance_sampling_ratio/max": 1.0484236478805542, "sampling/importance_sampling_ratio/mean": 0.6382145285606384, "sampling/importance_sampling_ratio/min": 8.672855074109975e-06, "sampling/sampling_logp_difference/max": 1.7562639713287354, "sampling/sampling_logp_difference/mean": 0.2825651168823242, "step": 1509, "step_time": 13.900200039992342 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.005434782709926367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01585144968703389, "entropy": 1.651414841413498, "epoch": 0.00755, "grad_norm": 0.18034736812114716, "kl": 0.4172847755253315, "learning_rate": 7.999798934692482e-06, "loss": -0.0403, "step": 1510, "step_time": 5.873777405984583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4773999154567719, "epoch": 0.007555, "frac_reward_zero_std": 0.0, "grad_norm": 0.2402796447277069, "kl": 0.3804287537932396, "learning_rate": 7.999798661787106e-06, "loss": -0.0366, "num_tokens": 20166031.0, "reward": 0.46761929988861084, "reward_std": 1.331716775894165, "rewards/rollout_reward_func/mean": 0.46761929988861084, "rewards/rollout_reward_func/std": 1.3317168951034546, "sampling/importance_sampling_ratio/max": 1.1038751602172852, "sampling/importance_sampling_ratio/mean": 0.7578715682029724, "sampling/importance_sampling_ratio/min": 3.3280346087849466e-06, "sampling/sampling_logp_difference/max": 2.0464913845062256, "sampling/sampling_logp_difference/mean": 0.2664772570133209, "step": 1511, "step_time": 14.223920223012101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.57388436794281, "epoch": 0.00756, "grad_norm": 0.1678513139486313, "kl": 0.38228457048535347, "learning_rate": 7.999798388696656e-06, "loss": -0.0388, "step": 1512, "step_time": 6.33482574300433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 5.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.621690034866333, "epoch": 0.007565, "frac_reward_zero_std": 0.0, "grad_norm": 0.13931584358215332, "kl": 0.1866415124386549, "learning_rate": 7.99979811542113e-06, "loss": -0.0715, "num_tokens": 20197842.0, "reward": 0.2541400194168091, "reward_std": 1.2402949333190918, "rewards/rollout_reward_func/mean": 0.2541400194168091, "rewards/rollout_reward_func/std": 1.2402949333190918, "sampling/importance_sampling_ratio/max": 1.0520386695861816, "sampling/importance_sampling_ratio/mean": 0.5088316202163696, "sampling/importance_sampling_ratio/min": 5.973530278424732e-06, "sampling/sampling_logp_difference/max": 2.154599666595459, "sampling/sampling_logp_difference/mean": 0.4308692216873169, "step": 1513, "step_time": 15.244002387000364 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.650425434112549, "epoch": 0.00757, "grad_norm": 0.12915651500225067, "kl": 0.18568795919418335, "learning_rate": 7.999797841960529e-06, "loss": -0.071, "step": 1514, "step_time": 6.478239502990618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.875, "completions/mean_terminated_length": 5.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.2754642367362976, "epoch": 0.007575, "frac_reward_zero_std": 0.0, "grad_norm": 0.13285844027996063, "kl": 0.34651092533022165, "learning_rate": 7.999797568314853e-06, "loss": -0.0263, "num_tokens": 20231298.0, "reward": -0.4651893377304077, "reward_std": 0.9543967843055725, "rewards/rollout_reward_func/mean": -0.4651893377304077, "rewards/rollout_reward_func/std": 0.9543968439102173, "sampling/importance_sampling_ratio/max": 1.0168260335922241, "sampling/importance_sampling_ratio/mean": 0.18390898406505585, "sampling/importance_sampling_ratio/min": 3.5064201711065834e-06, "sampling/sampling_logp_difference/max": 1.9255812168121338, "sampling/sampling_logp_difference/mean": 0.466081827878952, "step": 1515, "step_time": 20.61211106798146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3007445335388184, "epoch": 0.00758, "grad_norm": 0.14253318309783936, "kl": 0.3438131529837847, "learning_rate": 7.999797294484104e-06, "loss": -0.0267, "step": 1516, "step_time": 8.065717076009605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.506408214569092, "epoch": 0.007585, "frac_reward_zero_std": 0.0, "grad_norm": 0.2580949068069458, "kl": 0.27337414771318436, "learning_rate": 7.999797020468278e-06, "loss": -0.0675, "num_tokens": 20270340.0, "reward": 0.1906830370426178, "reward_std": 1.2599159479141235, "rewards/rollout_reward_func/mean": 0.1906830370426178, "rewards/rollout_reward_func/std": 1.2599159479141235, "sampling/importance_sampling_ratio/max": 1.0934652090072632, "sampling/importance_sampling_ratio/mean": 0.6295026540756226, "sampling/importance_sampling_ratio/min": 5.346475973055931e-07, "sampling/sampling_logp_difference/max": 2.2131035327911377, "sampling/sampling_logp_difference/mean": 0.39361369609832764, "step": 1517, "step_time": 19.125454378983704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.56825590133667, "epoch": 0.00759, "grad_norm": 0.2295393943786621, "kl": 0.2727389745414257, "learning_rate": 7.999796746267379e-06, "loss": -0.0684, "step": 1518, "step_time": 8.182645267006592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.482828438282013, "epoch": 0.007595, "frac_reward_zero_std": 0.0, "grad_norm": 0.10651760548353195, "kl": 0.2060452476143837, "learning_rate": 7.999796471881403e-06, "loss": 0.0166, "num_tokens": 20300538.0, "reward": -0.42892971634864807, "reward_std": 0.6498684287071228, "rewards/rollout_reward_func/mean": -0.42892971634864807, "rewards/rollout_reward_func/std": 0.6498684287071228, "sampling/importance_sampling_ratio/max": 1.0681747198104858, "sampling/importance_sampling_ratio/mean": 0.41819337010383606, "sampling/importance_sampling_ratio/min": 1.9614760731201386e-06, "sampling/sampling_logp_difference/max": 2.0365703105926514, "sampling/sampling_logp_difference/mean": 0.3960499167442322, "step": 1519, "step_time": 17.046406687994022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.537435084581375, "epoch": 0.0076, "grad_norm": 0.1015002653002739, "kl": 0.21117005124688148, "learning_rate": 7.99979619731035e-06, "loss": 0.0166, "step": 1520, "step_time": 6.205708114008303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0049384236335754, "epoch": 0.007605, "frac_reward_zero_std": 0.0, "grad_norm": 0.3825834393501282, "kl": 0.24456085823476315, "learning_rate": 7.999795922554227e-06, "loss": -0.0555, "num_tokens": 20328401.0, "reward": 0.2721431255340576, "reward_std": 1.2538264989852905, "rewards/rollout_reward_func/mean": 0.2721431255340576, "rewards/rollout_reward_func/std": 1.25382661819458, "sampling/importance_sampling_ratio/max": 1.0540541410446167, "sampling/importance_sampling_ratio/mean": 0.36711859703063965, "sampling/importance_sampling_ratio/min": 6.363026727740362e-07, "sampling/sampling_logp_difference/max": 2.1051442623138428, "sampling/sampling_logp_difference/mean": 0.4639948606491089, "step": 1521, "step_time": 17.812676701010787 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 2.937268316745758, "epoch": 0.00761, "grad_norm": 0.34117913246154785, "kl": 0.24442052096128464, "learning_rate": 7.999795647613027e-06, "loss": -0.057, "step": 1522, "step_time": 7.819211591006024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 6.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0831353664398193, "epoch": 0.007615, "frac_reward_zero_std": 0.0, "grad_norm": 0.12052477151155472, "kl": 0.4000001922249794, "learning_rate": 7.99979537248675e-06, "loss": -0.0815, "num_tokens": 20359614.0, "reward": 0.03804130107164383, "reward_std": 1.0498042106628418, "rewards/rollout_reward_func/mean": 0.03804130107164383, "rewards/rollout_reward_func/std": 1.0498042106628418, "sampling/importance_sampling_ratio/max": 1.0860154628753662, "sampling/importance_sampling_ratio/mean": 0.2567802965641022, "sampling/importance_sampling_ratio/min": 4.836234438698739e-06, "sampling/sampling_logp_difference/max": 1.8933185338974, "sampling/sampling_logp_difference/mean": 0.47561153769493103, "step": 1523, "step_time": 17.39498372800881 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 2.93727707862854, "epoch": 0.00762, "grad_norm": 0.07922134548425674, "kl": 0.35181019455194473, "learning_rate": 7.999795097175401e-06, "loss": -0.0823, "step": 1524, "step_time": 7.649555897005484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8254558742046356, "epoch": 0.007625, "frac_reward_zero_std": 0.0, "grad_norm": 0.021494152024388313, "kl": 0.35996851697564125, "learning_rate": 7.999794821678977e-06, "loss": -0.0834, "num_tokens": 20382940.0, "reward": 1.2970726490020752, "reward_std": 1.1262338161468506, "rewards/rollout_reward_func/mean": 1.2970726490020752, "rewards/rollout_reward_func/std": 1.1262339353561401, "sampling/importance_sampling_ratio/max": 1.0427199602127075, "sampling/importance_sampling_ratio/mean": 0.6492049694061279, "sampling/importance_sampling_ratio/min": 3.2392304092354607e-06, "sampling/sampling_logp_difference/max": 1.8823539018630981, "sampling/sampling_logp_difference/mean": 0.29701635241508484, "step": 1525, "step_time": 14.196023260999937 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.7802757173776627, "epoch": 0.00763, "grad_norm": 0.018709704279899597, "kl": 0.39834949001669884, "learning_rate": 7.999794545997478e-06, "loss": -0.0834, "step": 1526, "step_time": 5.948636815010104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8962409049272537, "epoch": 0.007635, "frac_reward_zero_std": 0.0, "grad_norm": 0.11017637699842453, "kl": 0.19050713069736958, "learning_rate": 7.999794270130904e-06, "loss": -0.0679, "num_tokens": 20412411.0, "reward": 0.04833385348320007, "reward_std": 1.423960566520691, "rewards/rollout_reward_func/mean": 0.04833385348320007, "rewards/rollout_reward_func/std": 1.423960566520691, "sampling/importance_sampling_ratio/max": 1.0732730627059937, "sampling/importance_sampling_ratio/mean": 0.6186167001724243, "sampling/importance_sampling_ratio/min": 4.280704160919413e-06, "sampling/sampling_logp_difference/max": 1.9764184951782227, "sampling/sampling_logp_difference/mean": 0.31077200174331665, "step": 1527, "step_time": 15.255672187995515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8643947541713715, "epoch": 0.00764, "grad_norm": 0.10491383075714111, "kl": 0.1776568368077278, "learning_rate": 7.999793994079254e-06, "loss": -0.0678, "step": 1528, "step_time": 6.329973603002145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 5.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9155349731445312, "epoch": 0.007645, "frac_reward_zero_std": 0.5, "grad_norm": 0.03089815191924572, "kl": 0.24755142256617546, "learning_rate": 7.99979371784253e-06, "loss": -0.0363, "num_tokens": 20434911.0, "reward": 0.7110514044761658, "reward_std": 1.4934046268463135, "rewards/rollout_reward_func/mean": 0.7110514044761658, "rewards/rollout_reward_func/std": 1.4934046268463135, "sampling/importance_sampling_ratio/max": 1.0263540744781494, "sampling/importance_sampling_ratio/mean": 0.5732482075691223, "sampling/importance_sampling_ratio/min": 9.082799579118728e-08, "sampling/sampling_logp_difference/max": 2.177311658859253, "sampling/sampling_logp_difference/mean": 0.2861360013484955, "step": 1529, "step_time": 14.87117100598698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9119966328144073, "epoch": 0.00765, "grad_norm": 0.02801753208041191, "kl": 0.23665702529251575, "learning_rate": 7.999793441420733e-06, "loss": -0.0364, "step": 1530, "step_time": 6.148801261995686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3441714346408844, "epoch": 0.007655, "frac_reward_zero_std": 0.0, "grad_norm": 0.07795219123363495, "kl": 0.25812946259975433, "learning_rate": 7.99979316481386e-06, "loss": -0.0741, "num_tokens": 20464681.0, "reward": 0.17622068524360657, "reward_std": 1.0876137018203735, "rewards/rollout_reward_func/mean": 0.17622068524360657, "rewards/rollout_reward_func/std": 1.087613582611084, "sampling/importance_sampling_ratio/max": 1.051753044128418, "sampling/importance_sampling_ratio/mean": 0.4884134531021118, "sampling/importance_sampling_ratio/min": 2.2080216695030686e-06, "sampling/sampling_logp_difference/max": 2.170757532119751, "sampling/sampling_logp_difference/mean": 0.46477821469306946, "step": 1531, "step_time": 13.616108942005667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.33270001411438, "epoch": 0.00766, "grad_norm": 0.07667775452136993, "kl": 0.25911399722099304, "learning_rate": 7.99979288802191e-06, "loss": -0.0743, "step": 1532, "step_time": 6.709850747996825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6719506531953812, "epoch": 0.007665, "frac_reward_zero_std": 0.0, "grad_norm": 0.12161798030138016, "kl": 0.5940358303487301, "learning_rate": 7.999792611044888e-06, "loss": -0.0807, "num_tokens": 20486627.0, "reward": 0.4771895408630371, "reward_std": 1.454817533493042, "rewards/rollout_reward_func/mean": 0.4771895408630371, "rewards/rollout_reward_func/std": 1.454817533493042, "sampling/importance_sampling_ratio/max": 1.1261857748031616, "sampling/importance_sampling_ratio/mean": 0.6217947602272034, "sampling/importance_sampling_ratio/min": 1.8367527445661835e-05, "sampling/sampling_logp_difference/max": 2.0804343223571777, "sampling/sampling_logp_difference/mean": 0.31613028049468994, "step": 1533, "step_time": 13.901055231995997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.662802204489708, "epoch": 0.00767, "grad_norm": 0.11887979507446289, "kl": 0.5914565324783325, "learning_rate": 7.999792333882792e-06, "loss": -0.0807, "step": 1534, "step_time": 5.843595458005439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.125, "completions/mean_terminated_length": 7.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.971673607826233, "epoch": 0.007675, "frac_reward_zero_std": 0.0, "grad_norm": 0.22445857524871826, "kl": 0.09828888718038797, "learning_rate": 7.99979205653562e-06, "loss": -0.0858, "num_tokens": 20518922.0, "reward": -0.08002422749996185, "reward_std": 1.2862886190414429, "rewards/rollout_reward_func/mean": -0.08002422749996185, "rewards/rollout_reward_func/std": 1.2862886190414429, "sampling/importance_sampling_ratio/max": 1.1267534494400024, "sampling/importance_sampling_ratio/mean": 0.27465611696243286, "sampling/importance_sampling_ratio/min": 4.0974387047754135e-06, "sampling/sampling_logp_difference/max": 1.8971450328826904, "sampling/sampling_logp_difference/mean": 0.42508113384246826, "step": 1535, "step_time": 16.215740447005373 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 2.9452720880508423, "epoch": 0.00768, "grad_norm": 0.09047019481658936, "kl": 0.09650505147874355, "learning_rate": 7.999791779003372e-06, "loss": -0.087, "step": 1536, "step_time": 6.737084043998038 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2158902287483215, "epoch": 0.007685, "frac_reward_zero_std": 0.0, "grad_norm": 0.0533030666410923, "kl": 0.21370912343263626, "learning_rate": 7.999791501286051e-06, "loss": -0.0909, "num_tokens": 20550178.0, "reward": 0.38275107741355896, "reward_std": 1.3494099378585815, "rewards/rollout_reward_func/mean": 0.38275107741355896, "rewards/rollout_reward_func/std": 1.349410057067871, "sampling/importance_sampling_ratio/max": 1.073794960975647, "sampling/importance_sampling_ratio/mean": 0.44504958391189575, "sampling/importance_sampling_ratio/min": 0.00012700998922809958, "sampling/sampling_logp_difference/max": 1.821387529373169, "sampling/sampling_logp_difference/mean": 0.325431227684021, "step": 1537, "step_time": 15.249893133004662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2100821137428284, "epoch": 0.00769, "grad_norm": 0.05411839112639427, "kl": 0.22379858046770096, "learning_rate": 7.999791223383655e-06, "loss": -0.0908, "step": 1538, "step_time": 6.480966312010423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 5.615385055541992, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.060099333524704, "epoch": 0.007695, "frac_reward_zero_std": 0.0, "grad_norm": 0.019544247537851334, "kl": 0.18632808327674866, "learning_rate": 7.999790945296184e-06, "loss": -0.1049, "num_tokens": 20582240.0, "reward": 0.8569144010543823, "reward_std": 1.320298671722412, "rewards/rollout_reward_func/mean": 0.8569144010543823, "rewards/rollout_reward_func/std": 1.3202985525131226, "sampling/importance_sampling_ratio/max": 1.0720232725143433, "sampling/importance_sampling_ratio/mean": 0.693911612033844, "sampling/importance_sampling_ratio/min": 3.466068164925673e-06, "sampling/sampling_logp_difference/max": 1.5533649921417236, "sampling/sampling_logp_difference/mean": 0.33946770429611206, "step": 1539, "step_time": 16.559580378016108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0377023816108704, "epoch": 0.0077, "grad_norm": 0.016446325927972794, "kl": 0.1904049813747406, "learning_rate": 7.99979066702364e-06, "loss": -0.1049, "step": 1540, "step_time": 8.268619681999553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4643515944480896, "epoch": 0.007705, "frac_reward_zero_std": 0.5, "grad_norm": 0.0258014015853405, "kl": 0.23514768667519093, "learning_rate": 7.99979038856602e-06, "loss": -0.0328, "num_tokens": 20608678.0, "reward": 1.0673282146453857, "reward_std": 1.314171552658081, "rewards/rollout_reward_func/mean": 1.0673282146453857, "rewards/rollout_reward_func/std": 1.314171552658081, "sampling/importance_sampling_ratio/max": 1.04005765914917, "sampling/importance_sampling_ratio/mean": 0.7025964260101318, "sampling/importance_sampling_ratio/min": 0.00029586805612780154, "sampling/sampling_logp_difference/max": 1.4089159965515137, "sampling/sampling_logp_difference/mean": 0.19526849687099457, "step": 1541, "step_time": 15.513639140990563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005434782709926367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005434782709926367, "entropy": 1.4543925523757935, "epoch": 0.00771, "grad_norm": 0.027410846203565598, "kl": 0.24265547469258308, "learning_rate": 7.999790109923327e-06, "loss": -0.0328, "step": 1542, "step_time": 6.001793970004655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0459517817944288, "epoch": 0.007715, "frac_reward_zero_std": 0.0, "grad_norm": 0.0067240349017083645, "kl": 0.30239056795835495, "learning_rate": 7.999789831095557e-06, "loss": -0.0499, "num_tokens": 20636173.0, "reward": 1.5047029256820679, "reward_std": 1.0428332090377808, "rewards/rollout_reward_func/mean": 1.5047029256820679, "rewards/rollout_reward_func/std": 1.0428332090377808, "sampling/importance_sampling_ratio/max": 1.0540454387664795, "sampling/importance_sampling_ratio/mean": 0.8950643539428711, "sampling/importance_sampling_ratio/min": 4.3381481873439043e-07, "sampling/sampling_logp_difference/max": 2.1588447093963623, "sampling/sampling_logp_difference/mean": 0.2442551702260971, "step": 1543, "step_time": 10.768490728994948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0446610730141401, "epoch": 0.00772, "grad_norm": 0.006503725424408913, "kl": 0.30264395102858543, "learning_rate": 7.999789552082715e-06, "loss": -0.0499, "step": 1544, "step_time": 5.071578781018616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4660114403814077, "epoch": 0.007725, "frac_reward_zero_std": 0.5, "grad_norm": 0.15945595502853394, "kl": 0.8066875077784061, "learning_rate": 7.999789272884797e-06, "loss": -0.0514, "num_tokens": 20660706.0, "reward": 1.426748275756836, "reward_std": 0.9847249984741211, "rewards/rollout_reward_func/mean": 1.426748275756836, "rewards/rollout_reward_func/std": 0.9847249388694763, "sampling/importance_sampling_ratio/max": 1.06563401222229, "sampling/importance_sampling_ratio/mean": 0.8395694494247437, "sampling/importance_sampling_ratio/min": 5.986398718960118e-07, "sampling/sampling_logp_difference/max": 2.1463654041290283, "sampling/sampling_logp_difference/mean": 0.2669076919555664, "step": 1545, "step_time": 12.055062949992134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4543687123805285, "epoch": 0.00773, "grad_norm": 0.13548800349235535, "kl": 0.7731481790542603, "learning_rate": 7.999788993501806e-06, "loss": -0.052, "step": 1546, "step_time": 5.577729772994644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.151645326986909, "epoch": 0.007735, "frac_reward_zero_std": 0.0, "grad_norm": 0.12815342843532562, "kl": 0.7883340194821358, "learning_rate": 7.999788713933739e-06, "loss": -0.079, "num_tokens": 20684354.0, "reward": 1.2287662029266357, "reward_std": 1.0884429216384888, "rewards/rollout_reward_func/mean": 1.2287662029266357, "rewards/rollout_reward_func/std": 1.0884430408477783, "sampling/importance_sampling_ratio/max": 1.085785984992981, "sampling/importance_sampling_ratio/mean": 0.8082810044288635, "sampling/importance_sampling_ratio/min": 0.0004380730970297009, "sampling/sampling_logp_difference/max": 1.8032565116882324, "sampling/sampling_logp_difference/mean": 0.2187638133764267, "step": 1547, "step_time": 15.431690290002734 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.103687772527337, "epoch": 0.00774, "grad_norm": 0.07126756012439728, "kl": 0.7250153496861458, "learning_rate": 7.999788434180598e-06, "loss": -0.0799, "step": 1548, "step_time": 5.902150449997862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.085115134716034, "epoch": 0.007745, "frac_reward_zero_std": 0.0, "grad_norm": 0.05242596194148064, "kl": 0.23098788782954216, "learning_rate": 7.999788154242382e-06, "loss": -0.1132, "num_tokens": 20719912.0, "reward": 0.572591245174408, "reward_std": 1.1335750818252563, "rewards/rollout_reward_func/mean": 0.572591245174408, "rewards/rollout_reward_func/std": 1.1335750818252563, "sampling/importance_sampling_ratio/max": 1.2656923532485962, "sampling/importance_sampling_ratio/mean": 0.605186939239502, "sampling/importance_sampling_ratio/min": 0.0007817232399247587, "sampling/sampling_logp_difference/max": 1.8059395551681519, "sampling/sampling_logp_difference/mean": 0.31986862421035767, "step": 1549, "step_time": 18.601480880010058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0777051746845245, "epoch": 0.00775, "grad_norm": 0.04723619297146797, "kl": 0.23230072855949402, "learning_rate": 7.999787874119093e-06, "loss": -0.1134, "step": 1550, "step_time": 7.873678138013929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6915482431650162, "epoch": 0.007755, "frac_reward_zero_std": 0.0, "grad_norm": 0.10905507951974869, "kl": 0.226417301222682, "learning_rate": 7.999787593810729e-06, "loss": -0.1042, "num_tokens": 20748188.0, "reward": 0.17745241522789001, "reward_std": 1.0763546228408813, "rewards/rollout_reward_func/mean": 0.17745241522789001, "rewards/rollout_reward_func/std": 1.0763546228408813, "sampling/importance_sampling_ratio/max": 1.1591874361038208, "sampling/importance_sampling_ratio/mean": 0.6983181238174438, "sampling/importance_sampling_ratio/min": 4.875967078987742e-07, "sampling/sampling_logp_difference/max": 2.0048305988311768, "sampling/sampling_logp_difference/mean": 0.3599054217338562, "step": 1551, "step_time": 15.993700627033832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6764667816460133, "epoch": 0.00776, "grad_norm": 0.0955715999007225, "kl": 0.22787103056907654, "learning_rate": 7.99978731331729e-06, "loss": -0.1046, "step": 1552, "step_time": 7.679445449975901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8475283458828926, "epoch": 0.007765, "frac_reward_zero_std": 0.0, "grad_norm": 0.024428386241197586, "kl": 0.4188525676727295, "learning_rate": 7.999787032638778e-06, "loss": -0.0308, "num_tokens": 20778155.0, "reward": 0.14642909169197083, "reward_std": 1.1101090908050537, "rewards/rollout_reward_func/mean": 0.14642909169197083, "rewards/rollout_reward_func/std": 1.1101090908050537, "sampling/importance_sampling_ratio/max": 1.0794165134429932, "sampling/importance_sampling_ratio/mean": 0.8698129653930664, "sampling/importance_sampling_ratio/min": 1.0776961971714627e-05, "sampling/sampling_logp_difference/max": 1.244591236114502, "sampling/sampling_logp_difference/mean": 0.1897207498550415, "step": 1553, "step_time": 12.70955436800432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8411749415099621, "epoch": 0.00777, "grad_norm": 0.018719831481575966, "kl": 0.4396424926817417, "learning_rate": 7.99978675177519e-06, "loss": -0.0307, "step": 1554, "step_time": 6.150874925995595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.659175157546997, "epoch": 0.007775, "frac_reward_zero_std": 0.0, "grad_norm": 0.1943524032831192, "kl": 0.43484293669462204, "learning_rate": 7.999786470726528e-06, "loss": -0.0222, "num_tokens": 20810726.0, "reward": -0.2607135772705078, "reward_std": 0.8295110464096069, "rewards/rollout_reward_func/mean": -0.2607135772705078, "rewards/rollout_reward_func/std": 0.8295111060142517, "sampling/importance_sampling_ratio/max": 1.368445634841919, "sampling/importance_sampling_ratio/mean": 0.8083757162094116, "sampling/importance_sampling_ratio/min": 9.894906543195248e-05, "sampling/sampling_logp_difference/max": 1.898808479309082, "sampling/sampling_logp_difference/mean": 0.3192141354084015, "step": 1555, "step_time": 15.926549256008002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6641005277633667, "epoch": 0.00778, "grad_norm": 0.22495870292186737, "kl": 0.41738901287317276, "learning_rate": 7.999786189492793e-06, "loss": -0.0226, "step": 1556, "step_time": 7.2030377839982975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9086478482931852, "epoch": 0.007785, "frac_reward_zero_std": 0.5, "grad_norm": 0.09679912030696869, "kl": 0.24441824108362198, "learning_rate": 7.999785908073983e-06, "loss": -0.0526, "num_tokens": 20835658.0, "reward": 1.1229037046432495, "reward_std": 0.719067394733429, "rewards/rollout_reward_func/mean": 1.1229037046432495, "rewards/rollout_reward_func/std": 0.719067394733429, "sampling/importance_sampling_ratio/max": 1.1977847814559937, "sampling/importance_sampling_ratio/mean": 0.9073226451873779, "sampling/importance_sampling_ratio/min": 1.2921566849399824e-05, "sampling/sampling_logp_difference/max": 1.9183239936828613, "sampling/sampling_logp_difference/mean": 0.18215355277061462, "step": 1557, "step_time": 12.561868232995039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9114984832704067, "epoch": 0.00779, "grad_norm": 0.10878516733646393, "kl": 0.24400174617767334, "learning_rate": 7.999785626470098e-06, "loss": -0.0525, "step": 1558, "step_time": 5.846529326998279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3801593855023384, "epoch": 0.007795, "frac_reward_zero_std": 0.5, "grad_norm": 0.13028527796268463, "kl": 0.20742139592766762, "learning_rate": 7.99978534468114e-06, "loss": -0.0211, "num_tokens": 20860800.0, "reward": 0.698205828666687, "reward_std": 1.3994969129562378, "rewards/rollout_reward_func/mean": 0.698205828666687, "rewards/rollout_reward_func/std": 1.3994970321655273, "sampling/importance_sampling_ratio/max": 1.1691721677780151, "sampling/importance_sampling_ratio/mean": 0.7733914852142334, "sampling/importance_sampling_ratio/min": 4.485051249503158e-05, "sampling/sampling_logp_difference/max": 1.5110912322998047, "sampling/sampling_logp_difference/mean": 0.2101261019706726, "step": 1559, "step_time": 15.146046520982054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.390872448682785, "epoch": 0.0078, "grad_norm": 0.15978337824344635, "kl": 0.2017417773604393, "learning_rate": 7.999785062707106e-06, "loss": -0.0216, "step": 1560, "step_time": 6.412847719999263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3987392783164978, "epoch": 0.007805, "frac_reward_zero_std": 0.0, "grad_norm": 0.07615537196397781, "kl": 0.2682795524597168, "learning_rate": 7.999784780547998e-06, "loss": -0.0164, "num_tokens": 20891578.0, "reward": 0.4308644235134125, "reward_std": 1.2442721128463745, "rewards/rollout_reward_func/mean": 0.4308644235134125, "rewards/rollout_reward_func/std": 1.2442721128463745, "sampling/importance_sampling_ratio/max": 1.080594539642334, "sampling/importance_sampling_ratio/mean": 0.9581669569015503, "sampling/importance_sampling_ratio/min": 0.0007692103972658515, "sampling/sampling_logp_difference/max": 1.201256275177002, "sampling/sampling_logp_difference/mean": 0.0859757736325264, "step": 1561, "step_time": 13.82792962199892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4015552792698145, "epoch": 0.00781, "grad_norm": 0.08820011466741562, "kl": 0.26762763038277626, "learning_rate": 7.999784498203818e-06, "loss": -0.0167, "step": 1562, "step_time": 6.91560837700672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 5.1666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0619717240333557, "epoch": 0.007815, "frac_reward_zero_std": 0.0, "grad_norm": 0.14575812220573425, "kl": 0.1949353888630867, "learning_rate": 7.999784215674564e-06, "loss": -0.1026, "num_tokens": 20921635.0, "reward": 0.521227240562439, "reward_std": 1.3805598020553589, "rewards/rollout_reward_func/mean": 0.521227240562439, "rewards/rollout_reward_func/std": 1.3805598020553589, "sampling/importance_sampling_ratio/max": 1.02372145652771, "sampling/importance_sampling_ratio/mean": 0.5608640313148499, "sampling/importance_sampling_ratio/min": 9.964812488760799e-05, "sampling/sampling_logp_difference/max": 2.112166404724121, "sampling/sampling_logp_difference/mean": 0.35877442359924316, "step": 1563, "step_time": 14.425666236973484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.06356992572546, "epoch": 0.00782, "grad_norm": 0.132533997297287, "kl": 0.19622017443180084, "learning_rate": 7.999783932960233e-06, "loss": -0.1027, "step": 1564, "step_time": 6.7692609039950185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 3.7857143878936768, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8395735509693623, "epoch": 0.007825, "frac_reward_zero_std": 0.5, "grad_norm": 0.30920377373695374, "kl": 0.31895508617162704, "learning_rate": 7.99978365006083e-06, "loss": -0.0328, "num_tokens": 20945401.0, "reward": 0.8269400596618652, "reward_std": 0.853466272354126, "rewards/rollout_reward_func/mean": 0.8269400596618652, "rewards/rollout_reward_func/std": 0.853466272354126, "sampling/importance_sampling_ratio/max": 1.079579472541809, "sampling/importance_sampling_ratio/mean": 0.8948307633399963, "sampling/importance_sampling_ratio/min": 0.0009292835020460188, "sampling/sampling_logp_difference/max": 1.512864112854004, "sampling/sampling_logp_difference/mean": 0.1390298455953598, "step": 1565, "step_time": 12.608899758008192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02443609107285738, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02443609107285738, "entropy": 0.8576296642422676, "epoch": 0.00783, "grad_norm": 0.1617961823940277, "kl": 0.3079238049685955, "learning_rate": 7.999783366976351e-06, "loss": -0.0345, "step": 1566, "step_time": 6.404134677010006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.2698619831353426, "epoch": 0.007835, "frac_reward_zero_std": 0.5, "grad_norm": 0.10650969296693802, "kl": 1.7639063373208046, "learning_rate": 7.9997830837068e-06, "loss": -0.0323, "num_tokens": 20960649.0, "reward": 1.8125, "reward_std": 0.75, "rewards/rollout_reward_func/mean": 1.8125, "rewards/rollout_reward_func/std": 0.75, "sampling/importance_sampling_ratio/max": 1.0189250707626343, "sampling/importance_sampling_ratio/mean": 0.9505162239074707, "sampling/importance_sampling_ratio/min": 0.0581066869199276, "sampling/sampling_logp_difference/max": 1.464607834815979, "sampling/sampling_logp_difference/mean": 0.04708271101117134, "step": 1567, "step_time": 5.931986275012605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27241974510252476, "epoch": 0.00784, "grad_norm": 0.08905132114887238, "kl": 1.5780533030629158, "learning_rate": 7.999782800252173e-06, "loss": -0.0328, "step": 1568, "step_time": 3.012574592998135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3740921467542648, "epoch": 0.007845, "frac_reward_zero_std": 0.0, "grad_norm": 0.22506147623062134, "kl": 0.2875054143369198, "learning_rate": 7.999782516612474e-06, "loss": -0.069, "num_tokens": 20995729.0, "reward": 0.9752438068389893, "reward_std": 1.0278362035751343, "rewards/rollout_reward_func/mean": 0.9752438068389893, "rewards/rollout_reward_func/std": 1.0278362035751343, "sampling/importance_sampling_ratio/max": 1.0526094436645508, "sampling/importance_sampling_ratio/mean": 0.672661542892456, "sampling/importance_sampling_ratio/min": 0.00017325177032034844, "sampling/sampling_logp_difference/max": 1.7976223230361938, "sampling/sampling_logp_difference/mean": 0.24322818219661713, "step": 1569, "step_time": 17.987454108006204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.4340142160654068, "epoch": 0.00785, "grad_norm": 0.23844462633132935, "kl": 0.30695799738168716, "learning_rate": 7.9997822327877e-06, "loss": -0.0712, "step": 1570, "step_time": 8.288106909996714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.3125, "completions/mean_terminated_length": 6.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.879183053970337, "epoch": 0.007855, "frac_reward_zero_std": 0.0, "grad_norm": 0.06361930817365646, "kl": 0.2847503125667572, "learning_rate": 7.999781948777851e-06, "loss": -0.0551, "num_tokens": 21027245.0, "reward": -0.0671965479850769, "reward_std": 1.1685854196548462, "rewards/rollout_reward_func/mean": -0.0671965479850769, "rewards/rollout_reward_func/std": 1.1685854196548462, "sampling/importance_sampling_ratio/max": 1.0889962911605835, "sampling/importance_sampling_ratio/mean": 0.279185950756073, "sampling/importance_sampling_ratio/min": 4.848354251407727e-07, "sampling/sampling_logp_difference/max": 2.249133586883545, "sampling/sampling_logp_difference/mean": 0.49814170598983765, "step": 1571, "step_time": 16.743656852006097 }, { "clip_ratio/high_max": 0.05356060713529587, "clip_ratio/high_mean": 0.026780303567647934, "clip_ratio/low_mean": 0.004999999888241291, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03178030252456665, "entropy": 2.9224551916122437, "epoch": 0.00786, "grad_norm": 0.03448035940527916, "kl": 0.2568521089851856, "learning_rate": 7.999781664582929e-06, "loss": -0.0554, "step": 1572, "step_time": 6.840892331005307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4660751074552536, "epoch": 0.007865, "frac_reward_zero_std": 0.5, "grad_norm": 0.07868831604719162, "kl": 1.2239835634827614, "learning_rate": 7.999781380202933e-06, "loss": -0.0453, "num_tokens": 21050211.0, "reward": 0.7595788240432739, "reward_std": 1.3852260112762451, "rewards/rollout_reward_func/mean": 0.7595788240432739, "rewards/rollout_reward_func/std": 1.3852260112762451, "sampling/importance_sampling_ratio/max": 1.0957518815994263, "sampling/importance_sampling_ratio/mean": 0.6621929407119751, "sampling/importance_sampling_ratio/min": 0.00010283336450811476, "sampling/sampling_logp_difference/max": 1.6226756572723389, "sampling/sampling_logp_difference/mean": 0.22671562433242798, "step": 1573, "step_time": 13.593997727002716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4808849841356277, "epoch": 0.00787, "grad_norm": 0.06252603232860565, "kl": 1.0530102290213108, "learning_rate": 7.999781095637862e-06, "loss": -0.0458, "step": 1574, "step_time": 5.608011185991927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.3125, "completions/mean_terminated_length": 7.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.5359650254249573, "epoch": 0.007875, "frac_reward_zero_std": 0.0, "grad_norm": 0.0792132094502449, "kl": 0.16747470200061798, "learning_rate": 7.999780810887718e-06, "loss": -0.0622, "num_tokens": 21079917.0, "reward": -0.3490871489048004, "reward_std": 1.1695473194122314, "rewards/rollout_reward_func/mean": -0.3490871489048004, "rewards/rollout_reward_func/std": 1.169547438621521, "sampling/importance_sampling_ratio/max": 1.0586638450622559, "sampling/importance_sampling_ratio/mean": 0.22209839522838593, "sampling/importance_sampling_ratio/min": 1.1591474731176277e-06, "sampling/sampling_logp_difference/max": 2.141313076019287, "sampling/sampling_logp_difference/mean": 0.5006207823753357, "step": 1575, "step_time": 16.867919252006686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5434099435806274, "epoch": 0.00788, "grad_norm": 0.08281552046537399, "kl": 0.16692256554961205, "learning_rate": 7.999780525952501e-06, "loss": -0.0624, "step": 1576, "step_time": 6.799072340014391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.625, "completions/mean_terminated_length": 6.000000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.162981331348419, "epoch": 0.007885, "frac_reward_zero_std": 0.0, "grad_norm": 0.038817014545202255, "kl": 0.14728958811610937, "learning_rate": 7.999780240832209e-06, "loss": -0.0723, "num_tokens": 21112304.0, "reward": -0.36809098720550537, "reward_std": 0.8846924304962158, "rewards/rollout_reward_func/mean": -0.36809098720550537, "rewards/rollout_reward_func/std": 0.8846924901008606, "sampling/importance_sampling_ratio/max": 1.0223853588104248, "sampling/importance_sampling_ratio/mean": 0.2448844313621521, "sampling/importance_sampling_ratio/min": 3.0388324034902325e-07, "sampling/sampling_logp_difference/max": 2.1092782020568848, "sampling/sampling_logp_difference/mean": 0.4674988389015198, "step": 1577, "step_time": 22.203379847996985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.180298864841461, "epoch": 0.00789, "grad_norm": 0.040786340832710266, "kl": 0.15196883399039507, "learning_rate": 7.999779955526843e-06, "loss": -0.0725, "step": 1578, "step_time": 8.080925648988341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 7.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.8886353373527527, "epoch": 0.007895, "frac_reward_zero_std": 0.0, "grad_norm": 0.060260288417339325, "kl": 0.08911823388189077, "learning_rate": 7.999779670036402e-06, "loss": -0.089, "num_tokens": 21140481.0, "reward": -0.23635417222976685, "reward_std": 1.1014055013656616, "rewards/rollout_reward_func/mean": -0.23635417222976685, "rewards/rollout_reward_func/std": 1.1014055013656616, "sampling/importance_sampling_ratio/max": 1.093285083770752, "sampling/importance_sampling_ratio/mean": 0.23176386952400208, "sampling/importance_sampling_ratio/min": 3.9986602473618404e-08, "sampling/sampling_logp_difference/max": 2.207996368408203, "sampling/sampling_logp_difference/mean": 0.583102822303772, "step": 1579, "step_time": 14.760581683003693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8908411860466003, "epoch": 0.0079, "grad_norm": 0.062361303716897964, "kl": 0.0876702107489109, "learning_rate": 7.99977938436089e-06, "loss": -0.0889, "step": 1580, "step_time": 7.113328339022701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2979727387428284, "epoch": 0.007905, "frac_reward_zero_std": 0.0, "grad_norm": 0.20711283385753632, "kl": 0.15882493183016777, "learning_rate": 7.999779098500301e-06, "loss": -0.0955, "num_tokens": 21171960.0, "reward": 0.30949532985687256, "reward_std": 1.2091379165649414, "rewards/rollout_reward_func/mean": 0.30949532985687256, "rewards/rollout_reward_func/std": 1.2091377973556519, "sampling/importance_sampling_ratio/max": 1.1239523887634277, "sampling/importance_sampling_ratio/mean": 0.5676710605621338, "sampling/importance_sampling_ratio/min": 4.3713600462069735e-05, "sampling/sampling_logp_difference/max": 1.5753637552261353, "sampling/sampling_logp_difference/mean": 0.3569669723510742, "step": 1581, "step_time": 18.073007427010452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.3170518577098846, "epoch": 0.00791, "grad_norm": 0.15240950882434845, "kl": 0.16529999021440744, "learning_rate": 7.99977881245464e-06, "loss": -0.0963, "step": 1582, "step_time": 7.041597311996156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 12.5625, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.077075719833374, "epoch": 0.007915, "frac_reward_zero_std": 0.0, "grad_norm": 0.04939902201294899, "kl": 0.06776386126875877, "learning_rate": 7.999778526223905e-06, "loss": -0.1025, "num_tokens": 21208629.0, "reward": -0.10198749601840973, "reward_std": 1.1589984893798828, "rewards/rollout_reward_func/mean": -0.10198749601840973, "rewards/rollout_reward_func/std": 1.1589986085891724, "sampling/importance_sampling_ratio/max": 1.0958162546157837, "sampling/importance_sampling_ratio/mean": 0.26654356718063354, "sampling/importance_sampling_ratio/min": 2.439118361508008e-06, "sampling/sampling_logp_difference/max": 1.6237174272537231, "sampling/sampling_logp_difference/mean": 0.43048757314682007, "step": 1583, "step_time": 19.536788644996705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.078476905822754, "epoch": 0.00792, "grad_norm": 0.0497070848941803, "kl": 0.06734395772218704, "learning_rate": 7.999778239808096e-06, "loss": -0.1025, "step": 1584, "step_time": 7.289267464992008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 10.6875, "completions/mean_terminated_length": 3.857142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.176213502883911, "epoch": 0.007925, "frac_reward_zero_std": 0.0, "grad_norm": 0.1680632084608078, "kl": 0.18221540190279484, "learning_rate": 7.999777953207212e-06, "loss": -0.102, "num_tokens": 21241963.0, "reward": 0.16054664552211761, "reward_std": 1.3283151388168335, "rewards/rollout_reward_func/mean": 0.16054664552211761, "rewards/rollout_reward_func/std": 1.3283151388168335, "sampling/importance_sampling_ratio/max": 1.0715113878250122, "sampling/importance_sampling_ratio/mean": 0.371511846780777, "sampling/importance_sampling_ratio/min": 4.996614734409377e-06, "sampling/sampling_logp_difference/max": 1.821528673171997, "sampling/sampling_logp_difference/mean": 0.46596139669418335, "step": 1585, "step_time": 18.672259260012652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.152269721031189, "epoch": 0.00793, "grad_norm": 0.15028710663318634, "kl": 0.1871190406382084, "learning_rate": 7.999777666421255e-06, "loss": -0.1027, "step": 1586, "step_time": 6.914530186011689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.2062012553215027, "epoch": 0.007935, "frac_reward_zero_std": 0.0, "grad_norm": 0.043253857642412186, "kl": 0.1349391844123602, "learning_rate": 7.999777379450226e-06, "loss": -0.0954, "num_tokens": 21273577.0, "reward": 0.02902570366859436, "reward_std": 1.1692270040512085, "rewards/rollout_reward_func/mean": 0.02902570366859436, "rewards/rollout_reward_func/std": 1.1692270040512085, "sampling/importance_sampling_ratio/max": 1.1533336639404297, "sampling/importance_sampling_ratio/mean": 0.4924774467945099, "sampling/importance_sampling_ratio/min": 8.609297275086192e-09, "sampling/sampling_logp_difference/max": 2.7972869873046875, "sampling/sampling_logp_difference/mean": 0.4439563751220703, "step": 1587, "step_time": 16.54983875202015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2002750635147095, "epoch": 0.00794, "grad_norm": 0.03482870012521744, "kl": 0.13440347369760275, "learning_rate": 7.99977709229412e-06, "loss": -0.0956, "step": 1588, "step_time": 6.754199228991638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 6.500000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.936171442270279, "epoch": 0.007945, "frac_reward_zero_std": 0.0, "grad_norm": 0.21466735005378723, "kl": 0.4063011221587658, "learning_rate": 7.999776804952944e-06, "loss": -0.0684, "num_tokens": 21297590.0, "reward": 0.8888648748397827, "reward_std": 1.361609935760498, "rewards/rollout_reward_func/mean": 0.8888648748397827, "rewards/rollout_reward_func/std": 1.3616100549697876, "sampling/importance_sampling_ratio/max": 1.03938889503479, "sampling/importance_sampling_ratio/mean": 0.5157889723777771, "sampling/importance_sampling_ratio/min": 3.586232422136959e-10, "sampling/sampling_logp_difference/max": 2.1676647663116455, "sampling/sampling_logp_difference/mean": 0.3387192189693451, "step": 1589, "step_time": 16.86702953997883 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.9203754365444183, "epoch": 0.00795, "grad_norm": 0.12934383749961853, "kl": 0.4056772254407406, "learning_rate": 7.999776517426691e-06, "loss": -0.0692, "step": 1590, "step_time": 6.920922449993668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 14.1875, "completions/mean_terminated_length": 6.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.594175338745117, "epoch": 0.007955, "frac_reward_zero_std": 0.0, "grad_norm": 0.04881015792489052, "kl": 0.08381632063537836, "learning_rate": 7.999776229715368e-06, "loss": -0.0545, "num_tokens": 21336016.0, "reward": -0.3571445941925049, "reward_std": 0.8363173007965088, "rewards/rollout_reward_func/mean": -0.3571445941925049, "rewards/rollout_reward_func/std": 0.8363173604011536, "sampling/importance_sampling_ratio/max": 1.14541757106781, "sampling/importance_sampling_ratio/mean": 0.14493732154369354, "sampling/importance_sampling_ratio/min": 3.3470491871412378e-06, "sampling/sampling_logp_difference/max": 1.8965911865234375, "sampling/sampling_logp_difference/mean": 0.4886091947555542, "step": 1591, "step_time": 22.470740033008042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5861995816230774, "epoch": 0.00796, "grad_norm": 0.044909559190273285, "kl": 0.08955011796206236, "learning_rate": 7.999775941818969e-06, "loss": -0.0547, "step": 1592, "step_time": 8.140150113016716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.49671114422380924, "epoch": 0.007965, "frac_reward_zero_std": 0.5, "grad_norm": 0.008870634250342846, "kl": 0.2641602624207735, "learning_rate": 7.999775653737495e-06, "loss": -0.0388, "num_tokens": 21353312.0, "reward": 1.7696508169174194, "reward_std": 0.901185929775238, "rewards/rollout_reward_func/mean": 1.7696508169174194, "rewards/rollout_reward_func/std": 0.901185929775238, "sampling/importance_sampling_ratio/max": 1.0479581356048584, "sampling/importance_sampling_ratio/mean": 0.9612598419189453, "sampling/importance_sampling_ratio/min": 0.00010145548731088638, "sampling/sampling_logp_difference/max": 1.2424325942993164, "sampling/sampling_logp_difference/mean": 0.10969395935535431, "step": 1593, "step_time": 7.977780258996063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49395848624408245, "epoch": 0.00797, "grad_norm": 0.00909232348203659, "kl": 0.26429057493805885, "learning_rate": 7.99977536547095e-06, "loss": -0.0389, "step": 1594, "step_time": 4.107446800000616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.625, "completions/mean_terminated_length": 5.800000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.587348133325577, "epoch": 0.007975, "frac_reward_zero_std": 0.0, "grad_norm": 0.021027443930506706, "kl": 0.19891858845949173, "learning_rate": 7.99977507701933e-06, "loss": -0.0959, "num_tokens": 21383136.0, "reward": 0.13874861598014832, "reward_std": 1.2874877452850342, "rewards/rollout_reward_func/mean": 0.13874861598014832, "rewards/rollout_reward_func/std": 1.2874878644943237, "sampling/importance_sampling_ratio/max": 1.1028248071670532, "sampling/importance_sampling_ratio/mean": 0.4581022560596466, "sampling/importance_sampling_ratio/min": 6.6493471422290895e-06, "sampling/sampling_logp_difference/max": 2.2145142555236816, "sampling/sampling_logp_difference/mean": 0.4396762251853943, "step": 1595, "step_time": 16.355146372996387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5805968940258026, "epoch": 0.00798, "grad_norm": 0.019978327676653862, "kl": 0.1977777313441038, "learning_rate": 7.999774788382636e-06, "loss": -0.0959, "step": 1596, "step_time": 6.344349167018663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.1875, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.135560393333435, "epoch": 0.007985, "frac_reward_zero_std": 0.0, "grad_norm": 0.019723284989595413, "kl": 0.2375908251851797, "learning_rate": 7.999774499560869e-06, "loss": -0.0803, "num_tokens": 21420685.0, "reward": -0.22330054640769958, "reward_std": 1.0993257761001587, "rewards/rollout_reward_func/mean": -0.22330054640769958, "rewards/rollout_reward_func/std": 1.0993257761001587, "sampling/importance_sampling_ratio/max": 1.1248382329940796, "sampling/importance_sampling_ratio/mean": 0.2744683027267456, "sampling/importance_sampling_ratio/min": 1.0400570317870006e-05, "sampling/sampling_logp_difference/max": 2.8943428993225098, "sampling/sampling_logp_difference/mean": 0.4761750102043152, "step": 1597, "step_time": 22.02223512000637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1278218626976013, "epoch": 0.00799, "grad_norm": 0.023366812616586685, "kl": 0.24208672903478146, "learning_rate": 7.99977421055403e-06, "loss": -0.0804, "step": 1598, "step_time": 8.009735354004079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 4.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2080807983875275, "epoch": 0.007995, "frac_reward_zero_std": 0.0, "grad_norm": 0.27922993898391724, "kl": 0.1759023740887642, "learning_rate": 7.999773921362115e-06, "loss": -0.0831, "num_tokens": 21455469.0, "reward": -0.046739257872104645, "reward_std": 0.9138731360435486, "rewards/rollout_reward_func/mean": -0.046739257872104645, "rewards/rollout_reward_func/std": 0.9138731360435486, "sampling/importance_sampling_ratio/max": 1.1793779134750366, "sampling/importance_sampling_ratio/mean": 0.6294651031494141, "sampling/importance_sampling_ratio/min": 2.2122096652310574e-06, "sampling/sampling_logp_difference/max": 1.7641651630401611, "sampling/sampling_logp_difference/mean": 0.34581494331359863, "step": 1599, "step_time": 19.17906091698387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1828977167606354, "epoch": 0.008, "grad_norm": 0.16908983886241913, "kl": 0.1779353730380535, "learning_rate": 7.999773631985126e-06, "loss": -0.0845, "step": 1600, "step_time": 8.481588106995332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4461525678634644, "epoch": 0.008005, "frac_reward_zero_std": 0.0, "grad_norm": 0.11425058543682098, "kl": 0.5700467228889465, "learning_rate": 7.999773342423066e-06, "loss": -0.0953, "num_tokens": 21488761.0, "reward": 0.04147586226463318, "reward_std": 1.2365938425064087, "rewards/rollout_reward_func/mean": 0.04147586226463318, "rewards/rollout_reward_func/std": 1.2365938425064087, "sampling/importance_sampling_ratio/max": 1.1893577575683594, "sampling/importance_sampling_ratio/mean": 0.46459826827049255, "sampling/importance_sampling_ratio/min": 1.9826573407044634e-05, "sampling/sampling_logp_difference/max": 1.6376659870147705, "sampling/sampling_logp_difference/mean": 0.40805917978286743, "step": 1601, "step_time": 16.729472022969276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 2.4302148520946503, "epoch": 0.00801, "grad_norm": 0.13222342729568481, "kl": 0.6287639047950506, "learning_rate": 7.999773052675932e-06, "loss": -0.0951, "step": 1602, "step_time": 6.836618835004629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0492912158370018, "epoch": 0.008015, "frac_reward_zero_std": 0.5, "grad_norm": 0.19876374304294586, "kl": 0.19190490245819092, "learning_rate": 7.999772762743724e-06, "loss": -0.0367, "num_tokens": 21514154.0, "reward": 1.2961947917938232, "reward_std": 1.2049784660339355, "rewards/rollout_reward_func/mean": 1.2961947917938232, "rewards/rollout_reward_func/std": 1.204978585243225, "sampling/importance_sampling_ratio/max": 1.2933958768844604, "sampling/importance_sampling_ratio/mean": 0.8486182689666748, "sampling/importance_sampling_ratio/min": 0.0009841622086241841, "sampling/sampling_logp_difference/max": 1.3367786407470703, "sampling/sampling_logp_difference/mean": 0.14334280788898468, "step": 1603, "step_time": 16.328628166011185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0491808336228132, "epoch": 0.00802, "grad_norm": 0.21197108924388885, "kl": 0.1902911476790905, "learning_rate": 7.999772472626442e-06, "loss": -0.0374, "step": 1604, "step_time": 6.5666032149893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.8294119238853455, "epoch": 0.008025, "frac_reward_zero_std": 0.0, "grad_norm": 0.03854474052786827, "kl": 0.21194534935057163, "learning_rate": 7.999772182324089e-06, "loss": -0.1085, "num_tokens": 21546138.0, "reward": 0.09655806422233582, "reward_std": 1.223554253578186, "rewards/rollout_reward_func/mean": 0.09655806422233582, "rewards/rollout_reward_func/std": 1.223554253578186, "sampling/importance_sampling_ratio/max": 1.1497501134872437, "sampling/importance_sampling_ratio/mean": 0.4223344326019287, "sampling/importance_sampling_ratio/min": 2.319760369573487e-06, "sampling/sampling_logp_difference/max": 1.952052116394043, "sampling/sampling_logp_difference/mean": 0.4244481325149536, "step": 1605, "step_time": 18.531873753017862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8210688531398773, "epoch": 0.00803, "grad_norm": 0.03362691029906273, "kl": 0.20767594501376152, "learning_rate": 7.999771891836658e-06, "loss": -0.1087, "step": 1606, "step_time": 7.379598952000379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.3644561171531677, "epoch": 0.008035, "frac_reward_zero_std": 0.0, "grad_norm": 0.03240678459405899, "kl": 0.09498476423323154, "learning_rate": 7.999771601164157e-06, "loss": -0.1065, "num_tokens": 21576599.0, "reward": -0.020524650812149048, "reward_std": 1.1403522491455078, "rewards/rollout_reward_func/mean": -0.020524650812149048, "rewards/rollout_reward_func/std": 1.1403522491455078, "sampling/importance_sampling_ratio/max": 1.1024000644683838, "sampling/importance_sampling_ratio/mean": 0.3934398293495178, "sampling/importance_sampling_ratio/min": 2.3266073867489467e-07, "sampling/sampling_logp_difference/max": 2.232171058654785, "sampling/sampling_logp_difference/mean": 0.5127789974212646, "step": 1607, "step_time": 16.329936324982555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.366901695728302, "epoch": 0.00804, "grad_norm": 0.02719111554324627, "kl": 0.09586097113788128, "learning_rate": 7.999771310306581e-06, "loss": -0.1066, "step": 1608, "step_time": 6.924908823988517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.744968831539154, "epoch": 0.008045, "frac_reward_zero_std": 0.5, "grad_norm": 0.3840758800506592, "kl": 0.17944474332034588, "learning_rate": 7.999771019263933e-06, "loss": -0.0387, "num_tokens": 21598529.0, "reward": 0.8628592491149902, "reward_std": 1.3656952381134033, "rewards/rollout_reward_func/mean": 0.8628592491149902, "rewards/rollout_reward_func/std": 1.3656952381134033, "sampling/importance_sampling_ratio/max": 1.2603232860565186, "sampling/importance_sampling_ratio/mean": 0.7068650722503662, "sampling/importance_sampling_ratio/min": 1.677394578791791e-07, "sampling/sampling_logp_difference/max": 1.998060941696167, "sampling/sampling_logp_difference/mean": 0.2541007399559021, "step": 1609, "step_time": 14.154552862019045 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.7380160987377167, "epoch": 0.00805, "grad_norm": 0.0799577608704567, "kl": 0.1797071285545826, "learning_rate": 7.999770728036211e-06, "loss": -0.0402, "step": 1610, "step_time": 6.349067838003975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7008898258209229, "epoch": 0.008055, "frac_reward_zero_std": 0.0, "grad_norm": 0.018633225932717323, "kl": 0.19666289165616035, "learning_rate": 7.999770436623416e-06, "loss": -0.1119, "num_tokens": 21631044.0, "reward": 0.5994333028793335, "reward_std": 1.2762397527694702, "rewards/rollout_reward_func/mean": 0.5994333028793335, "rewards/rollout_reward_func/std": 1.2762397527694702, "sampling/importance_sampling_ratio/max": 1.0525166988372803, "sampling/importance_sampling_ratio/mean": 0.5787733197212219, "sampling/importance_sampling_ratio/min": 2.1900397769059055e-05, "sampling/sampling_logp_difference/max": 1.7005248069763184, "sampling/sampling_logp_difference/mean": 0.3465961813926697, "step": 1611, "step_time": 16.60685170200304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6980657577514648, "epoch": 0.00806, "grad_norm": 0.018159305676817894, "kl": 0.19692472368478775, "learning_rate": 7.999770145025548e-06, "loss": -0.1119, "step": 1612, "step_time": 7.051278737999382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9696959555149078, "epoch": 0.008065, "frac_reward_zero_std": 0.0, "grad_norm": 0.022820869460701942, "kl": 0.24395274929702282, "learning_rate": 7.999769853242606e-06, "loss": -0.0953, "num_tokens": 21656334.0, "reward": 1.0375992059707642, "reward_std": 1.1649068593978882, "rewards/rollout_reward_func/mean": 1.0375992059707642, "rewards/rollout_reward_func/std": 1.1649068593978882, "sampling/importance_sampling_ratio/max": 1.0345584154129028, "sampling/importance_sampling_ratio/mean": 0.6397317051887512, "sampling/importance_sampling_ratio/min": 6.873881375213386e-07, "sampling/sampling_logp_difference/max": 1.9140342473983765, "sampling/sampling_logp_difference/mean": 0.3436806797981262, "step": 1613, "step_time": 17.446300515002804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.966919481754303, "epoch": 0.00807, "grad_norm": 0.02468239888548851, "kl": 0.2596589922904968, "learning_rate": 7.999769561274591e-06, "loss": -0.0952, "step": 1614, "step_time": 6.384718967005028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8659653486683965, "epoch": 0.008075, "frac_reward_zero_std": 0.0, "grad_norm": 0.005051479209214449, "kl": 0.27170615270733833, "learning_rate": 7.999769269121503e-06, "loss": -0.0533, "num_tokens": 21684181.0, "reward": 1.1066181659698486, "reward_std": 1.146313190460205, "rewards/rollout_reward_func/mean": 1.1066181659698486, "rewards/rollout_reward_func/std": 1.1463133096694946, "sampling/importance_sampling_ratio/max": 1.0425046682357788, "sampling/importance_sampling_ratio/mean": 0.8259398937225342, "sampling/importance_sampling_ratio/min": 2.635063083289424e-06, "sampling/sampling_logp_difference/max": 1.83029305934906, "sampling/sampling_logp_difference/mean": 0.17617098987102509, "step": 1615, "step_time": 13.237176140988595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8676251340657473, "epoch": 0.00808, "grad_norm": 0.004758546128869057, "kl": 0.27182554453611374, "learning_rate": 7.999768976783341e-06, "loss": -0.0533, "step": 1616, "step_time": 5.960443537973333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.46266717556864023, "epoch": 0.008085, "frac_reward_zero_std": 0.5, "grad_norm": 0.03716682642698288, "kl": 0.3995044231414795, "learning_rate": 7.999768684260106e-06, "loss": -0.0371, "num_tokens": 21700880.0, "reward": 1.943134069442749, "reward_std": 0.05355268716812134, "rewards/rollout_reward_func/mean": 1.943134069442749, "rewards/rollout_reward_func/std": 0.053552668541669846, "sampling/importance_sampling_ratio/max": 1.0253351926803589, "sampling/importance_sampling_ratio/mean": 0.9509857296943665, "sampling/importance_sampling_ratio/min": 2.8126620236434974e-05, "sampling/sampling_logp_difference/max": 1.7442913055419922, "sampling/sampling_logp_difference/mean": 0.12229336053133011, "step": 1617, "step_time": 6.089658282013261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45836158096790314, "epoch": 0.00809, "grad_norm": 0.034652676433324814, "kl": 0.3856920264661312, "learning_rate": 7.9997683915518e-06, "loss": -0.0372, "step": 1618, "step_time": 4.0221905799844535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2521479241549969, "epoch": 0.008095, "frac_reward_zero_std": 0.0, "grad_norm": 0.05709109455347061, "kl": 0.9052055887877941, "learning_rate": 7.999768098658418e-06, "loss": -0.0706, "num_tokens": 21735011.0, "reward": 0.8159009218215942, "reward_std": 1.156794548034668, "rewards/rollout_reward_func/mean": 0.8159009218215942, "rewards/rollout_reward_func/std": 1.156794548034668, "sampling/importance_sampling_ratio/max": 1.0750727653503418, "sampling/importance_sampling_ratio/mean": 0.7743077278137207, "sampling/importance_sampling_ratio/min": 0.00010018985631177202, "sampling/sampling_logp_difference/max": 1.8906736373901367, "sampling/sampling_logp_difference/mean": 0.2143821269273758, "step": 1619, "step_time": 16.090063426003326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2549276212230325, "epoch": 0.0081, "grad_norm": 0.06443613767623901, "kl": 0.9981081895530224, "learning_rate": 7.999767805579964e-06, "loss": -0.0704, "step": 1620, "step_time": 7.235527638011263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7775896498933434, "epoch": 0.008105, "frac_reward_zero_std": 0.5, "grad_norm": 0.004909635055810213, "kl": 0.27967892587184906, "learning_rate": 7.999767512316437e-06, "loss": -0.055, "num_tokens": 21760989.0, "reward": 1.435219168663025, "reward_std": 1.0427169799804688, "rewards/rollout_reward_func/mean": 1.435219168663025, "rewards/rollout_reward_func/std": 1.0427169799804688, "sampling/importance_sampling_ratio/max": 1.0208394527435303, "sampling/importance_sampling_ratio/mean": 0.8238295912742615, "sampling/importance_sampling_ratio/min": 0.0001408690877724439, "sampling/sampling_logp_difference/max": 1.340822458267212, "sampling/sampling_logp_difference/mean": 0.15062543749809265, "step": 1621, "step_time": 15.893224427985842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7742447722703218, "epoch": 0.00811, "grad_norm": 0.00493212416768074, "kl": 0.27923253551125526, "learning_rate": 7.999767218867836e-06, "loss": -0.055, "step": 1622, "step_time": 6.490275499003474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 5.357142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.105333475396037, "epoch": 0.008115, "frac_reward_zero_std": 0.5, "grad_norm": 0.03943439573049545, "kl": 0.22636543959379196, "learning_rate": 7.999766925234162e-06, "loss": -0.0426, "num_tokens": 21782712.0, "reward": 0.9704661965370178, "reward_std": 1.37095046043396, "rewards/rollout_reward_func/mean": 0.9704661965370178, "rewards/rollout_reward_func/std": 1.37095046043396, "sampling/importance_sampling_ratio/max": 1.029549479484558, "sampling/importance_sampling_ratio/mean": 0.7133764028549194, "sampling/importance_sampling_ratio/min": 0.0004150408203713596, "sampling/sampling_logp_difference/max": 1.2854909896850586, "sampling/sampling_logp_difference/mean": 0.1621776670217514, "step": 1623, "step_time": 13.384984975986299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1110013015568256, "epoch": 0.00812, "grad_norm": 0.03839844465255737, "kl": 0.22551218792796135, "learning_rate": 7.999766631415414e-06, "loss": -0.0426, "step": 1624, "step_time": 5.516323510993971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3933852352201939, "epoch": 0.008125, "frac_reward_zero_std": 0.5, "grad_norm": 0.07192959636449814, "kl": 0.17183082178235054, "learning_rate": 7.999766337411595e-06, "loss": -0.0437, "num_tokens": 21807863.0, "reward": 0.6338478326797485, "reward_std": 1.4330921173095703, "rewards/rollout_reward_func/mean": 0.6338478326797485, "rewards/rollout_reward_func/std": 1.4330922365188599, "sampling/importance_sampling_ratio/max": 1.0352485179901123, "sampling/importance_sampling_ratio/mean": 0.7631621360778809, "sampling/importance_sampling_ratio/min": 5.654127903653716e-07, "sampling/sampling_logp_difference/max": 1.9072802066802979, "sampling/sampling_logp_difference/mean": 0.24541394412517548, "step": 1625, "step_time": 16.21922312999959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.395240263082087, "epoch": 0.00813, "grad_norm": 0.05614585801959038, "kl": 0.17119577527046204, "learning_rate": 7.999766043222702e-06, "loss": -0.0438, "step": 1626, "step_time": 6.900073367985897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1172182522714138, "epoch": 0.008135, "frac_reward_zero_std": 0.5, "grad_norm": 0.039844874292612076, "kl": 0.21514885872602463, "learning_rate": 7.999765748848737e-06, "loss": -0.0407, "num_tokens": 21834693.0, "reward": 0.8939965963363647, "reward_std": 1.2802002429962158, "rewards/rollout_reward_func/mean": 0.8939965963363647, "rewards/rollout_reward_func/std": 1.2802002429962158, "sampling/importance_sampling_ratio/max": 1.111416220664978, "sampling/importance_sampling_ratio/mean": 0.7673920392990112, "sampling/importance_sampling_ratio/min": 4.510095823206939e-05, "sampling/sampling_logp_difference/max": 1.296983242034912, "sampling/sampling_logp_difference/mean": 0.1797294318675995, "step": 1627, "step_time": 16.852469275007024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1187851782888174, "epoch": 0.00814, "grad_norm": 0.03594309464097023, "kl": 0.21529943868517876, "learning_rate": 7.999765454289697e-06, "loss": -0.0408, "step": 1628, "step_time": 7.246001034989604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.05391300469636917, "epoch": 0.008145, "frac_reward_zero_std": 1.0, "grad_norm": 0.00025286179152317345, "kl": 0.24655001610517502, "learning_rate": 7.999765159545584e-06, "loss": 0.0006, "num_tokens": 21851770.0, "reward": 1.949798583984375, "reward_std": 0.051540616899728775, "rewards/rollout_reward_func/mean": 1.949798583984375, "rewards/rollout_reward_func/std": 0.051540616899728775, "sampling/importance_sampling_ratio/max": 1.0161052942276, "sampling/importance_sampling_ratio/mean": 1.0078191757202148, "sampling/importance_sampling_ratio/min": 0.998998761177063, "sampling/sampling_logp_difference/max": 0.008092924021184444, "sampling/sampling_logp_difference/mean": 0.0022807735949754715, "step": 1629, "step_time": 5.914775689976523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05431906972080469, "epoch": 0.00815, "grad_norm": 0.0002545515599194914, "kl": 0.24649795144796371, "learning_rate": 7.999764864616402e-06, "loss": 0.0006, "step": 1630, "step_time": 3.7486927029822255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3462414033710957, "epoch": 0.008155, "frac_reward_zero_std": 0.0, "grad_norm": 0.00723963463678956, "kl": 0.2557493858039379, "learning_rate": 7.999764569502142e-06, "loss": -0.0938, "num_tokens": 21884806.0, "reward": 1.1200624704360962, "reward_std": 1.1893339157104492, "rewards/rollout_reward_func/mean": 1.1200624704360962, "rewards/rollout_reward_func/std": 1.1893339157104492, "sampling/importance_sampling_ratio/max": 1.0542808771133423, "sampling/importance_sampling_ratio/mean": 0.7629671692848206, "sampling/importance_sampling_ratio/min": 1.2916607374791056e-06, "sampling/sampling_logp_difference/max": 2.1826529502868652, "sampling/sampling_logp_difference/mean": 0.32283729314804077, "step": 1631, "step_time": 15.618008803983685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3448579898104072, "epoch": 0.00816, "grad_norm": 0.006954774260520935, "kl": 0.25482644885778427, "learning_rate": 7.999764274202813e-06, "loss": -0.0938, "step": 1632, "step_time": 6.571778560988605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.5625, "completions/mean_terminated_length": 4.5625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.41333954501897097, "epoch": 0.008165, "frac_reward_zero_std": 0.5, "grad_norm": 0.03714615851640701, "kl": 0.3867219053208828, "learning_rate": 7.999763978718408e-06, "loss": -0.0374, "num_tokens": 21902059.0, "reward": 0.6059711575508118, "reward_std": 1.4405570030212402, "rewards/rollout_reward_func/mean": 0.6059711575508118, "rewards/rollout_reward_func/std": 1.4405570030212402, "sampling/importance_sampling_ratio/max": 1.0149056911468506, "sampling/importance_sampling_ratio/mean": 0.9469196796417236, "sampling/importance_sampling_ratio/min": 0.0007346296333707869, "sampling/sampling_logp_difference/max": 1.7003005743026733, "sampling/sampling_logp_difference/mean": 0.09018759429454803, "step": 1633, "step_time": 6.132058244009386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41223818250000477, "epoch": 0.00817, "grad_norm": 0.03192909434437752, "kl": 0.3744158148765564, "learning_rate": 7.999763683048932e-06, "loss": -0.0375, "step": 1634, "step_time": 3.284974703012267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.226733237504959, "epoch": 0.008175, "frac_reward_zero_std": 0.0, "grad_norm": 0.015307529829442501, "kl": 0.24868682399392128, "learning_rate": 7.999763387194383e-06, "loss": -0.0731, "num_tokens": 21926791.0, "reward": 0.6100808382034302, "reward_std": 1.4158498048782349, "rewards/rollout_reward_func/mean": 0.6100808382034302, "rewards/rollout_reward_func/std": 1.4158498048782349, "sampling/importance_sampling_ratio/max": 1.018608808517456, "sampling/importance_sampling_ratio/mean": 0.5674304962158203, "sampling/importance_sampling_ratio/min": 1.3218157619121484e-05, "sampling/sampling_logp_difference/max": 1.9661117792129517, "sampling/sampling_logp_difference/mean": 0.38171064853668213, "step": 1635, "step_time": 17.97049321900704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2220026552677155, "epoch": 0.00818, "grad_norm": 0.01797456108033657, "kl": 0.26465753465890884, "learning_rate": 7.99976309115476e-06, "loss": -0.073, "step": 1636, "step_time": 7.190087650989881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 5.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7308077216148376, "epoch": 0.008185, "frac_reward_zero_std": 0.5, "grad_norm": 0.07918722182512283, "kl": 0.3435177616775036, "learning_rate": 7.999762794930064e-06, "loss": -0.0315, "num_tokens": 21952303.0, "reward": 0.779430091381073, "reward_std": 1.392167568206787, "rewards/rollout_reward_func/mean": 0.779430091381073, "rewards/rollout_reward_func/std": 1.392167568206787, "sampling/importance_sampling_ratio/max": 1.0297056436538696, "sampling/importance_sampling_ratio/mean": 0.6508415341377258, "sampling/importance_sampling_ratio/min": 5.7433950928498234e-08, "sampling/sampling_logp_difference/max": 2.266118049621582, "sampling/sampling_logp_difference/mean": 0.32260724902153015, "step": 1637, "step_time": 15.452955772998394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7363948822021484, "epoch": 0.00819, "grad_norm": 0.08144021779298782, "kl": 0.34380966797471046, "learning_rate": 7.999762498520296e-06, "loss": -0.0317, "step": 1638, "step_time": 6.451840259003802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2290335074067116, "epoch": 0.008195, "frac_reward_zero_std": 0.0, "grad_norm": 0.04638133943080902, "kl": 0.3877050429582596, "learning_rate": 7.999762201925453e-06, "loss": -0.0849, "num_tokens": 21983054.0, "reward": 0.4785510301589966, "reward_std": 1.1245336532592773, "rewards/rollout_reward_func/mean": 0.4785510301589966, "rewards/rollout_reward_func/std": 1.124533772468567, "sampling/importance_sampling_ratio/max": 1.1109322309494019, "sampling/importance_sampling_ratio/mean": 0.7699686288833618, "sampling/importance_sampling_ratio/min": 5.193157903704559e-07, "sampling/sampling_logp_difference/max": 1.8179357051849365, "sampling/sampling_logp_difference/mean": 0.27210068702697754, "step": 1639, "step_time": 14.488024968988611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2306081131100655, "epoch": 0.0082, "grad_norm": 0.0477139838039875, "kl": 0.387943584471941, "learning_rate": 7.99976190514554e-06, "loss": -0.085, "step": 1640, "step_time": 6.891387040013797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5845244526863098, "epoch": 0.008205, "frac_reward_zero_std": 0.0, "grad_norm": 0.08779849857091904, "kl": 1.0985899474471807, "learning_rate": 7.999761608180555e-06, "loss": -0.0972, "num_tokens": 22013571.0, "reward": 0.8986698389053345, "reward_std": 1.1772806644439697, "rewards/rollout_reward_func/mean": 0.8986698389053345, "rewards/rollout_reward_func/std": 1.1772806644439697, "sampling/importance_sampling_ratio/max": 1.0678962469100952, "sampling/importance_sampling_ratio/mean": 0.7086577415466309, "sampling/importance_sampling_ratio/min": 2.195528878701225e-07, "sampling/sampling_logp_difference/max": 1.728081464767456, "sampling/sampling_logp_difference/mean": 0.31107795238494873, "step": 1641, "step_time": 14.222819893984706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5881158858537674, "epoch": 0.00821, "grad_norm": 0.05413191020488739, "kl": 0.86152358725667, "learning_rate": 7.999761311030496e-06, "loss": -0.0977, "step": 1642, "step_time": 6.2355291830026545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6759077310562134, "epoch": 0.008215, "frac_reward_zero_std": 0.0, "grad_norm": 0.47895628213882446, "kl": 0.16163743287324905, "learning_rate": 7.999761013695363e-06, "loss": 0.0212, "num_tokens": 22040185.0, "reward": -0.8100002408027649, "reward_std": 0.18038859963417053, "rewards/rollout_reward_func/mean": -0.8100002408027649, "rewards/rollout_reward_func/std": 0.18038862943649292, "sampling/importance_sampling_ratio/max": 1.1603800058364868, "sampling/importance_sampling_ratio/mean": 0.631266713142395, "sampling/importance_sampling_ratio/min": 0.0004195715009700507, "sampling/sampling_logp_difference/max": 1.3760074377059937, "sampling/sampling_logp_difference/mean": 0.1990257203578949, "step": 1643, "step_time": 18.088240303011844 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 1.6681804955005646, "epoch": 0.00822, "grad_norm": 0.1170927956700325, "kl": 0.16284693032503128, "learning_rate": 7.999760716175157e-06, "loss": 0.0191, "step": 1644, "step_time": 6.245145052031148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 6.090909481048584, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4322324097156525, "epoch": 0.008225, "frac_reward_zero_std": 0.0, "grad_norm": 0.011586462147533894, "kl": 0.1754603534936905, "learning_rate": 7.999760418469882e-06, "loss": -0.1052, "num_tokens": 22071170.0, "reward": 0.3560815453529358, "reward_std": 1.3308604955673218, "rewards/rollout_reward_func/mean": 0.3560815453529358, "rewards/rollout_reward_func/std": 1.3308604955673218, "sampling/importance_sampling_ratio/max": 1.04707670211792, "sampling/importance_sampling_ratio/mean": 0.5091925859451294, "sampling/importance_sampling_ratio/min": 3.232833876154473e-07, "sampling/sampling_logp_difference/max": 1.9946510791778564, "sampling/sampling_logp_difference/mean": 0.4393380880355835, "step": 1645, "step_time": 15.98097379601677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.432566910982132, "epoch": 0.00823, "grad_norm": 0.011670242063701153, "kl": 0.1750788800418377, "learning_rate": 7.999760120579531e-06, "loss": -0.1051, "step": 1646, "step_time": 6.8403736149921315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8925199750810862, "epoch": 0.008235, "frac_reward_zero_std": 0.5, "grad_norm": 0.04162968695163727, "kl": 0.2494817152619362, "learning_rate": 7.999759822504108e-06, "loss": -0.0483, "num_tokens": 22096649.0, "reward": 1.5628104209899902, "reward_std": 0.8564016819000244, "rewards/rollout_reward_func/mean": 1.5628104209899902, "rewards/rollout_reward_func/std": 0.856401801109314, "sampling/importance_sampling_ratio/max": 1.02493155002594, "sampling/importance_sampling_ratio/mean": 0.8672484159469604, "sampling/importance_sampling_ratio/min": 0.00018576462753117085, "sampling/sampling_logp_difference/max": 1.6549227237701416, "sampling/sampling_logp_difference/mean": 0.16063201427459717, "step": 1647, "step_time": 14.024685821015737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8933084383606911, "epoch": 0.00824, "grad_norm": 0.037436798214912415, "kl": 0.2490626871585846, "learning_rate": 7.999759524243613e-06, "loss": -0.0483, "step": 1648, "step_time": 6.047821008003666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7756660357117653, "epoch": 0.008245, "frac_reward_zero_std": 0.0, "grad_norm": 0.05510011315345764, "kl": 0.27975696697831154, "learning_rate": 7.999759225798045e-06, "loss": -0.0986, "num_tokens": 22129559.0, "reward": 0.6425880193710327, "reward_std": 1.2446632385253906, "rewards/rollout_reward_func/mean": 0.6425880193710327, "rewards/rollout_reward_func/std": 1.2446632385253906, "sampling/importance_sampling_ratio/max": 1.0840235948562622, "sampling/importance_sampling_ratio/mean": 0.6342962980270386, "sampling/importance_sampling_ratio/min": 2.7304498871671967e-05, "sampling/sampling_logp_difference/max": 1.9055308103561401, "sampling/sampling_logp_difference/mean": 0.30256322026252747, "step": 1649, "step_time": 16.439642598008504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7795493453741074, "epoch": 0.00825, "grad_norm": 0.055364083498716354, "kl": 0.27909842133522034, "learning_rate": 7.999758927167403e-06, "loss": -0.0986, "step": 1650, "step_time": 6.856882026986568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3920974750071764, "epoch": 0.008255, "frac_reward_zero_std": 0.5, "grad_norm": 0.010935204103589058, "kl": 0.22763270139694214, "learning_rate": 7.999758628351691e-06, "loss": -0.0379, "num_tokens": 22147643.0, "reward": 1.8214519023895264, "reward_std": 0.6939850449562073, "rewards/rollout_reward_func/mean": 1.8214519023895264, "rewards/rollout_reward_func/std": 0.693985104560852, "sampling/importance_sampling_ratio/max": 1.0134236812591553, "sampling/importance_sampling_ratio/mean": 0.9409281611442566, "sampling/importance_sampling_ratio/min": 0.0018964676419273019, "sampling/sampling_logp_difference/max": 0.9600486755371094, "sampling/sampling_logp_difference/mean": 0.07347778230905533, "step": 1651, "step_time": 9.250346417014953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39446601271629333, "epoch": 0.00826, "grad_norm": 0.011870344169437885, "kl": 0.22776079550385475, "learning_rate": 7.999758329350904e-06, "loss": -0.0379, "step": 1652, "step_time": 4.761934110996663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.737193524837494, "epoch": 0.008265, "frac_reward_zero_std": 0.0, "grad_norm": 0.12621474266052246, "kl": 0.2581440210342407, "learning_rate": 7.999758030165047e-06, "loss": -0.0543, "num_tokens": 22178725.0, "reward": 0.20240333676338196, "reward_std": 1.17302668094635, "rewards/rollout_reward_func/mean": 0.20240333676338196, "rewards/rollout_reward_func/std": 1.1730268001556396, "sampling/importance_sampling_ratio/max": 1.1627341508865356, "sampling/importance_sampling_ratio/mean": 0.7435187101364136, "sampling/importance_sampling_ratio/min": 1.3189918490752461e-06, "sampling/sampling_logp_difference/max": 1.6664501428604126, "sampling/sampling_logp_difference/mean": 0.32940301299095154, "step": 1653, "step_time": 17.478841568008647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7401962280273438, "epoch": 0.00827, "grad_norm": 0.12257242202758789, "kl": 0.25797607004642487, "learning_rate": 7.999757730794116e-06, "loss": -0.0544, "step": 1654, "step_time": 7.2931830269953934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.375, "completions/mean_terminated_length": 6.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.963309109210968, "epoch": 0.008275, "frac_reward_zero_std": 0.0, "grad_norm": 0.08244270831346512, "kl": 0.25772785767912865, "learning_rate": 7.99975743123811e-06, "loss": -0.0946, "num_tokens": 22216036.0, "reward": -0.10704122483730316, "reward_std": 1.1139925718307495, "rewards/rollout_reward_func/mean": -0.10704122483730316, "rewards/rollout_reward_func/std": 1.1139925718307495, "sampling/importance_sampling_ratio/max": 1.0500022172927856, "sampling/importance_sampling_ratio/mean": 0.2579900622367859, "sampling/importance_sampling_ratio/min": 4.480542870055615e-08, "sampling/sampling_logp_difference/max": 2.3941659927368164, "sampling/sampling_logp_difference/mean": 0.46459072828292847, "step": 1655, "step_time": 18.713400448017637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9598527550697327, "epoch": 0.00828, "grad_norm": 0.07144860178232193, "kl": 0.23899315670132637, "learning_rate": 7.999757131497035e-06, "loss": -0.0948, "step": 1656, "step_time": 7.231342351980857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.48373157531023026, "epoch": 0.008285, "frac_reward_zero_std": 0.5, "grad_norm": 0.007880587130784988, "kl": 0.31921253353357315, "learning_rate": 7.999756831570886e-06, "loss": -0.0379, "num_tokens": 22232489.0, "reward": 1.807760238647461, "reward_std": 0.7487509846687317, "rewards/rollout_reward_func/mean": 1.807760238647461, "rewards/rollout_reward_func/std": 0.7487510442733765, "sampling/importance_sampling_ratio/max": 1.0232456922531128, "sampling/importance_sampling_ratio/mean": 0.9434702396392822, "sampling/importance_sampling_ratio/min": 2.838744876498822e-05, "sampling/sampling_logp_difference/max": 2.218177556991577, "sampling/sampling_logp_difference/mean": 0.12136256694793701, "step": 1657, "step_time": 6.053927964007016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.48642004653811455, "epoch": 0.00829, "grad_norm": 0.008264287374913692, "kl": 0.3192463219165802, "learning_rate": 7.999756531459666e-06, "loss": -0.0378, "step": 1658, "step_time": 3.2739069819799624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.010194480419159, "epoch": 0.008295, "frac_reward_zero_std": 0.0, "grad_norm": 0.051688410341739655, "kl": 0.2775440402328968, "learning_rate": 7.999756231163373e-06, "loss": -0.1018, "num_tokens": 22259945.0, "reward": 0.5042799115180969, "reward_std": 1.3710511922836304, "rewards/rollout_reward_func/mean": 0.5042799115180969, "rewards/rollout_reward_func/std": 1.3710511922836304, "sampling/importance_sampling_ratio/max": 1.03848397731781, "sampling/importance_sampling_ratio/mean": 0.5957176685333252, "sampling/importance_sampling_ratio/min": 2.3477473405364435e-06, "sampling/sampling_logp_difference/max": 1.7064839601516724, "sampling/sampling_logp_difference/mean": 0.38425368070602417, "step": 1659, "step_time": 12.866705077976803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.010806053876877, "epoch": 0.0083, "grad_norm": 0.04628956690430641, "kl": 0.2842014506459236, "learning_rate": 7.999755930682007e-06, "loss": -0.1019, "step": 1660, "step_time": 6.49943647600594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4625463047996163, "epoch": 0.008305, "frac_reward_zero_std": 0.0, "grad_norm": 0.02179120108485222, "kl": 0.18499003909528255, "learning_rate": 7.999755630015569e-06, "loss": -0.1036, "num_tokens": 22286346.0, "reward": -0.0036330819129943848, "reward_std": 1.1972298622131348, "rewards/rollout_reward_func/mean": -0.0036330819129943848, "rewards/rollout_reward_func/std": 1.1972298622131348, "sampling/importance_sampling_ratio/max": 1.0490953922271729, "sampling/importance_sampling_ratio/mean": 0.695440411567688, "sampling/importance_sampling_ratio/min": 0.00023967871675267816, "sampling/sampling_logp_difference/max": 1.848435878753662, "sampling/sampling_logp_difference/mean": 0.2754884362220764, "step": 1661, "step_time": 15.379395825992106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4657467398792505, "epoch": 0.00831, "grad_norm": 0.022077297791838646, "kl": 0.18526456598192453, "learning_rate": 7.999755329164059e-06, "loss": -0.1036, "step": 1662, "step_time": 5.8066019830002915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.49610811937600374, "epoch": 0.008315, "frac_reward_zero_std": 0.5, "grad_norm": 0.02267155982553959, "kl": 0.340521015226841, "learning_rate": 7.999755028127475e-06, "loss": -0.0347, "num_tokens": 22310213.0, "reward": 1.650381088256836, "reward_std": 0.6482297778129578, "rewards/rollout_reward_func/mean": 1.650381088256836, "rewards/rollout_reward_func/std": 0.6482297778129578, "sampling/importance_sampling_ratio/max": 1.1698273420333862, "sampling/importance_sampling_ratio/mean": 0.9600198268890381, "sampling/importance_sampling_ratio/min": 2.846970528480597e-05, "sampling/sampling_logp_difference/max": 1.3262951374053955, "sampling/sampling_logp_difference/mean": 0.12367524206638336, "step": 1663, "step_time": 13.411032939999131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49745068699121475, "epoch": 0.00832, "grad_norm": 0.02174273319542408, "kl": 0.33595462143421173, "learning_rate": 7.99975472690582e-06, "loss": -0.0348, "step": 1664, "step_time": 6.237620084997616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.283313751220703, "epoch": 0.008325, "frac_reward_zero_std": 0.0, "grad_norm": 0.14871543645858765, "kl": 0.564160481095314, "learning_rate": 7.999754425499092e-06, "loss": -0.0586, "num_tokens": 22343015.0, "reward": -0.16409581899642944, "reward_std": 1.1243733167648315, "rewards/rollout_reward_func/mean": -0.16409581899642944, "rewards/rollout_reward_func/std": 1.1243733167648315, "sampling/importance_sampling_ratio/max": 1.035561442375183, "sampling/importance_sampling_ratio/mean": 0.5428206920623779, "sampling/importance_sampling_ratio/min": 0.0001094735853257589, "sampling/sampling_logp_difference/max": 1.9596582651138306, "sampling/sampling_logp_difference/mean": 0.3455897271633148, "step": 1665, "step_time": 15.566580001002876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2956265807151794, "epoch": 0.00833, "grad_norm": 0.17158369719982147, "kl": 0.5119991861283779, "learning_rate": 7.99975412390729e-06, "loss": -0.059, "step": 1666, "step_time": 6.9294489009917015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6250270530581474, "epoch": 0.008335, "frac_reward_zero_std": 0.5, "grad_norm": 0.13283486664295197, "kl": 0.79719278216362, "learning_rate": 7.999753822130418e-06, "loss": -0.007, "num_tokens": 22363119.0, "reward": 0.7214010953903198, "reward_std": 1.447036623954773, "rewards/rollout_reward_func/mean": 0.7214010953903198, "rewards/rollout_reward_func/std": 1.447036623954773, "sampling/importance_sampling_ratio/max": 1.2282500267028809, "sampling/importance_sampling_ratio/mean": 0.9066815376281738, "sampling/importance_sampling_ratio/min": 0.004430083557963371, "sampling/sampling_logp_difference/max": 1.0005946159362793, "sampling/sampling_logp_difference/mean": 0.09091527760028839, "step": 1667, "step_time": 10.373067669992452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6416293755173683, "epoch": 0.00834, "grad_norm": 0.13123957812786102, "kl": 0.692184392362833, "learning_rate": 7.999753520168475e-06, "loss": -0.0077, "step": 1668, "step_time": 5.201110095978947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.826744269579649, "epoch": 0.008345, "frac_reward_zero_std": 0.5, "grad_norm": 0.05390429496765137, "kl": 0.29090800508856773, "learning_rate": 7.999753218021458e-06, "loss": -0.053, "num_tokens": 22388307.0, "reward": 1.240630865097046, "reward_std": 1.1452101469039917, "rewards/rollout_reward_func/mean": 1.240630865097046, "rewards/rollout_reward_func/std": 1.1452102661132812, "sampling/importance_sampling_ratio/max": 1.0204938650131226, "sampling/importance_sampling_ratio/mean": 0.7752317786216736, "sampling/importance_sampling_ratio/min": 0.0011970424093306065, "sampling/sampling_logp_difference/max": 1.335618019104004, "sampling/sampling_logp_difference/mean": 0.14680610597133636, "step": 1669, "step_time": 15.11242369498359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8394104279577732, "epoch": 0.00835, "grad_norm": 0.04545741155743599, "kl": 0.29492561891674995, "learning_rate": 7.999752915689368e-06, "loss": -0.0531, "step": 1670, "step_time": 6.677713526005391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.6875, "completions/mean_terminated_length": 5.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.401769906282425, "epoch": 0.008355, "frac_reward_zero_std": 0.0, "grad_norm": 0.014676856808364391, "kl": 0.1482227467931807, "learning_rate": 7.999752613172207e-06, "loss": -0.1019, "num_tokens": 22421229.0, "reward": 0.08441399037837982, "reward_std": 1.314918875694275, "rewards/rollout_reward_func/mean": 0.08441399037837982, "rewards/rollout_reward_func/std": 1.314918875694275, "sampling/importance_sampling_ratio/max": 1.0605416297912598, "sampling/importance_sampling_ratio/mean": 0.44931909441947937, "sampling/importance_sampling_ratio/min": 1.728453602467539e-09, "sampling/sampling_logp_difference/max": 2.5034263134002686, "sampling/sampling_logp_difference/mean": 0.4196610748767853, "step": 1671, "step_time": 16.92952666201745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4042424261569977, "epoch": 0.00836, "grad_norm": 0.017721032723784447, "kl": 0.14823306258767843, "learning_rate": 7.999752310469974e-06, "loss": -0.1018, "step": 1672, "step_time": 7.6213290780142415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7566981315612793, "epoch": 0.008365, "frac_reward_zero_std": 0.0, "grad_norm": 0.06987318396568298, "kl": 0.4021187946200371, "learning_rate": 7.999752007582667e-06, "loss": -0.0826, "num_tokens": 22444227.0, "reward": 1.193880319595337, "reward_std": 1.3427772521972656, "rewards/rollout_reward_func/mean": 1.193880319595337, "rewards/rollout_reward_func/std": 1.3427772521972656, "sampling/importance_sampling_ratio/max": 1.0377739667892456, "sampling/importance_sampling_ratio/mean": 0.6426030993461609, "sampling/importance_sampling_ratio/min": 0.0003424482129048556, "sampling/sampling_logp_difference/max": 1.9646539688110352, "sampling/sampling_logp_difference/mean": 0.29630568623542786, "step": 1673, "step_time": 13.686948070011567 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.759047508239746, "epoch": 0.00837, "grad_norm": 0.06968310475349426, "kl": 0.3987174406647682, "learning_rate": 7.99975170451029e-06, "loss": -0.0827, "step": 1674, "step_time": 5.47046381700784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2753436286002398, "epoch": 0.008375, "frac_reward_zero_std": 0.5, "grad_norm": 0.02501392923295498, "kl": 0.337321437895298, "learning_rate": 7.99975140125284e-06, "loss": -0.0543, "num_tokens": 22467044.0, "reward": 1.2389438152313232, "reward_std": 1.2401570081710815, "rewards/rollout_reward_func/mean": 1.2389438152313232, "rewards/rollout_reward_func/std": 1.240157127380371, "sampling/importance_sampling_ratio/max": 1.0359008312225342, "sampling/importance_sampling_ratio/mean": 0.7738511562347412, "sampling/importance_sampling_ratio/min": 7.064460078254342e-05, "sampling/sampling_logp_difference/max": 1.829248070716858, "sampling/sampling_logp_difference/mean": 0.19555310904979706, "step": 1675, "step_time": 12.154035083003691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2685350980609655, "epoch": 0.00838, "grad_norm": 0.024851424619555473, "kl": 0.3321504220366478, "learning_rate": 7.999751097810317e-06, "loss": -0.0543, "step": 1676, "step_time": 5.923746396001661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4517827928066254, "epoch": 0.008385, "frac_reward_zero_std": 0.5, "grad_norm": 0.027324173599481583, "kl": 0.22750313952565193, "learning_rate": 7.999750794182724e-06, "loss": -0.0425, "num_tokens": 22494723.0, "reward": 0.9005216360092163, "reward_std": 1.3177425861358643, "rewards/rollout_reward_func/mean": 0.9005216360092163, "rewards/rollout_reward_func/std": 1.3177425861358643, "sampling/importance_sampling_ratio/max": 1.087872862815857, "sampling/importance_sampling_ratio/mean": 0.7519771456718445, "sampling/importance_sampling_ratio/min": 0.0002702704514376819, "sampling/sampling_logp_difference/max": 1.145105004310608, "sampling/sampling_logp_difference/mean": 0.16878223419189453, "step": 1677, "step_time": 19.039411527992343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.459857553243637, "epoch": 0.00839, "grad_norm": 0.027057111263275146, "kl": 0.22768261656165123, "learning_rate": 7.999750490370056e-06, "loss": -0.0425, "step": 1678, "step_time": 7.772483668988571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3040305580943823, "epoch": 0.008395, "frac_reward_zero_std": 0.0, "grad_norm": 0.0925222635269165, "kl": 0.200991190969944, "learning_rate": 7.999750186372318e-06, "loss": -0.0945, "num_tokens": 22519967.0, "reward": 0.029999911785125732, "reward_std": 1.293434739112854, "rewards/rollout_reward_func/mean": 0.029999911785125732, "rewards/rollout_reward_func/std": 1.293434739112854, "sampling/importance_sampling_ratio/max": 1.0599044561386108, "sampling/importance_sampling_ratio/mean": 0.774811863899231, "sampling/importance_sampling_ratio/min": 3.462485256022774e-05, "sampling/sampling_logp_difference/max": 1.547324299812317, "sampling/sampling_logp_difference/mean": 0.2652774453163147, "step": 1679, "step_time": 15.635845188997337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.299549674615264, "epoch": 0.0084, "grad_norm": 0.08712246268987656, "kl": 0.20249978825449944, "learning_rate": 7.999749882189507e-06, "loss": -0.0946, "step": 1680, "step_time": 7.482769441005075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9429223202168941, "epoch": 0.008405, "frac_reward_zero_std": 0.0, "grad_norm": 0.247772678732872, "kl": 0.2707386650145054, "learning_rate": 7.999749577821624e-06, "loss": -0.101, "num_tokens": 22546155.0, "reward": 1.021168828010559, "reward_std": 1.0316001176834106, "rewards/rollout_reward_func/mean": 1.021168828010559, "rewards/rollout_reward_func/std": 1.0316001176834106, "sampling/importance_sampling_ratio/max": 1.1323003768920898, "sampling/importance_sampling_ratio/mean": 0.6142561435699463, "sampling/importance_sampling_ratio/min": 8.850088306644466e-06, "sampling/sampling_logp_difference/max": 2.20829701423645, "sampling/sampling_logp_difference/mean": 0.34262585639953613, "step": 1681, "step_time": 16.922122142990702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9248803183436394, "epoch": 0.00841, "grad_norm": 0.2130015790462494, "kl": 0.2868489548563957, "learning_rate": 7.99974927326867e-06, "loss": -0.1024, "step": 1682, "step_time": 7.184623252993333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.071981057524681, "epoch": 0.008415, "frac_reward_zero_std": 0.0, "grad_norm": 0.2280224710702896, "kl": 0.22098466753959656, "learning_rate": 7.999748968530642e-06, "loss": -0.0809, "num_tokens": 22577672.0, "reward": 0.3629658818244934, "reward_std": 1.149306297302246, "rewards/rollout_reward_func/mean": 0.3629658818244934, "rewards/rollout_reward_func/std": 1.149306297302246, "sampling/importance_sampling_ratio/max": 1.154697299003601, "sampling/importance_sampling_ratio/mean": 0.6258467435836792, "sampling/importance_sampling_ratio/min": 3.256249314631532e-08, "sampling/sampling_logp_difference/max": 2.008713483810425, "sampling/sampling_logp_difference/mean": 0.39269503951072693, "step": 1683, "step_time": 15.925317425004323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.079920046031475, "epoch": 0.00842, "grad_norm": 0.23134350776672363, "kl": 0.22186928614974022, "learning_rate": 7.999748663607544e-06, "loss": -0.081, "step": 1684, "step_time": 7.227679527000873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 6.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.820846736431122, "epoch": 0.008425, "frac_reward_zero_std": 0.0, "grad_norm": 0.01872221753001213, "kl": 0.10031120711937547, "learning_rate": 7.999748358499373e-06, "loss": -0.1078, "num_tokens": 22611234.0, "reward": 0.028895892202854156, "reward_std": 1.251125454902649, "rewards/rollout_reward_func/mean": 0.028895892202854156, "rewards/rollout_reward_func/std": 1.251125454902649, "sampling/importance_sampling_ratio/max": 1.053484559059143, "sampling/importance_sampling_ratio/mean": 0.32556962966918945, "sampling/importance_sampling_ratio/min": 3.4046311156998854e-06, "sampling/sampling_logp_difference/max": 1.9896495342254639, "sampling/sampling_logp_difference/mean": 0.43427574634552, "step": 1685, "step_time": 16.773240490991157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8224164843559265, "epoch": 0.00843, "grad_norm": 0.017562048509716988, "kl": 0.10105964634567499, "learning_rate": 7.999748053206131e-06, "loss": -0.1078, "step": 1686, "step_time": 6.462236511986703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.083333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1129420399665833, "epoch": 0.008435, "frac_reward_zero_std": 0.5, "grad_norm": 0.007835718803107738, "kl": 0.1750941090285778, "learning_rate": 7.999747747727817e-06, "loss": -0.0371, "num_tokens": 22632495.0, "reward": 0.7258450984954834, "reward_std": 1.462570071220398, "rewards/rollout_reward_func/mean": 0.7258450984954834, "rewards/rollout_reward_func/std": 1.4625701904296875, "sampling/importance_sampling_ratio/max": 1.0213370323181152, "sampling/importance_sampling_ratio/mean": 0.6271960139274597, "sampling/importance_sampling_ratio/min": 3.58047100235126e-06, "sampling/sampling_logp_difference/max": 2.1891586780548096, "sampling/sampling_logp_difference/mean": 0.3079072833061218, "step": 1687, "step_time": 13.804913496991503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1168974339962006, "epoch": 0.00844, "grad_norm": 0.008122684434056282, "kl": 0.17753928154706955, "learning_rate": 7.999747442064431e-06, "loss": -0.0371, "step": 1688, "step_time": 6.3540686990018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.901256039738655, "epoch": 0.008445, "frac_reward_zero_std": 0.0, "grad_norm": 0.28262996673583984, "kl": 0.24369993060827255, "learning_rate": 7.999747136215972e-06, "loss": -0.0419, "num_tokens": 22658706.0, "reward": 0.1182650625705719, "reward_std": 1.3685635328292847, "rewards/rollout_reward_func/mean": 0.1182650625705719, "rewards/rollout_reward_func/std": 1.3685635328292847, "sampling/importance_sampling_ratio/max": 1.060186505317688, "sampling/importance_sampling_ratio/mean": 0.5483340620994568, "sampling/importance_sampling_ratio/min": 0.00010564239346422255, "sampling/sampling_logp_difference/max": 2.1318955421447754, "sampling/sampling_logp_difference/mean": 0.33816128969192505, "step": 1689, "step_time": 13.028379849987687 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.02220394741743803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03782894741743803, "entropy": 1.9207473993301392, "epoch": 0.00845, "grad_norm": 0.20698188245296478, "kl": 0.2560127079486847, "learning_rate": 7.999746830182441e-06, "loss": -0.0432, "step": 1690, "step_time": 6.562022361991694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 5.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.2800859808921814, "epoch": 0.008455, "frac_reward_zero_std": 0.0, "grad_norm": 0.05142049863934517, "kl": 0.15143930912017822, "learning_rate": 7.999746523963839e-06, "loss": -0.0656, "num_tokens": 22691558.0, "reward": -0.41658419370651245, "reward_std": 0.8232460618019104, "rewards/rollout_reward_func/mean": -0.41658419370651245, "rewards/rollout_reward_func/std": 0.8232461214065552, "sampling/importance_sampling_ratio/max": 1.113399624824524, "sampling/importance_sampling_ratio/mean": 0.3323090672492981, "sampling/importance_sampling_ratio/min": 5.205611159908585e-06, "sampling/sampling_logp_difference/max": 2.001447916030884, "sampling/sampling_logp_difference/mean": 0.4938932955265045, "step": 1691, "step_time": 16.64401337399613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2888699769973755, "epoch": 0.00846, "grad_norm": 0.05056793615221977, "kl": 0.15107419341802597, "learning_rate": 7.999746217560167e-06, "loss": -0.0657, "step": 1692, "step_time": 7.369537576989387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.3125, "completions/mean_terminated_length": 6.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8039716482162476, "epoch": 0.008465, "frac_reward_zero_std": 0.0, "grad_norm": 0.027505949139595032, "kl": 0.10508331097662449, "learning_rate": 7.99974591097142e-06, "loss": -0.0843, "num_tokens": 22722029.0, "reward": -0.3584175109863281, "reward_std": 0.9481887817382812, "rewards/rollout_reward_func/mean": -0.3584175109863281, "rewards/rollout_reward_func/std": 0.9481887221336365, "sampling/importance_sampling_ratio/max": 1.0873302221298218, "sampling/importance_sampling_ratio/mean": 0.2952108681201935, "sampling/importance_sampling_ratio/min": 2.760691859293729e-05, "sampling/sampling_logp_difference/max": 1.7597541809082031, "sampling/sampling_logp_difference/mean": 0.43949007987976074, "step": 1693, "step_time": 16.560834724004962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8031802773475647, "epoch": 0.00847, "grad_norm": 0.026761358603835106, "kl": 0.10259051062166691, "learning_rate": 7.999745604197602e-06, "loss": -0.0844, "step": 1694, "step_time": 6.427116986975307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 6.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8913052957504988, "epoch": 0.008475, "frac_reward_zero_std": 0.0, "grad_norm": 0.03565632924437523, "kl": 0.29368405789136887, "learning_rate": 7.999745297238714e-06, "loss": -0.0919, "num_tokens": 22746942.0, "reward": 0.6330364346504211, "reward_std": 1.4107739925384521, "rewards/rollout_reward_func/mean": 0.6330364346504211, "rewards/rollout_reward_func/std": 1.4107738733291626, "sampling/importance_sampling_ratio/max": 1.0540896654129028, "sampling/importance_sampling_ratio/mean": 0.5744460821151733, "sampling/importance_sampling_ratio/min": 8.111722308967728e-06, "sampling/sampling_logp_difference/max": 1.9654078483581543, "sampling/sampling_logp_difference/mean": 0.4133731424808502, "step": 1695, "step_time": 15.60344047700346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8905717357993126, "epoch": 0.00848, "grad_norm": 0.03722022473812103, "kl": 0.2940373867750168, "learning_rate": 7.999744990094751e-06, "loss": -0.0919, "step": 1696, "step_time": 6.466847694013268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4240055177360773, "epoch": 0.008485, "frac_reward_zero_std": 0.5, "grad_norm": 0.03671376779675484, "kl": 0.4096346087753773, "learning_rate": 7.99974468276572e-06, "loss": -0.0386, "num_tokens": 22763790.0, "reward": 1.7096964120864868, "reward_std": 0.9184082746505737, "rewards/rollout_reward_func/mean": 1.7096964120864868, "rewards/rollout_reward_func/std": 0.9184082746505737, "sampling/importance_sampling_ratio/max": 1.0172489881515503, "sampling/importance_sampling_ratio/mean": 0.8988662958145142, "sampling/importance_sampling_ratio/min": 0.0029875629115849733, "sampling/sampling_logp_difference/max": 1.0115138292312622, "sampling/sampling_logp_difference/mean": 0.08351055532693863, "step": 1697, "step_time": 8.569684489979409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4256696440279484, "epoch": 0.00849, "grad_norm": 0.03385769575834274, "kl": 0.3958524689078331, "learning_rate": 7.999744375251616e-06, "loss": -0.0387, "step": 1698, "step_time": 4.207805895013735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8933685719966888, "epoch": 0.008495, "frac_reward_zero_std": 0.5, "grad_norm": 0.1760614663362503, "kl": 0.2937080319970846, "learning_rate": 7.999744067552439e-06, "loss": -0.0404, "num_tokens": 22783683.0, "reward": -0.24908547103405, "reward_std": 1.1008256673812866, "rewards/rollout_reward_func/mean": -0.24908547103405, "rewards/rollout_reward_func/std": 1.1008256673812866, "sampling/importance_sampling_ratio/max": 1.0470248460769653, "sampling/importance_sampling_ratio/mean": 0.6823017001152039, "sampling/importance_sampling_ratio/min": 1.4521646107823472e-06, "sampling/sampling_logp_difference/max": 2.098567008972168, "sampling/sampling_logp_difference/mean": 0.2952532470226288, "step": 1699, "step_time": 11.319216326010064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8883220851421356, "epoch": 0.0085, "grad_norm": 0.15530677139759064, "kl": 0.2931422609835863, "learning_rate": 7.99974375966819e-06, "loss": -0.0407, "step": 1700, "step_time": 5.427542972989613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3839078843593597, "epoch": 0.008505, "frac_reward_zero_std": 0.0, "grad_norm": 0.27081286907196045, "kl": 0.12227856740355492, "learning_rate": 7.999743451598872e-06, "loss": -0.094, "num_tokens": 22820500.0, "reward": 0.08414816856384277, "reward_std": 1.1227525472640991, "rewards/rollout_reward_func/mean": 0.08414816856384277, "rewards/rollout_reward_func/std": 1.1227525472640991, "sampling/importance_sampling_ratio/max": 1.2221803665161133, "sampling/importance_sampling_ratio/mean": 0.5171191692352295, "sampling/importance_sampling_ratio/min": 5.885612495148962e-07, "sampling/sampling_logp_difference/max": 2.066801071166992, "sampling/sampling_logp_difference/mean": 0.3936523497104645, "step": 1701, "step_time": 18.707732768991264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3854347467422485, "epoch": 0.00851, "grad_norm": 0.19294853508472443, "kl": 0.12311341520398855, "learning_rate": 7.99974314334448e-06, "loss": -0.0957, "step": 1702, "step_time": 8.004003131994978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9943206906318665, "epoch": 0.008515, "frac_reward_zero_std": 0.0, "grad_norm": 0.04894685745239258, "kl": 0.21642986312508583, "learning_rate": 7.999742834905018e-06, "loss": -0.0685, "num_tokens": 22844933.0, "reward": 0.4473995566368103, "reward_std": 1.4514201879501343, "rewards/rollout_reward_func/mean": 0.4473995566368103, "rewards/rollout_reward_func/std": 1.4514201879501343, "sampling/importance_sampling_ratio/max": 1.017388105392456, "sampling/importance_sampling_ratio/mean": 0.5799511671066284, "sampling/importance_sampling_ratio/min": 0.0003670387086458504, "sampling/sampling_logp_difference/max": 1.8610912561416626, "sampling/sampling_logp_difference/mean": 0.28120678663253784, "step": 1703, "step_time": 15.943707410013303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.997815579175949, "epoch": 0.00852, "grad_norm": 0.04898126423358917, "kl": 0.22328157722949982, "learning_rate": 7.999742526280483e-06, "loss": -0.0688, "step": 1704, "step_time": 6.0615405139978975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.083333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5206760466098785, "epoch": 0.008525, "frac_reward_zero_std": 0.5, "grad_norm": 0.19978347420692444, "kl": 0.8151573464274406, "learning_rate": 7.999742217470878e-06, "loss": -0.0428, "num_tokens": 22869546.0, "reward": 0.9079535603523254, "reward_std": 1.4130206108093262, "rewards/rollout_reward_func/mean": 0.9079535603523254, "rewards/rollout_reward_func/std": 1.4130207300186157, "sampling/importance_sampling_ratio/max": 1.0215929746627808, "sampling/importance_sampling_ratio/mean": 0.6470077633857727, "sampling/importance_sampling_ratio/min": 0.0001933825114974752, "sampling/sampling_logp_difference/max": 1.4976420402526855, "sampling/sampling_logp_difference/mean": 0.19810909032821655, "step": 1705, "step_time": 16.00201591498626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5168052911758423, "epoch": 0.00853, "grad_norm": 0.1581193506717682, "kl": 0.7031230479478836, "learning_rate": 7.9997419084762e-06, "loss": -0.0434, "step": 1706, "step_time": 5.895589874999132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3568688333034515, "epoch": 0.008535, "frac_reward_zero_std": 0.5, "grad_norm": 0.15256589651107788, "kl": 0.23851378262043, "learning_rate": 7.99974159929645e-06, "loss": -0.0351, "num_tokens": 22893323.0, "reward": 1.0197551250457764, "reward_std": 1.2346827983856201, "rewards/rollout_reward_func/mean": 1.0197551250457764, "rewards/rollout_reward_func/std": 1.2346827983856201, "sampling/importance_sampling_ratio/max": 1.0890061855316162, "sampling/importance_sampling_ratio/mean": 0.7960010170936584, "sampling/importance_sampling_ratio/min": 1.6333613530150615e-05, "sampling/sampling_logp_difference/max": 2.3331432342529297, "sampling/sampling_logp_difference/mean": 0.23411406576633453, "step": 1707, "step_time": 14.679456254991237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3580316230654716, "epoch": 0.00854, "grad_norm": 0.13646462559700012, "kl": 0.2383572831749916, "learning_rate": 7.99974128993163e-06, "loss": -0.0352, "step": 1708, "step_time": 5.9489931049902225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2961170747876167, "epoch": 0.008545, "frac_reward_zero_std": 0.0, "grad_norm": 0.49899372458457947, "kl": 0.3707994930446148, "learning_rate": 7.999740980381737e-06, "loss": -0.0826, "num_tokens": 22921552.0, "reward": 0.38358408212661743, "reward_std": 1.0712658166885376, "rewards/rollout_reward_func/mean": 0.38358408212661743, "rewards/rollout_reward_func/std": 1.0712659358978271, "sampling/importance_sampling_ratio/max": 1.0665260553359985, "sampling/importance_sampling_ratio/mean": 0.43902331590652466, "sampling/importance_sampling_ratio/min": 5.135775040798762e-07, "sampling/sampling_logp_difference/max": 2.3627238273620605, "sampling/sampling_logp_difference/mean": 0.42478060722351074, "step": 1709, "step_time": 16.792611725002644 }, { "clip_ratio/high_max": 0.16875000018626451, "clip_ratio/high_mean": 0.08437500009313226, "clip_ratio/low_mean": 0.019657257944345474, "clip_ratio/low_min": 0.008064515888690948, "clip_ratio/region_mean": 0.10403225803747773, "entropy": 2.2581480965018272, "epoch": 0.00855, "grad_norm": 0.1016121432185173, "kl": 0.2768402397632599, "learning_rate": 7.999740670646774e-06, "loss": -0.0856, "step": 1710, "step_time": 7.997628623998025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5162620786577463, "epoch": 0.008555, "frac_reward_zero_std": 0.0, "grad_norm": 0.015546049922704697, "kl": 0.3729441314935684, "learning_rate": 7.99974036072674e-06, "loss": -0.0887, "num_tokens": 22944012.0, "reward": 1.5405141115188599, "reward_std": 0.9515619874000549, "rewards/rollout_reward_func/mean": 1.5405141115188599, "rewards/rollout_reward_func/std": 0.9515620470046997, "sampling/importance_sampling_ratio/max": 1.0259371995925903, "sampling/importance_sampling_ratio/mean": 0.8228955864906311, "sampling/importance_sampling_ratio/min": 2.871798585601937e-07, "sampling/sampling_logp_difference/max": 2.000811815261841, "sampling/sampling_logp_difference/mean": 0.37303626537323, "step": 1711, "step_time": 11.804485624001245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5149151207879186, "epoch": 0.00856, "grad_norm": 0.01954294927418232, "kl": 0.3937387466430664, "learning_rate": 7.999740050621633e-06, "loss": -0.0886, "step": 1712, "step_time": 5.942291227998794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.967564642429352, "epoch": 0.008565, "frac_reward_zero_std": 0.0, "grad_norm": 0.09777010232210159, "kl": 0.1446699183434248, "learning_rate": 7.999739740331455e-06, "loss": -0.0835, "num_tokens": 22977307.0, "reward": -0.15421752631664276, "reward_std": 0.9741225242614746, "rewards/rollout_reward_func/mean": -0.15421752631664276, "rewards/rollout_reward_func/std": 0.9741225242614746, "sampling/importance_sampling_ratio/max": 1.3339899778366089, "sampling/importance_sampling_ratio/mean": 0.5629855394363403, "sampling/importance_sampling_ratio/min": 1.13587043415464e-06, "sampling/sampling_logp_difference/max": 2.0852646827697754, "sampling/sampling_logp_difference/mean": 0.44688308238983154, "step": 1713, "step_time": 17.04658277297858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.957565426826477, "epoch": 0.00857, "grad_norm": 0.07630827277898788, "kl": 0.14598702359944582, "learning_rate": 7.999739429856206e-06, "loss": -0.0837, "step": 1714, "step_time": 7.639428518989007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 5.818181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2153603732585907, "epoch": 0.008575, "frac_reward_zero_std": 0.0, "grad_norm": 0.07523034512996674, "kl": 0.16564860194921494, "learning_rate": 7.999739119195885e-06, "loss": -0.0727, "num_tokens": 23007197.0, "reward": -0.003956705331802368, "reward_std": 1.238969326019287, "rewards/rollout_reward_func/mean": -0.003956705331802368, "rewards/rollout_reward_func/std": 1.2389692068099976, "sampling/importance_sampling_ratio/max": 1.0667885541915894, "sampling/importance_sampling_ratio/mean": 0.5564882755279541, "sampling/importance_sampling_ratio/min": 6.366532034007832e-06, "sampling/sampling_logp_difference/max": 1.7841439247131348, "sampling/sampling_logp_difference/mean": 0.3966737985610962, "step": 1715, "step_time": 15.535493974995916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.213508903980255, "epoch": 0.00858, "grad_norm": 0.08442823588848114, "kl": 0.1671663448214531, "learning_rate": 7.999738808350492e-06, "loss": -0.0731, "step": 1716, "step_time": 6.258566661999794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0616063103079796, "epoch": 0.008585, "frac_reward_zero_std": 0.0, "grad_norm": 0.027611009776592255, "kl": 0.37972722202539444, "learning_rate": 7.99973849732003e-06, "loss": -0.0846, "num_tokens": 23040878.0, "reward": 1.2381538152694702, "reward_std": 0.9884982705116272, "rewards/rollout_reward_func/mean": 1.2381538152694702, "rewards/rollout_reward_func/std": 0.988498330116272, "sampling/importance_sampling_ratio/max": 1.0529273748397827, "sampling/importance_sampling_ratio/mean": 0.8272676467895508, "sampling/importance_sampling_ratio/min": 0.00037197224446572363, "sampling/sampling_logp_difference/max": 1.5923346281051636, "sampling/sampling_logp_difference/mean": 0.21436244249343872, "step": 1717, "step_time": 17.24406012799591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0625057807192206, "epoch": 0.00859, "grad_norm": 0.03122893162071705, "kl": 0.3925694599747658, "learning_rate": 7.999738186104493e-06, "loss": -0.0845, "step": 1718, "step_time": 7.288906778005185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.634101152420044, "epoch": 0.008595, "frac_reward_zero_std": 0.0, "grad_norm": 0.035457514226436615, "kl": 0.17963502556085587, "learning_rate": 7.999737874703888e-06, "loss": -0.0785, "num_tokens": 23070868.0, "reward": 0.580173134803772, "reward_std": 1.4212887287139893, "rewards/rollout_reward_func/mean": 0.580173134803772, "rewards/rollout_reward_func/std": 1.4212887287139893, "sampling/importance_sampling_ratio/max": 1.0246714353561401, "sampling/importance_sampling_ratio/mean": 0.5925616025924683, "sampling/importance_sampling_ratio/min": 0.00018845459271688014, "sampling/sampling_logp_difference/max": 1.5624016523361206, "sampling/sampling_logp_difference/mean": 0.2901573181152344, "step": 1719, "step_time": 17.200079425005242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6255987882614136, "epoch": 0.0086, "grad_norm": 0.03902941942214966, "kl": 0.1756332591176033, "learning_rate": 7.999737563118212e-06, "loss": -0.0786, "step": 1720, "step_time": 6.8767654380062595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0604906007647514, "epoch": 0.008605, "frac_reward_zero_std": 0.0, "grad_norm": 0.32952019572257996, "kl": 0.19782938063144684, "learning_rate": 7.999737251347461e-06, "loss": -0.0248, "num_tokens": 23096939.0, "reward": 0.2384120225906372, "reward_std": 0.9506359696388245, "rewards/rollout_reward_func/mean": 0.2384120225906372, "rewards/rollout_reward_func/std": 0.9506359696388245, "sampling/importance_sampling_ratio/max": 1.1697702407836914, "sampling/importance_sampling_ratio/mean": 0.8178234100341797, "sampling/importance_sampling_ratio/min": 0.0005583219463005662, "sampling/sampling_logp_difference/max": 1.147875189781189, "sampling/sampling_logp_difference/mean": 0.14852458238601685, "step": 1721, "step_time": 16.936786662976374 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.0641922652721405, "epoch": 0.00861, "grad_norm": 0.35511359572410583, "kl": 0.19846250489354134, "learning_rate": 7.999736939391642e-06, "loss": -0.0275, "step": 1722, "step_time": 6.898165851001977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.933333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9173602052032948, "epoch": 0.008615, "frac_reward_zero_std": 0.0, "grad_norm": 0.15064206719398499, "kl": 0.3690834492444992, "learning_rate": 7.99973662725075e-06, "loss": -0.0708, "num_tokens": 23123910.0, "reward": 0.36562228202819824, "reward_std": 0.9214500784873962, "rewards/rollout_reward_func/mean": 0.36562228202819824, "rewards/rollout_reward_func/std": 0.9214501976966858, "sampling/importance_sampling_ratio/max": 1.0476676225662231, "sampling/importance_sampling_ratio/mean": 0.7951291799545288, "sampling/importance_sampling_ratio/min": 0.000673426256980747, "sampling/sampling_logp_difference/max": 2.070418119430542, "sampling/sampling_logp_difference/mean": 0.2131274938583374, "step": 1723, "step_time": 12.666675395026687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8917281478643417, "epoch": 0.00862, "grad_norm": 0.10694064199924469, "kl": 0.366937056183815, "learning_rate": 7.999736314924787e-06, "loss": -0.0713, "step": 1724, "step_time": 6.096003855986055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.774579495191574, "epoch": 0.008625, "frac_reward_zero_std": 0.0, "grad_norm": 0.4272991120815277, "kl": 2.438845731317997, "learning_rate": 7.999736002413755e-06, "loss": -0.0584, "num_tokens": 23147095.0, "reward": 0.9523239731788635, "reward_std": 1.2460969686508179, "rewards/rollout_reward_func/mean": 0.9523239731788635, "rewards/rollout_reward_func/std": 1.2460969686508179, "sampling/importance_sampling_ratio/max": 1.0183072090148926, "sampling/importance_sampling_ratio/mean": 0.6609033346176147, "sampling/importance_sampling_ratio/min": 2.0246540088919573e-07, "sampling/sampling_logp_difference/max": 2.541346549987793, "sampling/sampling_logp_difference/mean": 0.351063072681427, "step": 1725, "step_time": 14.2479036149889 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0193452388048172, "entropy": 1.7761732637882233, "epoch": 0.00863, "grad_norm": 0.22382336854934692, "kl": 1.5280507914721966, "learning_rate": 7.99973568971765e-06, "loss": -0.0621, "step": 1726, "step_time": 5.8599557890120195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7522051334381104, "epoch": 0.008635, "frac_reward_zero_std": 0.0, "grad_norm": 0.15605488419532776, "kl": 0.17947148717939854, "learning_rate": 7.999735376836474e-06, "loss": -0.1028, "num_tokens": 23180692.0, "reward": 0.33363601565361023, "reward_std": 1.3075028657913208, "rewards/rollout_reward_func/mean": 0.33363601565361023, "rewards/rollout_reward_func/std": 1.3075028657913208, "sampling/importance_sampling_ratio/max": 1.1622861623764038, "sampling/importance_sampling_ratio/mean": 0.5347586274147034, "sampling/importance_sampling_ratio/min": 2.3676538507544365e-09, "sampling/sampling_logp_difference/max": 2.1382598876953125, "sampling/sampling_logp_difference/mean": 0.5077416896820068, "step": 1727, "step_time": 15.480455789002008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7475006580352783, "epoch": 0.00864, "grad_norm": 0.16286565363407135, "kl": 0.18060056865215302, "learning_rate": 7.999735063770227e-06, "loss": -0.1025, "step": 1728, "step_time": 7.087331791000906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 5.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2397265136241913, "epoch": 0.008645, "frac_reward_zero_std": 0.0, "grad_norm": 0.029941346496343613, "kl": 0.40080640465021133, "learning_rate": 7.999734750518908e-06, "loss": -0.0662, "num_tokens": 23205684.0, "reward": 0.7209432721138, "reward_std": 1.3304661512374878, "rewards/rollout_reward_func/mean": 0.7209432721138, "rewards/rollout_reward_func/std": 1.3304662704467773, "sampling/importance_sampling_ratio/max": 1.0925532579421997, "sampling/importance_sampling_ratio/mean": 0.7099851369857788, "sampling/importance_sampling_ratio/min": 0.0002535592648200691, "sampling/sampling_logp_difference/max": 2.1582045555114746, "sampling/sampling_logp_difference/mean": 0.2878875732421875, "step": 1729, "step_time": 12.031340789981186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2510341107845306, "epoch": 0.00865, "grad_norm": 0.02917608432471752, "kl": 0.37737663090229034, "learning_rate": 7.999734437082518e-06, "loss": -0.0663, "step": 1730, "step_time": 6.320473170009791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.083333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.559843897819519, "epoch": 0.008655, "frac_reward_zero_std": 0.0, "grad_norm": 0.14901816844940186, "kl": 0.27907145395874977, "learning_rate": 7.999734123461058e-06, "loss": -0.1, "num_tokens": 23236096.0, "reward": 0.5942828059196472, "reward_std": 1.2467156648635864, "rewards/rollout_reward_func/mean": 0.5942828059196472, "rewards/rollout_reward_func/std": 1.2467156648635864, "sampling/importance_sampling_ratio/max": 1.1599583625793457, "sampling/importance_sampling_ratio/mean": 0.603864312171936, "sampling/importance_sampling_ratio/min": 3.911078692908632e-06, "sampling/sampling_logp_difference/max": 1.9234206676483154, "sampling/sampling_logp_difference/mean": 0.43954998254776, "step": 1731, "step_time": 15.687210910007707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009615384973585606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "entropy": 2.5769497752189636, "epoch": 0.00866, "grad_norm": 0.17600643634796143, "kl": 0.28650143928825855, "learning_rate": 7.999733809654528e-06, "loss": -0.1009, "step": 1732, "step_time": 7.738396398999612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2601609416306019, "epoch": 0.008665, "frac_reward_zero_std": 0.5, "grad_norm": 0.04689495265483856, "kl": 0.27210718020796776, "learning_rate": 7.999733495662924e-06, "loss": -0.0443, "num_tokens": 23262843.0, "reward": -0.33928942680358887, "reward_std": 0.825805127620697, "rewards/rollout_reward_func/mean": -0.33928942680358887, "rewards/rollout_reward_func/std": 0.8258051872253418, "sampling/importance_sampling_ratio/max": 1.2179805040359497, "sampling/importance_sampling_ratio/mean": 0.7930411696434021, "sampling/importance_sampling_ratio/min": 2.844036828264507e-07, "sampling/sampling_logp_difference/max": 2.0342941284179688, "sampling/sampling_logp_difference/mean": 0.22517874836921692, "step": 1733, "step_time": 16.87033349800913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2706536911427975, "epoch": 0.00867, "grad_norm": 0.052248790860176086, "kl": 0.26862597838044167, "learning_rate": 7.999733181486251e-06, "loss": -0.0441, "step": 1734, "step_time": 6.919702444996801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1417179852724075, "epoch": 0.008675, "frac_reward_zero_std": 0.5, "grad_norm": 0.18795914947986603, "kl": 0.271371454000473, "learning_rate": 7.999732867124506e-06, "loss": -0.0401, "num_tokens": 23288418.0, "reward": 1.0718319416046143, "reward_std": 1.066267728805542, "rewards/rollout_reward_func/mean": 1.0718319416046143, "rewards/rollout_reward_func/std": 1.0662678480148315, "sampling/importance_sampling_ratio/max": 1.1842920780181885, "sampling/importance_sampling_ratio/mean": 0.7838772535324097, "sampling/importance_sampling_ratio/min": 0.0012182515347376466, "sampling/sampling_logp_difference/max": 1.3052105903625488, "sampling/sampling_logp_difference/mean": 0.15031461417675018, "step": 1735, "step_time": 14.985194768989459 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.1233369559049606, "epoch": 0.00868, "grad_norm": 0.09691224992275238, "kl": 0.27198977395892143, "learning_rate": 7.99973255257769e-06, "loss": -0.0407, "step": 1736, "step_time": 7.2628736089973245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.724981565028429, "epoch": 0.008685, "frac_reward_zero_std": 0.5, "grad_norm": 0.011916465125977993, "kl": 0.2441878840327263, "learning_rate": 7.999732237845805e-06, "loss": -0.0509, "num_tokens": 23304807.0, "reward": 0.9249472618103027, "reward_std": 1.1121095418930054, "rewards/rollout_reward_func/mean": 0.9249472618103027, "rewards/rollout_reward_func/std": 1.1121095418930054, "sampling/importance_sampling_ratio/max": 1.0348730087280273, "sampling/importance_sampling_ratio/mean": 0.8929018974304199, "sampling/importance_sampling_ratio/min": 0.001749215298332274, "sampling/sampling_logp_difference/max": 1.572663426399231, "sampling/sampling_logp_difference/mean": 0.1296331137418747, "step": 1737, "step_time": 6.027087166003184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7238234281539917, "epoch": 0.00869, "grad_norm": 0.01225810032337904, "kl": 0.2387877218425274, "learning_rate": 7.999731922928846e-06, "loss": -0.051, "step": 1738, "step_time": 3.2436171529989224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3655522763729095, "epoch": 0.008695, "frac_reward_zero_std": 0.0, "grad_norm": 0.07168971002101898, "kl": 0.24367622658610344, "learning_rate": 7.999731607826818e-06, "loss": 0.0028, "num_tokens": 23329849.0, "reward": -0.07631012797355652, "reward_std": 1.173759937286377, "rewards/rollout_reward_func/mean": -0.07631012797355652, "rewards/rollout_reward_func/std": 1.173759937286377, "sampling/importance_sampling_ratio/max": 1.1106598377227783, "sampling/importance_sampling_ratio/mean": 0.7550971508026123, "sampling/importance_sampling_ratio/min": 1.5456169421668164e-05, "sampling/sampling_logp_difference/max": 1.5712437629699707, "sampling/sampling_logp_difference/mean": 0.24962548911571503, "step": 1739, "step_time": 12.416304800994112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3684586882591248, "epoch": 0.0087, "grad_norm": 0.07382272183895111, "kl": 0.243437509983778, "learning_rate": 7.999731292539719e-06, "loss": 0.0026, "step": 1740, "step_time": 5.945472185994731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7505340948700905, "epoch": 0.008705, "frac_reward_zero_std": 0.0, "grad_norm": 0.15563912689685822, "kl": 0.8654147684574127, "learning_rate": 7.999730977067547e-06, "loss": -0.0543, "num_tokens": 23347517.0, "reward": 0.6017781496047974, "reward_std": 1.2822306156158447, "rewards/rollout_reward_func/mean": 0.6017781496047974, "rewards/rollout_reward_func/std": 1.2822306156158447, "sampling/importance_sampling_ratio/max": 1.0419610738754272, "sampling/importance_sampling_ratio/mean": 0.9108008742332458, "sampling/importance_sampling_ratio/min": 1.6782475142917974e-07, "sampling/sampling_logp_difference/max": 2.0186538696289062, "sampling/sampling_logp_difference/mean": 0.2002755105495453, "step": 1741, "step_time": 7.251146184004028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7556424289941788, "epoch": 0.00871, "grad_norm": 0.15212060511112213, "kl": 0.825533501803875, "learning_rate": 7.999730661410308e-06, "loss": -0.0548, "step": 1742, "step_time": 3.751985288996366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.318725235760212, "epoch": 0.008715, "frac_reward_zero_std": 0.0, "grad_norm": 0.07171463221311569, "kl": 0.3370238319039345, "learning_rate": 7.999730345567995e-06, "loss": -0.0701, "num_tokens": 23379672.0, "reward": 0.7474005222320557, "reward_std": 1.1289212703704834, "rewards/rollout_reward_func/mean": 0.7474005222320557, "rewards/rollout_reward_func/std": 1.128921389579773, "sampling/importance_sampling_ratio/max": 1.1506073474884033, "sampling/importance_sampling_ratio/mean": 0.6918883323669434, "sampling/importance_sampling_ratio/min": 0.0003488186339382082, "sampling/sampling_logp_difference/max": 1.6291694641113281, "sampling/sampling_logp_difference/mean": 0.24134251475334167, "step": 1743, "step_time": 16.61402141400322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.319956623017788, "epoch": 0.00872, "grad_norm": 0.07075877487659454, "kl": 0.34092072024941444, "learning_rate": 7.999730029540612e-06, "loss": -0.0701, "step": 1744, "step_time": 6.897094264000771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3130234256386757, "epoch": 0.008725, "frac_reward_zero_std": 0.5, "grad_norm": 0.4392167925834656, "kl": 0.248943991959095, "learning_rate": 7.999729713328158e-06, "loss": -0.0095, "num_tokens": 23400260.0, "reward": 1.0164943933486938, "reward_std": 1.407515525817871, "rewards/rollout_reward_func/mean": 1.0164943933486938, "rewards/rollout_reward_func/std": 1.407515525817871, "sampling/importance_sampling_ratio/max": 1.0304893255233765, "sampling/importance_sampling_ratio/mean": 0.9612725377082825, "sampling/importance_sampling_ratio/min": 0.5161803364753723, "sampling/sampling_logp_difference/max": 0.6581401824951172, "sampling/sampling_logp_difference/mean": 0.03615832328796387, "step": 1745, "step_time": 9.009869635017822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3711981549859047, "epoch": 0.00873, "grad_norm": 0.35284584760665894, "kl": 0.24325059726834297, "learning_rate": 7.999729396930634e-06, "loss": -0.013, "step": 1746, "step_time": 4.899103574003675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.051280751824379, "epoch": 0.008735, "frac_reward_zero_std": 0.0, "grad_norm": 0.17860162258148193, "kl": 0.26193467155098915, "learning_rate": 7.999729080348038e-06, "loss": -0.0347, "num_tokens": 23433423.0, "reward": 0.6539081931114197, "reward_std": 1.178390383720398, "rewards/rollout_reward_func/mean": 0.6539081931114197, "rewards/rollout_reward_func/std": 1.1783905029296875, "sampling/importance_sampling_ratio/max": 1.3003865480422974, "sampling/importance_sampling_ratio/mean": 0.8823786377906799, "sampling/importance_sampling_ratio/min": 0.00013436131121125072, "sampling/sampling_logp_difference/max": 1.0774928331375122, "sampling/sampling_logp_difference/mean": 0.1855626106262207, "step": 1747, "step_time": 15.52714378600649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.0771609917283058, "epoch": 0.00874, "grad_norm": 0.15183089673519135, "kl": 0.2561041507869959, "learning_rate": 7.999728763580374e-06, "loss": -0.036, "step": 1748, "step_time": 7.152623457994196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.141501486301422, "epoch": 0.008745, "frac_reward_zero_std": 0.0, "grad_norm": 0.043961383402347565, "kl": 0.4160950966179371, "learning_rate": 7.999728446627634e-06, "loss": -0.0891, "num_tokens": 23457955.0, "reward": 0.8441352248191833, "reward_std": 1.2782872915267944, "rewards/rollout_reward_func/mean": 0.8441352248191833, "rewards/rollout_reward_func/std": 1.278287410736084, "sampling/importance_sampling_ratio/max": 1.085986614227295, "sampling/importance_sampling_ratio/mean": 0.7136276960372925, "sampling/importance_sampling_ratio/min": 4.091149641283209e-09, "sampling/sampling_logp_difference/max": 2.632195472717285, "sampling/sampling_logp_difference/mean": 0.43381136655807495, "step": 1749, "step_time": 13.523151637986302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.141726791858673, "epoch": 0.00875, "grad_norm": 0.04508356377482414, "kl": 0.4043220058083534, "learning_rate": 7.999728129489829e-06, "loss": -0.089, "step": 1750, "step_time": 5.857734608012834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5660297870635986, "epoch": 0.008755, "frac_reward_zero_std": 0.0, "grad_norm": 0.034169357270002365, "kl": 0.24015305563807487, "learning_rate": 7.99972781216695e-06, "loss": -0.0776, "num_tokens": 23483346.0, "reward": 1.2415056228637695, "reward_std": 0.9826053977012634, "rewards/rollout_reward_func/mean": 1.2415056228637695, "rewards/rollout_reward_func/std": 0.982605516910553, "sampling/importance_sampling_ratio/max": 1.0719977617263794, "sampling/importance_sampling_ratio/mean": 0.739480197429657, "sampling/importance_sampling_ratio/min": 6.654630624325364e-08, "sampling/sampling_logp_difference/max": 2.09226131439209, "sampling/sampling_logp_difference/mean": 0.2649378180503845, "step": 1751, "step_time": 14.150510630002827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5678999423980713, "epoch": 0.00876, "grad_norm": 0.0355399064719677, "kl": 0.22718586400151253, "learning_rate": 7.999727494659e-06, "loss": -0.0775, "step": 1752, "step_time": 6.354545387017424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 6.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.38727530837059, "epoch": 0.008765, "frac_reward_zero_std": 0.0, "grad_norm": 0.04862470179796219, "kl": 0.17129520792514086, "learning_rate": 7.999727176965982e-06, "loss": -0.0922, "num_tokens": 23514737.0, "reward": -0.01284557580947876, "reward_std": 1.3738621473312378, "rewards/rollout_reward_func/mean": -0.01284557580947876, "rewards/rollout_reward_func/std": 1.3738621473312378, "sampling/importance_sampling_ratio/max": 1.1114602088928223, "sampling/importance_sampling_ratio/mean": 0.5236203670501709, "sampling/importance_sampling_ratio/min": 2.5072749849641696e-05, "sampling/sampling_logp_difference/max": 1.7233946323394775, "sampling/sampling_logp_difference/mean": 0.41858971118927, "step": 1753, "step_time": 16.148534708001534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4085163474082947, "epoch": 0.00877, "grad_norm": 0.055399686098098755, "kl": 0.17026417888700962, "learning_rate": 7.999726859087891e-06, "loss": -0.0923, "step": 1754, "step_time": 6.376120417990023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 6.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.002509370446205, "epoch": 0.008775, "frac_reward_zero_std": 0.0, "grad_norm": 0.04135557636618614, "kl": 0.34629736095666885, "learning_rate": 7.999726541024731e-06, "loss": -0.0864, "num_tokens": 23537795.0, "reward": 0.9813976883888245, "reward_std": 1.3370296955108643, "rewards/rollout_reward_func/mean": 0.9813976883888245, "rewards/rollout_reward_func/std": 1.3370296955108643, "sampling/importance_sampling_ratio/max": 1.046859622001648, "sampling/importance_sampling_ratio/mean": 0.6168321371078491, "sampling/importance_sampling_ratio/min": 3.8018224586267024e-06, "sampling/sampling_logp_difference/max": 1.9747002124786377, "sampling/sampling_logp_difference/mean": 0.34047749638557434, "step": 1755, "step_time": 14.355380092994892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.001835584640503, "epoch": 0.00878, "grad_norm": 0.04073713347315788, "kl": 0.33811117708683014, "learning_rate": 7.999726222776499e-06, "loss": -0.0865, "step": 1756, "step_time": 6.00365175998013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.586658887565136, "epoch": 0.008785, "frac_reward_zero_std": 0.0, "grad_norm": 0.07284241914749146, "kl": 0.1635245569050312, "learning_rate": 7.999725904343197e-06, "loss": -0.0657, "num_tokens": 23562963.0, "reward": -0.3686370551586151, "reward_std": 0.980696439743042, "rewards/rollout_reward_func/mean": -0.3686370551586151, "rewards/rollout_reward_func/std": 0.9806964993476868, "sampling/importance_sampling_ratio/max": 1.1489458084106445, "sampling/importance_sampling_ratio/mean": 0.6622767448425293, "sampling/importance_sampling_ratio/min": 5.482302003656514e-05, "sampling/sampling_logp_difference/max": 1.7358262538909912, "sampling/sampling_logp_difference/mean": 0.2502013146877289, "step": 1757, "step_time": 15.343355335004162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5952179953455925, "epoch": 0.00879, "grad_norm": 0.0856577679514885, "kl": 0.1623230315744877, "learning_rate": 7.999725585724824e-06, "loss": -0.0662, "step": 1758, "step_time": 5.830644336019759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8511468842625618, "epoch": 0.008795, "frac_reward_zero_std": 0.0, "grad_norm": 0.023600498214364052, "kl": 0.4800524339079857, "learning_rate": 7.999725266921382e-06, "loss": -0.0734, "num_tokens": 23579196.0, "reward": 1.7833526134490967, "reward_std": 0.7453609108924866, "rewards/rollout_reward_func/mean": 1.7833526134490967, "rewards/rollout_reward_func/std": 0.7453609108924866, "sampling/importance_sampling_ratio/max": 1.0515499114990234, "sampling/importance_sampling_ratio/mean": 0.8462116718292236, "sampling/importance_sampling_ratio/min": 3.7249919841997325e-05, "sampling/sampling_logp_difference/max": 1.8627434968948364, "sampling/sampling_logp_difference/mean": 0.2083597183227539, "step": 1759, "step_time": 5.875055826996686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8489615730941296, "epoch": 0.0088, "grad_norm": 0.02406259998679161, "kl": 0.47187910974025726, "learning_rate": 7.999724947932869e-06, "loss": -0.0734, "step": 1760, "step_time": 3.1763641300058225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 6.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.566763609647751, "epoch": 0.008805, "frac_reward_zero_std": 0.0, "grad_norm": 0.16346320509910583, "kl": 0.3484414666891098, "learning_rate": 7.999724628759285e-06, "loss": -0.0566, "num_tokens": 23602691.0, "reward": 0.5988425016403198, "reward_std": 1.437261939048767, "rewards/rollout_reward_func/mean": 0.5988425016403198, "rewards/rollout_reward_func/std": 1.437261939048767, "sampling/importance_sampling_ratio/max": 1.0419639348983765, "sampling/importance_sampling_ratio/mean": 0.5552722215652466, "sampling/importance_sampling_ratio/min": 2.2107144559413427e-06, "sampling/sampling_logp_difference/max": 1.936450719833374, "sampling/sampling_logp_difference/mean": 0.46772003173828125, "step": 1761, "step_time": 16.169907887015142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007352941203862429, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "entropy": 2.568452388048172, "epoch": 0.00881, "grad_norm": 0.05912833660840988, "kl": 0.347599383443594, "learning_rate": 7.99972430940063e-06, "loss": -0.0573, "step": 1762, "step_time": 6.247982315995614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.2559810876846313, "epoch": 0.008815, "frac_reward_zero_std": 0.0, "grad_norm": 0.27008190751075745, "kl": 2.3094046507030725, "learning_rate": 7.999723989856905e-06, "loss": -0.0347, "num_tokens": 23630426.0, "reward": -0.874285101890564, "reward_std": 0.30500635504722595, "rewards/rollout_reward_func/mean": -0.874285101890564, "rewards/rollout_reward_func/std": 0.30500635504722595, "sampling/importance_sampling_ratio/max": 1.0747618675231934, "sampling/importance_sampling_ratio/mean": 0.2917868494987488, "sampling/importance_sampling_ratio/min": 1.422615092394608e-08, "sampling/sampling_logp_difference/max": 2.285433530807495, "sampling/sampling_logp_difference/mean": 0.49550408124923706, "step": 1763, "step_time": 18.442089561998728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2607738375663757, "epoch": 0.00882, "grad_norm": 0.23125579953193665, "kl": 1.9341701809316874, "learning_rate": 7.999723670128109e-06, "loss": -0.0358, "step": 1764, "step_time": 6.887846682991949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.663814067840576, "epoch": 0.008825, "frac_reward_zero_std": 0.0, "grad_norm": 0.039463359862565994, "kl": 0.11554528586566448, "learning_rate": 7.999723350214244e-06, "loss": -0.1079, "num_tokens": 23666385.0, "reward": 0.2297719419002533, "reward_std": 1.21501624584198, "rewards/rollout_reward_func/mean": 0.2297719419002533, "rewards/rollout_reward_func/std": 1.21501624584198, "sampling/importance_sampling_ratio/max": 1.0885123014450073, "sampling/importance_sampling_ratio/mean": 0.4569852650165558, "sampling/importance_sampling_ratio/min": 9.497991413809359e-06, "sampling/sampling_logp_difference/max": 1.667852520942688, "sampling/sampling_logp_difference/mean": 0.39942651987075806, "step": 1765, "step_time": 18.865322571000434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6662025153636932, "epoch": 0.00883, "grad_norm": 0.03897828608751297, "kl": 0.11688387952744961, "learning_rate": 7.999723030115306e-06, "loss": -0.1079, "step": 1766, "step_time": 7.812624140002299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 6.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7840569615364075, "epoch": 0.008835, "frac_reward_zero_std": 0.0, "grad_norm": 0.04589826986193657, "kl": 0.15074944123625755, "learning_rate": 7.9997227098313e-06, "loss": -0.1155, "num_tokens": 23697444.0, "reward": 0.19981913268566132, "reward_std": 1.2949473857879639, "rewards/rollout_reward_func/mean": 0.19981913268566132, "rewards/rollout_reward_func/std": 1.2949473857879639, "sampling/importance_sampling_ratio/max": 1.1825915575027466, "sampling/importance_sampling_ratio/mean": 0.42722946405410767, "sampling/importance_sampling_ratio/min": 9.186000293759378e-10, "sampling/sampling_logp_difference/max": 2.8730478286743164, "sampling/sampling_logp_difference/mean": 0.5002427101135254, "step": 1767, "step_time": 18.136215077000088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.78469055891037, "epoch": 0.00884, "grad_norm": 0.045082803815603256, "kl": 0.1537411529570818, "learning_rate": 7.999722389362224e-06, "loss": -0.1155, "step": 1768, "step_time": 6.818940658966312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 6.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.633365958929062, "epoch": 0.008845, "frac_reward_zero_std": 0.0, "grad_norm": 0.09401761740446091, "kl": 0.2214627042412758, "learning_rate": 7.999722068708076e-06, "loss": -0.1048, "num_tokens": 23727827.0, "reward": 0.1637152135372162, "reward_std": 1.2666449546813965, "rewards/rollout_reward_func/mean": 0.1637152135372162, "rewards/rollout_reward_func/std": 1.2666449546813965, "sampling/importance_sampling_ratio/max": 1.1274794340133667, "sampling/importance_sampling_ratio/mean": 0.5073388814926147, "sampling/importance_sampling_ratio/min": 5.141263272889773e-07, "sampling/sampling_logp_difference/max": 1.8313456773757935, "sampling/sampling_logp_difference/mean": 0.4218447208404541, "step": 1769, "step_time": 15.090000488999067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.646487683057785, "epoch": 0.00885, "grad_norm": 0.08544685691595078, "kl": 0.2295696958899498, "learning_rate": 7.999721747868858e-06, "loss": -0.1052, "step": 1770, "step_time": 6.336266368001816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.818181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6148168668150902, "epoch": 0.008855, "frac_reward_zero_std": 0.0, "grad_norm": 0.05351094901561737, "kl": 0.2902985643595457, "learning_rate": 7.999721426844571e-06, "loss": -0.0816, "num_tokens": 23760983.0, "reward": 0.6574940085411072, "reward_std": 1.183972716331482, "rewards/rollout_reward_func/mean": 0.6574940085411072, "rewards/rollout_reward_func/std": 1.1839728355407715, "sampling/importance_sampling_ratio/max": 1.1508342027664185, "sampling/importance_sampling_ratio/mean": 0.6001605987548828, "sampling/importance_sampling_ratio/min": 0.0005518730613403022, "sampling/sampling_logp_difference/max": 1.4147017002105713, "sampling/sampling_logp_difference/mean": 0.2957077920436859, "step": 1771, "step_time": 16.677775111995288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.60451565310359, "epoch": 0.00886, "grad_norm": 0.05034719780087471, "kl": 0.2862442210316658, "learning_rate": 7.999721105635212e-06, "loss": -0.0818, "step": 1772, "step_time": 7.642810254008509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.1875, "completions/mean_terminated_length": 5.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5935813188552856, "epoch": 0.008865, "frac_reward_zero_std": 0.0, "grad_norm": 0.013642370700836182, "kl": 0.16572466492652893, "learning_rate": 7.999720784240784e-06, "loss": -0.0865, "num_tokens": 23794718.0, "reward": 0.2116279900074005, "reward_std": 1.2896157503128052, "rewards/rollout_reward_func/mean": 0.2116279900074005, "rewards/rollout_reward_func/std": 1.2896158695220947, "sampling/importance_sampling_ratio/max": 1.0161727666854858, "sampling/importance_sampling_ratio/mean": 0.3982408046722412, "sampling/importance_sampling_ratio/min": 1.424871776123382e-08, "sampling/sampling_logp_difference/max": 2.5636696815490723, "sampling/sampling_logp_difference/mean": 0.4512992203235626, "step": 1773, "step_time": 18.309475851987372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5870319604873657, "epoch": 0.00887, "grad_norm": 0.012136915698647499, "kl": 0.17226980440318584, "learning_rate": 7.999720462661285e-06, "loss": -0.0866, "step": 1774, "step_time": 7.005778873019153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8758715987205505, "epoch": 0.008875, "frac_reward_zero_std": 0.0, "grad_norm": 0.299597829580307, "kl": 0.34651424176990986, "learning_rate": 7.999720140896716e-06, "loss": -0.0602, "num_tokens": 23833752.0, "reward": -0.3247385621070862, "reward_std": 0.7202039361000061, "rewards/rollout_reward_func/mean": -0.3247385621070862, "rewards/rollout_reward_func/std": 0.7202039957046509, "sampling/importance_sampling_ratio/max": 1.306133508682251, "sampling/importance_sampling_ratio/mean": 0.4296228885650635, "sampling/importance_sampling_ratio/min": 1.3212486010161228e-05, "sampling/sampling_logp_difference/max": 1.8382139205932617, "sampling/sampling_logp_difference/mean": 0.4205966293811798, "step": 1775, "step_time": 20.02017117200012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8627471029758453, "epoch": 0.00888, "grad_norm": 0.18860755860805511, "kl": 0.3416178748011589, "learning_rate": 7.999719818947077e-06, "loss": -0.0618, "step": 1776, "step_time": 8.335441873976379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9257512539625168, "epoch": 0.008885, "frac_reward_zero_std": 0.0, "grad_norm": 0.05064960569143295, "kl": 0.3144494239240885, "learning_rate": 7.999719496812368e-06, "loss": -0.1057, "num_tokens": 23857102.0, "reward": 1.085752248764038, "reward_std": 1.1775153875350952, "rewards/rollout_reward_func/mean": 1.085752248764038, "rewards/rollout_reward_func/std": 1.1775153875350952, "sampling/importance_sampling_ratio/max": 1.0664949417114258, "sampling/importance_sampling_ratio/mean": 0.6541175842285156, "sampling/importance_sampling_ratio/min": 8.187381439483943e-08, "sampling/sampling_logp_difference/max": 2.1936655044555664, "sampling/sampling_logp_difference/mean": 0.4738926291465759, "step": 1777, "step_time": 15.161181904011755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.920553632080555, "epoch": 0.00889, "grad_norm": 0.044274378567934036, "kl": 0.3271143827587366, "learning_rate": 7.999719174492588e-06, "loss": -0.106, "step": 1778, "step_time": 5.861567997984821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.6875, "completions/mean_terminated_length": 5.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.0757301449775696, "epoch": 0.008895, "frac_reward_zero_std": 0.0, "grad_norm": 0.1400042176246643, "kl": 0.36047128215432167, "learning_rate": 7.999718851987738e-06, "loss": -0.0742, "num_tokens": 23890991.0, "reward": 0.02897367626428604, "reward_std": 1.052493929862976, "rewards/rollout_reward_func/mean": 0.02897367626428604, "rewards/rollout_reward_func/std": 1.0524938106536865, "sampling/importance_sampling_ratio/max": 1.0835473537445068, "sampling/importance_sampling_ratio/mean": 0.39696604013442993, "sampling/importance_sampling_ratio/min": 5.164704361959593e-07, "sampling/sampling_logp_difference/max": 2.1739017963409424, "sampling/sampling_logp_difference/mean": 0.4844551086425781, "step": 1779, "step_time": 19.099599657027284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.076458901166916, "epoch": 0.0089, "grad_norm": 0.14261092245578766, "kl": 0.3728668624535203, "learning_rate": 7.999718529297817e-06, "loss": -0.0742, "step": 1780, "step_time": 7.1486331230116775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.0625, "completions/mean_terminated_length": 6.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.815543830394745, "epoch": 0.008905, "frac_reward_zero_std": 0.0, "grad_norm": 0.049048278480768204, "kl": 0.1546869669109583, "learning_rate": 7.999718206422829e-06, "loss": -0.0452, "num_tokens": 23924493.0, "reward": -0.6743718385696411, "reward_std": 0.1727137565612793, "rewards/rollout_reward_func/mean": -0.6743718385696411, "rewards/rollout_reward_func/std": 0.1727137565612793, "sampling/importance_sampling_ratio/max": 1.0483437776565552, "sampling/importance_sampling_ratio/mean": 0.38549351692199707, "sampling/importance_sampling_ratio/min": 5.1647589316417e-07, "sampling/sampling_logp_difference/max": 1.58700692653656, "sampling/sampling_logp_difference/mean": 0.39341962337493896, "step": 1781, "step_time": 18.78934683300031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.806711435317993, "epoch": 0.00891, "grad_norm": 0.034365180879831314, "kl": 0.16658620350062847, "learning_rate": 7.999717883362769e-06, "loss": -0.0454, "step": 1782, "step_time": 7.186972252995474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.4375, "completions/mean_terminated_length": 4.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.028565227985382, "epoch": 0.008915, "frac_reward_zero_std": 0.0, "grad_norm": 0.039852529764175415, "kl": 0.15126553364098072, "learning_rate": 7.999717560117638e-06, "loss": -0.0939, "num_tokens": 23954068.0, "reward": 0.16006693243980408, "reward_std": 1.2729148864746094, "rewards/rollout_reward_func/mean": 0.16006693243980408, "rewards/rollout_reward_func/std": 1.2729148864746094, "sampling/importance_sampling_ratio/max": 1.0467764139175415, "sampling/importance_sampling_ratio/mean": 0.39670416712760925, "sampling/importance_sampling_ratio/min": 8.217556413114835e-09, "sampling/sampling_logp_difference/max": 2.0828075408935547, "sampling/sampling_logp_difference/mean": 0.4840877056121826, "step": 1783, "step_time": 16.75521124099032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.020302653312683, "epoch": 0.00892, "grad_norm": 0.03533787652850151, "kl": 0.15163713414222002, "learning_rate": 7.999717236687438e-06, "loss": -0.094, "step": 1784, "step_time": 5.887993598007597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9779400303959846, "epoch": 0.008925, "frac_reward_zero_std": 0.0, "grad_norm": 0.1596924364566803, "kl": 0.21439114585518837, "learning_rate": 7.999716913072169e-06, "loss": -0.0555, "num_tokens": 23979549.0, "reward": 0.5632205605506897, "reward_std": 1.049818515777588, "rewards/rollout_reward_func/mean": 0.5632205605506897, "rewards/rollout_reward_func/std": 1.049818515777588, "sampling/importance_sampling_ratio/max": 1.2174863815307617, "sampling/importance_sampling_ratio/mean": 0.7181516289710999, "sampling/importance_sampling_ratio/min": 5.838818992742745e-07, "sampling/sampling_logp_difference/max": 1.9623008966445923, "sampling/sampling_logp_difference/mean": 0.39999088644981384, "step": 1785, "step_time": 14.354271044983761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9675613790750504, "epoch": 0.00893, "grad_norm": 0.1606082171201706, "kl": 0.2125803343951702, "learning_rate": 7.999716589271827e-06, "loss": -0.0557, "step": 1786, "step_time": 6.389316610991955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 3.909090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3061352968215942, "epoch": 0.008935, "frac_reward_zero_std": 0.0, "grad_norm": 0.06506135314702988, "kl": 0.5601161401718855, "learning_rate": 7.999716265286417e-06, "loss": -0.1088, "num_tokens": 24011667.0, "reward": 0.6431361436843872, "reward_std": 1.276309609413147, "rewards/rollout_reward_func/mean": 0.6431361436843872, "rewards/rollout_reward_func/std": 1.276309609413147, "sampling/importance_sampling_ratio/max": 1.062459111213684, "sampling/importance_sampling_ratio/mean": 0.5860253572463989, "sampling/importance_sampling_ratio/min": 1.8471595808478014e-08, "sampling/sampling_logp_difference/max": 2.2984256744384766, "sampling/sampling_logp_difference/mean": 0.3749130666255951, "step": 1787, "step_time": 17.242658443006803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.30890029668808, "epoch": 0.00894, "grad_norm": 0.06397345662117004, "kl": 0.5608318820595741, "learning_rate": 7.999715941115938e-06, "loss": -0.1089, "step": 1788, "step_time": 6.925218938980834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0450235307216644, "epoch": 0.008945, "frac_reward_zero_std": 0.0, "grad_norm": 0.11611475795507431, "kl": 0.23351215198636055, "learning_rate": 7.999715616760388e-06, "loss": -0.1085, "num_tokens": 24048099.0, "reward": 0.5551034212112427, "reward_std": 1.1075598001480103, "rewards/rollout_reward_func/mean": 0.5551034212112427, "rewards/rollout_reward_func/std": 1.1075598001480103, "sampling/importance_sampling_ratio/max": 1.144329309463501, "sampling/importance_sampling_ratio/mean": 0.590406060218811, "sampling/importance_sampling_ratio/min": 4.7205576265696436e-05, "sampling/sampling_logp_difference/max": 2.0564165115356445, "sampling/sampling_logp_difference/mean": 0.34486883878707886, "step": 1789, "step_time": 21.205988056011847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0403133630752563, "epoch": 0.00895, "grad_norm": 0.0985233336687088, "kl": 0.23474252969026566, "learning_rate": 7.999715292219767e-06, "loss": -0.1087, "step": 1790, "step_time": 8.526020930992672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8354894425719976, "epoch": 0.008955, "frac_reward_zero_std": 0.0, "grad_norm": 0.09860531240701675, "kl": 0.2686939090490341, "learning_rate": 7.999714967494077e-06, "loss": -0.0842, "num_tokens": 24075354.0, "reward": -0.24295854568481445, "reward_std": 1.018125057220459, "rewards/rollout_reward_func/mean": -0.24295854568481445, "rewards/rollout_reward_func/std": 1.0181251764297485, "sampling/importance_sampling_ratio/max": 1.0500012636184692, "sampling/importance_sampling_ratio/mean": 0.7316128015518188, "sampling/importance_sampling_ratio/min": 2.0597499315044843e-06, "sampling/sampling_logp_difference/max": 2.1888771057128906, "sampling/sampling_logp_difference/mean": 0.38169005513191223, "step": 1791, "step_time": 16.81033577400376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8343748971819878, "epoch": 0.00896, "grad_norm": 0.09639162570238113, "kl": 0.2711078152060509, "learning_rate": 7.999714642583318e-06, "loss": -0.0845, "step": 1792, "step_time": 7.064636592025636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.369744151830673, "epoch": 0.008965, "frac_reward_zero_std": 0.0, "grad_norm": 0.0355338491499424, "kl": 0.37240245193243027, "learning_rate": 7.999714317487487e-06, "loss": -0.0953, "num_tokens": 24099135.0, "reward": 1.0809129476547241, "reward_std": 1.2616959810256958, "rewards/rollout_reward_func/mean": 1.0809129476547241, "rewards/rollout_reward_func/std": 1.2616958618164062, "sampling/importance_sampling_ratio/max": 1.0670477151870728, "sampling/importance_sampling_ratio/mean": 0.6371211409568787, "sampling/importance_sampling_ratio/min": 5.5406307097882745e-08, "sampling/sampling_logp_difference/max": 2.7580740451812744, "sampling/sampling_logp_difference/mean": 0.39039960503578186, "step": 1793, "step_time": 16.02847187097359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.371639519929886, "epoch": 0.00897, "grad_norm": 0.036602456122636795, "kl": 0.3792453855276108, "learning_rate": 7.999713992206588e-06, "loss": -0.0953, "step": 1794, "step_time": 6.3792870699980995 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1522273570299149, "epoch": 0.008975, "frac_reward_zero_std": 0.0, "grad_norm": 0.18621528148651123, "kl": 0.21192031726241112, "learning_rate": 7.999713666740619e-06, "loss": -0.0569, "num_tokens": 24125290.0, "reward": -0.025417685508728027, "reward_std": 1.2585078477859497, "rewards/rollout_reward_func/mean": -0.025417685508728027, "rewards/rollout_reward_func/std": 1.2585078477859497, "sampling/importance_sampling_ratio/max": 1.140343427658081, "sampling/importance_sampling_ratio/mean": 0.7807074785232544, "sampling/importance_sampling_ratio/min": 0.0024375291541218758, "sampling/sampling_logp_difference/max": 1.3152270317077637, "sampling/sampling_logp_difference/mean": 0.1872694194316864, "step": 1795, "step_time": 14.049425210017944 }, { "clip_ratio/high_max": 0.043750000186264515, "clip_ratio/high_mean": 0.021875000093132257, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000093132257, "entropy": 1.1422792300581932, "epoch": 0.00898, "grad_norm": 0.14010556042194366, "kl": 0.21584652550518513, "learning_rate": 7.99971334108958e-06, "loss": -0.0575, "step": 1796, "step_time": 6.314680991999921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2675081342458725, "epoch": 0.008985, "frac_reward_zero_std": 0.0, "grad_norm": 0.17582136392593384, "kl": 0.25606338679790497, "learning_rate": 7.99971301525347e-06, "loss": -0.0404, "num_tokens": 24157914.0, "reward": -0.11471038311719894, "reward_std": 1.0785109996795654, "rewards/rollout_reward_func/mean": -0.11471038311719894, "rewards/rollout_reward_func/std": 1.0785109996795654, "sampling/importance_sampling_ratio/max": 1.0490447282791138, "sampling/importance_sampling_ratio/mean": 0.7439786195755005, "sampling/importance_sampling_ratio/min": 0.0008900067768990993, "sampling/sampling_logp_difference/max": 1.178931713104248, "sampling/sampling_logp_difference/mean": 0.19280906021595, "step": 1797, "step_time": 17.730057085005683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2698195204138756, "epoch": 0.00899, "grad_norm": 0.15914343297481537, "kl": 0.25871771946549416, "learning_rate": 7.999712689232292e-06, "loss": -0.0407, "step": 1798, "step_time": 6.973412990992074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9089507609605789, "epoch": 0.008995, "frac_reward_zero_std": 0.5, "grad_norm": 0.05575894936919212, "kl": 0.8401067703962326, "learning_rate": 7.999712363026044e-06, "loss": -0.037, "num_tokens": 24175794.0, "reward": 1.4254419803619385, "reward_std": 1.093088150024414, "rewards/rollout_reward_func/mean": 1.4254419803619385, "rewards/rollout_reward_func/std": 1.093088150024414, "sampling/importance_sampling_ratio/max": 1.0363491773605347, "sampling/importance_sampling_ratio/mean": 0.8313016891479492, "sampling/importance_sampling_ratio/min": 0.00018609709513839334, "sampling/sampling_logp_difference/max": 1.6556577682495117, "sampling/sampling_logp_difference/mean": 0.13791553676128387, "step": 1799, "step_time": 9.773391067006742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9083101935684681, "epoch": 0.009, "grad_norm": 0.05501062050461769, "kl": 0.841185238212347, "learning_rate": 7.999712036634726e-06, "loss": -0.0371, "step": 1800, "step_time": 4.832943568981136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8709042966365814, "epoch": 0.009005, "frac_reward_zero_std": 0.0, "grad_norm": 0.06971549242734909, "kl": 0.49035361781716347, "learning_rate": 7.999711710058339e-06, "loss": -0.0951, "num_tokens": 24209166.0, "reward": 0.41596126556396484, "reward_std": 1.193223476409912, "rewards/rollout_reward_func/mean": 0.41596126556396484, "rewards/rollout_reward_func/std": 1.1932235956192017, "sampling/importance_sampling_ratio/max": 1.0721395015716553, "sampling/importance_sampling_ratio/mean": 0.44880616664886475, "sampling/importance_sampling_ratio/min": 3.02793779383137e-07, "sampling/sampling_logp_difference/max": 2.1524596214294434, "sampling/sampling_logp_difference/mean": 0.4429340362548828, "step": 1801, "step_time": 17.00744607600791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8678578436374664, "epoch": 0.00901, "grad_norm": 0.08157045394182205, "kl": 0.5348483771085739, "learning_rate": 7.99971138329688e-06, "loss": -0.0949, "step": 1802, "step_time": 6.916240185004426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 4.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.091934099793434, "epoch": 0.009015, "frac_reward_zero_std": 0.0, "grad_norm": 0.0241920817643404, "kl": 0.2421383373439312, "learning_rate": 7.999711056350353e-06, "loss": -0.0842, "num_tokens": 24237881.0, "reward": 0.9441750645637512, "reward_std": 1.1541255712509155, "rewards/rollout_reward_func/mean": 0.9441750645637512, "rewards/rollout_reward_func/std": 1.154125690460205, "sampling/importance_sampling_ratio/max": 1.1104158163070679, "sampling/importance_sampling_ratio/mean": 0.6454601287841797, "sampling/importance_sampling_ratio/min": 2.6865098590178604e-08, "sampling/sampling_logp_difference/max": 2.3325467109680176, "sampling/sampling_logp_difference/mean": 0.4487351179122925, "step": 1803, "step_time": 17.503197956990334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0906338710337877, "epoch": 0.00902, "grad_norm": 0.02398546226322651, "kl": 0.23915202915668488, "learning_rate": 7.999710729218757e-06, "loss": -0.0843, "step": 1804, "step_time": 6.576690427973517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3264424037188292, "epoch": 0.009025, "frac_reward_zero_std": 0.5, "grad_norm": 0.025882698595523834, "kl": 0.2847226560115814, "learning_rate": 7.99971040190209e-06, "loss": -0.0443, "num_tokens": 24264651.0, "reward": 0.9896903038024902, "reward_std": 1.2760454416275024, "rewards/rollout_reward_func/mean": 0.9896903038024902, "rewards/rollout_reward_func/std": 1.276045560836792, "sampling/importance_sampling_ratio/max": 1.0185167789459229, "sampling/importance_sampling_ratio/mean": 0.7062822580337524, "sampling/importance_sampling_ratio/min": 3.0068480327827274e-07, "sampling/sampling_logp_difference/max": 1.8621506690979004, "sampling/sampling_logp_difference/mean": 0.2318859100341797, "step": 1805, "step_time": 16.557769063016167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.324466459453106, "epoch": 0.00903, "grad_norm": 0.0328187420964241, "kl": 0.2903209328651428, "learning_rate": 7.999710074400353e-06, "loss": -0.0442, "step": 1806, "step_time": 6.442001735995291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8563827574253082, "epoch": 0.009035, "frac_reward_zero_std": 0.5, "grad_norm": 0.02962682768702507, "kl": 0.4185131564736366, "learning_rate": 7.999709746713548e-06, "loss": -0.0495, "num_tokens": 24282729.0, "reward": 1.638691782951355, "reward_std": 0.9880918860435486, "rewards/rollout_reward_func/mean": 1.638691782951355, "rewards/rollout_reward_func/std": 0.9880918860435486, "sampling/importance_sampling_ratio/max": 1.0198147296905518, "sampling/importance_sampling_ratio/mean": 0.8850303888320923, "sampling/importance_sampling_ratio/min": 0.0011190782533958554, "sampling/sampling_logp_difference/max": 1.7525174617767334, "sampling/sampling_logp_difference/mean": 0.14137203991413116, "step": 1807, "step_time": 9.755877337986021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8554271012544632, "epoch": 0.00904, "grad_norm": 0.028073105961084366, "kl": 0.4030712377279997, "learning_rate": 7.999709418841672e-06, "loss": -0.0496, "step": 1808, "step_time": 4.863610901971697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 5.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6928857564926147, "epoch": 0.009045, "frac_reward_zero_std": 0.0, "grad_norm": 0.007767301518470049, "kl": 0.19792740419507027, "learning_rate": 7.999709090784727e-06, "loss": -0.104, "num_tokens": 24311470.0, "reward": 0.7660180330276489, "reward_std": 1.2185697555541992, "rewards/rollout_reward_func/mean": 0.7660180330276489, "rewards/rollout_reward_func/std": 1.2185697555541992, "sampling/importance_sampling_ratio/max": 1.0167378187179565, "sampling/importance_sampling_ratio/mean": 0.6952282190322876, "sampling/importance_sampling_ratio/min": 4.982126029062783e-06, "sampling/sampling_logp_difference/max": 1.7254548072814941, "sampling/sampling_logp_difference/mean": 0.3525186777114868, "step": 1809, "step_time": 14.243147209010203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6951910257339478, "epoch": 0.00905, "grad_norm": 0.007833235897123814, "kl": 0.19718455895781517, "learning_rate": 7.999708762542714e-06, "loss": -0.104, "step": 1810, "step_time": 6.225308749999385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.9890369176864624, "epoch": 0.009055, "frac_reward_zero_std": 0.0, "grad_norm": 0.006724560633301735, "kl": 0.061023795045912266, "learning_rate": 7.99970843411563e-06, "loss": -0.0581, "num_tokens": 24343621.0, "reward": -0.6588042974472046, "reward_std": 0.708345890045166, "rewards/rollout_reward_func/mean": -0.6588042974472046, "rewards/rollout_reward_func/std": 0.708345890045166, "sampling/importance_sampling_ratio/max": 1.024430274963379, "sampling/importance_sampling_ratio/mean": 0.1917218416929245, "sampling/importance_sampling_ratio/min": 2.522298636620235e-08, "sampling/sampling_logp_difference/max": 2.637843608856201, "sampling/sampling_logp_difference/mean": 0.5683391094207764, "step": 1811, "step_time": 19.001895390014397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9866297841072083, "epoch": 0.00906, "grad_norm": 0.00661025196313858, "kl": 0.060965880285948515, "learning_rate": 7.999708105503477e-06, "loss": -0.0581, "step": 1812, "step_time": 6.872613302999525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.2227835655212402, "epoch": 0.009065, "frac_reward_zero_std": 0.0, "grad_norm": 0.09028998762369156, "kl": 0.1207317691296339, "learning_rate": 7.999707776706253e-06, "loss": -0.0794, "num_tokens": 24381741.0, "reward": -0.3729805052280426, "reward_std": 0.7031456828117371, "rewards/rollout_reward_func/mean": -0.3729805052280426, "rewards/rollout_reward_func/std": 0.7031457424163818, "sampling/importance_sampling_ratio/max": 1.0362303256988525, "sampling/importance_sampling_ratio/mean": 0.3549041450023651, "sampling/importance_sampling_ratio/min": 1.1785085973770038e-07, "sampling/sampling_logp_difference/max": 2.3276219367980957, "sampling/sampling_logp_difference/mean": 0.5110526084899902, "step": 1813, "step_time": 19.928492602994083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.222811698913574, "epoch": 0.00907, "grad_norm": 0.09578948467969894, "kl": 0.11934922821819782, "learning_rate": 7.999707447723962e-06, "loss": -0.0793, "step": 1814, "step_time": 8.355803958998877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3379288632422686, "epoch": 0.009075, "frac_reward_zero_std": 0.0, "grad_norm": 0.0227569080889225, "kl": 0.18208902701735497, "learning_rate": 7.9997071185566e-06, "loss": -0.049, "num_tokens": 24411527.0, "reward": 0.4963322877883911, "reward_std": 1.1609383821487427, "rewards/rollout_reward_func/mean": 0.4963322877883911, "rewards/rollout_reward_func/std": 1.1609383821487427, "sampling/importance_sampling_ratio/max": 1.0695888996124268, "sampling/importance_sampling_ratio/mean": 0.7642654776573181, "sampling/importance_sampling_ratio/min": 8.339581108884886e-05, "sampling/sampling_logp_difference/max": 1.3589808940887451, "sampling/sampling_logp_difference/mean": 0.2210192084312439, "step": 1815, "step_time": 17.2012150859955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.332880461588502, "epoch": 0.00908, "grad_norm": 0.025345630943775177, "kl": 0.18215832486748695, "learning_rate": 7.999706789204169e-06, "loss": -0.049, "step": 1816, "step_time": 6.808454956000787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 5.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.38949179276824, "epoch": 0.009085, "frac_reward_zero_std": 0.0, "grad_norm": 0.034697894006967545, "kl": 0.5088633820414543, "learning_rate": 7.999706459666668e-06, "loss": -0.0872, "num_tokens": 24428614.0, "reward": 0.4059711694717407, "reward_std": 1.4539945125579834, "rewards/rollout_reward_func/mean": 0.4059711694717407, "rewards/rollout_reward_func/std": 1.453994631767273, "sampling/importance_sampling_ratio/max": 1.0225772857666016, "sampling/importance_sampling_ratio/mean": 0.8191550970077515, "sampling/importance_sampling_ratio/min": 6.178913167786959e-07, "sampling/sampling_logp_difference/max": 1.9167544841766357, "sampling/sampling_logp_difference/mean": 0.34531843662261963, "step": 1817, "step_time": 6.301211319980212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3899837508797646, "epoch": 0.00909, "grad_norm": 0.03632527217268944, "kl": 0.5187177062034607, "learning_rate": 7.999706129944099e-06, "loss": -0.0871, "step": 1818, "step_time": 3.359108673001174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 5.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.633795291185379, "epoch": 0.009095, "frac_reward_zero_std": 0.0, "grad_norm": 0.02037726156413555, "kl": 0.22495919838547707, "learning_rate": 7.99970580003646e-06, "loss": -0.0731, "num_tokens": 24452301.0, "reward": 0.6786975860595703, "reward_std": 1.4048999547958374, "rewards/rollout_reward_func/mean": 0.6786975860595703, "rewards/rollout_reward_func/std": 1.4048999547958374, "sampling/importance_sampling_ratio/max": 1.0375570058822632, "sampling/importance_sampling_ratio/mean": 0.5128546357154846, "sampling/importance_sampling_ratio/min": 1.1258283905135613e-07, "sampling/sampling_logp_difference/max": 1.9336532354354858, "sampling/sampling_logp_difference/mean": 0.4731277823448181, "step": 1819, "step_time": 16.279643598987604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.625021368265152, "epoch": 0.0091, "grad_norm": 0.02009558491408825, "kl": 0.229464303702116, "learning_rate": 7.999705469943751e-06, "loss": -0.0732, "step": 1820, "step_time": 6.206655154019245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7248239107429981, "epoch": 0.009105, "frac_reward_zero_std": 0.0, "grad_norm": 0.09988810122013092, "kl": 0.29175156354904175, "learning_rate": 7.999705139665973e-06, "loss": -0.062, "num_tokens": 24478382.0, "reward": 0.17820316553115845, "reward_std": 1.206908941268921, "rewards/rollout_reward_func/mean": 0.17820316553115845, "rewards/rollout_reward_func/std": 1.2069090604782104, "sampling/importance_sampling_ratio/max": 1.0123255252838135, "sampling/importance_sampling_ratio/mean": 0.5165173411369324, "sampling/importance_sampling_ratio/min": 5.972855433356017e-05, "sampling/sampling_logp_difference/max": 2.204362154006958, "sampling/sampling_logp_difference/mean": 0.32549935579299927, "step": 1821, "step_time": 16.050168705987744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7186179645359516, "epoch": 0.00911, "grad_norm": 0.10318503528833389, "kl": 0.28747694194316864, "learning_rate": 7.999704809203126e-06, "loss": -0.0621, "step": 1822, "step_time": 6.263657638992299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.4375, "completions/mean_terminated_length": 4.4375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.48936135321855545, "epoch": 0.009115, "frac_reward_zero_std": 0.5, "grad_norm": 0.1807732880115509, "kl": 0.5982346683740616, "learning_rate": 7.99970447855521e-06, "loss": -0.0241, "num_tokens": 24496816.0, "reward": 1.563897728919983, "reward_std": 0.4655858874320984, "rewards/rollout_reward_func/mean": 1.563897728919983, "rewards/rollout_reward_func/std": 0.4655858874320984, "sampling/importance_sampling_ratio/max": 1.0189684629440308, "sampling/importance_sampling_ratio/mean": 0.756426215171814, "sampling/importance_sampling_ratio/min": 0.024177582934498787, "sampling/sampling_logp_difference/max": 1.4881412982940674, "sampling/sampling_logp_difference/mean": 0.13206730782985687, "step": 1823, "step_time": 11.436288871002034 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.48998066037893295, "epoch": 0.00912, "grad_norm": 0.05525499954819679, "kl": 0.6290579736232758, "learning_rate": 7.999704147722224e-06, "loss": -0.0252, "step": 1824, "step_time": 5.832121767001809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 6.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7636414617300034, "epoch": 0.009125, "frac_reward_zero_std": 0.5, "grad_norm": 0.15213598310947418, "kl": 0.4160310998558998, "learning_rate": 7.99970381670417e-06, "loss": -0.0389, "num_tokens": 24521705.0, "reward": 0.8591251969337463, "reward_std": 1.4858522415161133, "rewards/rollout_reward_func/mean": 0.8591251969337463, "rewards/rollout_reward_func/std": 1.4858522415161133, "sampling/importance_sampling_ratio/max": 1.011073350906372, "sampling/importance_sampling_ratio/mean": 0.7292159199714661, "sampling/importance_sampling_ratio/min": 1.1764669949343443e-07, "sampling/sampling_logp_difference/max": 2.3083720207214355, "sampling/sampling_logp_difference/mean": 0.3305031657218933, "step": 1825, "step_time": 15.397371264989488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.7833295837044716, "epoch": 0.00913, "grad_norm": 0.06910517066717148, "kl": 0.4100191444158554, "learning_rate": 7.999703485501046e-06, "loss": -0.0394, "step": 1826, "step_time": 6.489183146011783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.08828726969659328, "epoch": 0.009135, "frac_reward_zero_std": 1.0, "grad_norm": 0.00032487406861037016, "kl": 0.23871682584285736, "learning_rate": 7.999703154112853e-06, "loss": 0.0006, "num_tokens": 24538958.0, "reward": 1.9499473571777344, "reward_std": 0.05169421061873436, "rewards/rollout_reward_func/mean": 1.9499473571777344, "rewards/rollout_reward_func/std": 0.05169421061873436, "sampling/importance_sampling_ratio/max": 1.0245546102523804, "sampling/importance_sampling_ratio/mean": 1.0107847452163696, "sampling/importance_sampling_ratio/min": 1.0009944438934326, "sampling/sampling_logp_difference/max": 0.012092594057321548, "sampling/sampling_logp_difference/mean": 0.003641844494268298, "step": 1827, "step_time": 5.8694892859930405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08902380801737309, "epoch": 0.00914, "grad_norm": 0.00032613909570500255, "kl": 0.23864911869168282, "learning_rate": 7.999702822539591e-06, "loss": 0.0006, "step": 1828, "step_time": 3.1904721730097663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 5.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4913427978754044, "epoch": 0.009145, "frac_reward_zero_std": 0.0, "grad_norm": 0.2577131986618042, "kl": 0.12186435516923666, "learning_rate": 7.99970249078126e-06, "loss": -0.0787, "num_tokens": 24569436.0, "reward": 0.21857966482639313, "reward_std": 1.199060320854187, "rewards/rollout_reward_func/mean": 0.21857966482639313, "rewards/rollout_reward_func/std": 1.1990602016448975, "sampling/importance_sampling_ratio/max": 1.022308111190796, "sampling/importance_sampling_ratio/mean": 0.4738098382949829, "sampling/importance_sampling_ratio/min": 8.602420712122694e-05, "sampling/sampling_logp_difference/max": 1.8874773979187012, "sampling/sampling_logp_difference/mean": 0.40986958146095276, "step": 1829, "step_time": 14.999441006992129 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.4694611243903637, "epoch": 0.00915, "grad_norm": 0.059705235064029694, "kl": 0.12121691554784775, "learning_rate": 7.999702158837859e-06, "loss": -0.0797, "step": 1830, "step_time": 6.330984520012862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.190678669139743, "epoch": 0.009155, "frac_reward_zero_std": 0.0, "grad_norm": 0.10609644651412964, "kl": 0.3678971566259861, "learning_rate": 7.99970182670939e-06, "loss": -0.0879, "num_tokens": 24600906.0, "reward": 0.775770902633667, "reward_std": 1.1014208793640137, "rewards/rollout_reward_func/mean": 0.775770902633667, "rewards/rollout_reward_func/std": 1.1014208793640137, "sampling/importance_sampling_ratio/max": 1.1160134077072144, "sampling/importance_sampling_ratio/mean": 0.5403770208358765, "sampling/importance_sampling_ratio/min": 2.714332367759198e-05, "sampling/sampling_logp_difference/max": 1.9571623802185059, "sampling/sampling_logp_difference/mean": 0.4203950762748718, "step": 1831, "step_time": 15.511087985010818 }, { "clip_ratio/high_max": 0.033333334140479565, "clip_ratio/high_mean": 0.016666667070239782, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016666667070239782, "entropy": 2.1748676113784313, "epoch": 0.00916, "grad_norm": 0.04611007496714592, "kl": 0.35699841752648354, "learning_rate": 7.99970149439585e-06, "loss": -0.0883, "step": 1832, "step_time": 6.460879190010019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3067973740398884, "epoch": 0.009165, "frac_reward_zero_std": 0.0, "grad_norm": 0.1329193264245987, "kl": 0.267303504049778, "learning_rate": 7.999701161897243e-06, "loss": -0.0456, "num_tokens": 24633030.0, "reward": 0.5485278367996216, "reward_std": 1.2728372812271118, "rewards/rollout_reward_func/mean": 0.5485278367996216, "rewards/rollout_reward_func/std": 1.2728372812271118, "sampling/importance_sampling_ratio/max": 1.2048475742340088, "sampling/importance_sampling_ratio/mean": 0.7731195688247681, "sampling/importance_sampling_ratio/min": 1.7494179473942495e-06, "sampling/sampling_logp_difference/max": 1.7855466604232788, "sampling/sampling_logp_difference/mean": 0.22768133878707886, "step": 1833, "step_time": 16.802173110001604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3056569648906589, "epoch": 0.00917, "grad_norm": 0.13107876479625702, "kl": 0.2657449170947075, "learning_rate": 7.999700829213565e-06, "loss": -0.046, "step": 1834, "step_time": 8.138607118991786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 6.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.699368715286255, "epoch": 0.009175, "frac_reward_zero_std": 0.0, "grad_norm": 0.15704119205474854, "kl": 0.7327979877591133, "learning_rate": 7.99970049634482e-06, "loss": -0.0816, "num_tokens": 24661892.0, "reward": 0.09763490408658981, "reward_std": 1.3431543111801147, "rewards/rollout_reward_func/mean": 0.09763490408658981, "rewards/rollout_reward_func/std": 1.3431544303894043, "sampling/importance_sampling_ratio/max": 1.0305393934249878, "sampling/importance_sampling_ratio/mean": 0.4246494770050049, "sampling/importance_sampling_ratio/min": 1.1641289347608108e-05, "sampling/sampling_logp_difference/max": 2.3526535034179688, "sampling/sampling_logp_difference/mean": 0.4526841640472412, "step": 1835, "step_time": 14.150695278003695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.702238619327545, "epoch": 0.00918, "grad_norm": 0.17705433070659637, "kl": 0.6502725332975388, "learning_rate": 7.999700163291005e-06, "loss": -0.0821, "step": 1836, "step_time": 6.420604834987898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 4.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.631383001804352, "epoch": 0.009185, "frac_reward_zero_std": 0.0, "grad_norm": 0.21242289245128632, "kl": 1.2535035833716393, "learning_rate": 7.999699830052122e-06, "loss": -0.0121, "num_tokens": 24692392.0, "reward": -0.32330888509750366, "reward_std": 1.1208134889602661, "rewards/rollout_reward_func/mean": -0.32330888509750366, "rewards/rollout_reward_func/std": 1.1208134889602661, "sampling/importance_sampling_ratio/max": 1.0313713550567627, "sampling/importance_sampling_ratio/mean": 0.41187724471092224, "sampling/importance_sampling_ratio/min": 4.362493086773611e-07, "sampling/sampling_logp_difference/max": 1.9173946380615234, "sampling/sampling_logp_difference/mean": 0.45841848850250244, "step": 1837, "step_time": 16.149101943010464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.637677013874054, "epoch": 0.00919, "grad_norm": 0.19128550589084625, "kl": 1.0800959840416908, "learning_rate": 7.99969949662817e-06, "loss": -0.0132, "step": 1838, "step_time": 6.365300635996391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.5809510946273804, "epoch": 0.009195, "frac_reward_zero_std": 0.0, "grad_norm": 0.08319427073001862, "kl": 0.3177830073982477, "learning_rate": 7.999699163019147e-06, "loss": -0.0919, "num_tokens": 24721742.0, "reward": 0.1061684861779213, "reward_std": 1.3859946727752686, "rewards/rollout_reward_func/mean": 0.1061684861779213, "rewards/rollout_reward_func/std": 1.3859946727752686, "sampling/importance_sampling_ratio/max": 1.0126593112945557, "sampling/importance_sampling_ratio/mean": 0.3243289589881897, "sampling/importance_sampling_ratio/min": 1.2913541240777704e-06, "sampling/sampling_logp_difference/max": 1.900397777557373, "sampling/sampling_logp_difference/mean": 0.5645865201950073, "step": 1839, "step_time": 17.416003883990925 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 3.5890416502952576, "epoch": 0.0092, "grad_norm": 0.056908298283815384, "kl": 0.2635493893176317, "learning_rate": 7.999698829225057e-06, "loss": -0.0923, "step": 1840, "step_time": 6.661610897004721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9077320396900177, "epoch": 0.009205, "frac_reward_zero_std": 0.0, "grad_norm": 0.03462069481611252, "kl": 0.25980759784579277, "learning_rate": 7.999698495245897e-06, "loss": -0.1078, "num_tokens": 24755882.0, "reward": 0.7374209761619568, "reward_std": 1.2492364645004272, "rewards/rollout_reward_func/mean": 0.7374209761619568, "rewards/rollout_reward_func/std": 1.2492364645004272, "sampling/importance_sampling_ratio/max": 1.0975310802459717, "sampling/importance_sampling_ratio/mean": 0.6398690938949585, "sampling/importance_sampling_ratio/min": 0.0001062570299836807, "sampling/sampling_logp_difference/max": 1.9023551940917969, "sampling/sampling_logp_difference/mean": 0.3215862512588501, "step": 1841, "step_time": 17.312636594986543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9059530198574066, "epoch": 0.00921, "grad_norm": 0.03068315051496029, "kl": 0.2597874142229557, "learning_rate": 7.999698161081669e-06, "loss": -0.1078, "step": 1842, "step_time": 7.671161182006472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7248922698199749, "epoch": 0.009215, "frac_reward_zero_std": 0.5, "grad_norm": 0.013295945711433887, "kl": 0.31619277596473694, "learning_rate": 7.999697826732372e-06, "loss": -0.0514, "num_tokens": 24780865.0, "reward": 1.4488811492919922, "reward_std": 1.0218801498413086, "rewards/rollout_reward_func/mean": 1.4488811492919922, "rewards/rollout_reward_func/std": 1.0218801498413086, "sampling/importance_sampling_ratio/max": 1.142175555229187, "sampling/importance_sampling_ratio/mean": 0.8986198902130127, "sampling/importance_sampling_ratio/min": 8.854844857353328e-09, "sampling/sampling_logp_difference/max": 2.101198196411133, "sampling/sampling_logp_difference/mean": 0.24114356935024261, "step": 1843, "step_time": 13.399004850012716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7253565564751625, "epoch": 0.00922, "grad_norm": 0.013491327874362469, "kl": 0.315277598798275, "learning_rate": 7.999697492198005e-06, "loss": -0.0514, "step": 1844, "step_time": 7.086890192993451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 5.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6703104935586452, "epoch": 0.009225, "frac_reward_zero_std": 0.0, "grad_norm": 0.06671275943517685, "kl": 0.196607181802392, "learning_rate": 7.99969715747857e-06, "loss": -0.1149, "num_tokens": 24815293.0, "reward": 0.37472498416900635, "reward_std": 1.3227589130401611, "rewards/rollout_reward_func/mean": 0.37472498416900635, "rewards/rollout_reward_func/std": 1.3227590322494507, "sampling/importance_sampling_ratio/max": 1.1640279293060303, "sampling/importance_sampling_ratio/mean": 0.5471580028533936, "sampling/importance_sampling_ratio/min": 2.1462328447796608e-07, "sampling/sampling_logp_difference/max": 2.2137598991394043, "sampling/sampling_logp_difference/mean": 0.4781913757324219, "step": 1845, "step_time": 16.76797443398391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.669718161225319, "epoch": 0.00923, "grad_norm": 0.06678199023008347, "kl": 0.19683432020246983, "learning_rate": 7.999696822574068e-06, "loss": -0.115, "step": 1846, "step_time": 6.868324204988312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.11407647095620632, "epoch": 0.009235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003949528909288347, "kl": 0.2585967294871807, "learning_rate": 7.999696487484495e-06, "loss": 0.0006, "num_tokens": 24831523.0, "reward": 1.9499473571777344, "reward_std": 0.05169421061873436, "rewards/rollout_reward_func/mean": 1.9499473571777344, "rewards/rollout_reward_func/std": 0.05169421061873436, "sampling/importance_sampling_ratio/max": 1.0251764059066772, "sampling/importance_sampling_ratio/mean": 1.013221263885498, "sampling/importance_sampling_ratio/min": 1.000473976135254, "sampling/sampling_logp_difference/max": 0.01599588617682457, "sampling/sampling_logp_difference/mean": 0.004589159972965717, "step": 1847, "step_time": 5.921475057999487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11315746046602726, "epoch": 0.00924, "grad_norm": 0.00038790490361861885, "kl": 0.2586643472313881, "learning_rate": 7.999696152209854e-06, "loss": 0.0006, "step": 1848, "step_time": 3.232349334008177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9246825575828552, "epoch": 0.009245, "frac_reward_zero_std": 0.0, "grad_norm": 0.028723187744617462, "kl": 0.34871146082878113, "learning_rate": 7.999695816750144e-06, "loss": -0.0774, "num_tokens": 24849832.0, "reward": 1.5901949405670166, "reward_std": 0.6484299898147583, "rewards/rollout_reward_func/mean": 1.5901949405670166, "rewards/rollout_reward_func/std": 0.6484300494194031, "sampling/importance_sampling_ratio/max": 1.0431923866271973, "sampling/importance_sampling_ratio/mean": 0.8930274844169617, "sampling/importance_sampling_ratio/min": 6.597359606530517e-05, "sampling/sampling_logp_difference/max": 1.8209991455078125, "sampling/sampling_logp_difference/mean": 0.21852168440818787, "step": 1849, "step_time": 9.673085572983837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9251820966601372, "epoch": 0.00925, "grad_norm": 0.028646152466535568, "kl": 0.3488277532160282, "learning_rate": 7.999695481105367e-06, "loss": -0.0774, "step": 1850, "step_time": 4.8325177319929935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.252641350030899, "epoch": 0.009255, "frac_reward_zero_std": 0.0, "grad_norm": 0.11803165078163147, "kl": 0.5505442097783089, "learning_rate": 7.999695145275519e-06, "loss": -0.0956, "num_tokens": 24882242.0, "reward": -0.036042071878910065, "reward_std": 1.2268792390823364, "rewards/rollout_reward_func/mean": -0.036042071878910065, "rewards/rollout_reward_func/std": 1.2268792390823364, "sampling/importance_sampling_ratio/max": 1.0854300260543823, "sampling/importance_sampling_ratio/mean": 0.3820498287677765, "sampling/importance_sampling_ratio/min": 0.0002571956138126552, "sampling/sampling_logp_difference/max": 1.8658583164215088, "sampling/sampling_logp_difference/mean": 0.3743129372596741, "step": 1851, "step_time": 17.737988396998844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.268798738718033, "epoch": 0.00926, "grad_norm": 0.1071469783782959, "kl": 0.5444399155676365, "learning_rate": 7.999694809260603e-06, "loss": -0.096, "step": 1852, "step_time": 8.203804203003529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7261568903923035, "epoch": 0.009265, "frac_reward_zero_std": 0.5, "grad_norm": 0.2659917175769806, "kl": 1.2518128715455532, "learning_rate": 7.999694473060617e-06, "loss": -0.0328, "num_tokens": 24908709.0, "reward": 1.060934066772461, "reward_std": 0.9562470316886902, "rewards/rollout_reward_func/mean": 1.060934066772461, "rewards/rollout_reward_func/std": 0.956247091293335, "sampling/importance_sampling_ratio/max": 1.141906499862671, "sampling/importance_sampling_ratio/mean": 0.7111746668815613, "sampling/importance_sampling_ratio/min": 1.3415734656518907e-06, "sampling/sampling_logp_difference/max": 2.2845635414123535, "sampling/sampling_logp_difference/mean": 0.26117902994155884, "step": 1853, "step_time": 18.20408358698478 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.7303434908390045, "epoch": 0.00927, "grad_norm": 0.18977949023246765, "kl": 0.9583987221121788, "learning_rate": 7.999694136675565e-06, "loss": -0.0351, "step": 1854, "step_time": 6.822040379993268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.100404769182205, "epoch": 0.009275, "frac_reward_zero_std": 0.0, "grad_norm": 0.1501431167125702, "kl": 0.23024656996130943, "learning_rate": 7.999693800105443e-06, "loss": -0.0761, "num_tokens": 24938657.0, "reward": -0.003966987133026123, "reward_std": 1.196592926979065, "rewards/rollout_reward_func/mean": -0.003966987133026123, "rewards/rollout_reward_func/std": 1.1965930461883545, "sampling/importance_sampling_ratio/max": 1.1588424444198608, "sampling/importance_sampling_ratio/mean": 0.7125540971755981, "sampling/importance_sampling_ratio/min": 2.7348944513505558e-06, "sampling/sampling_logp_difference/max": 1.7692418098449707, "sampling/sampling_logp_difference/mean": 0.36750587821006775, "step": 1855, "step_time": 15.381205819983734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.080546408891678, "epoch": 0.00928, "grad_norm": 0.10305041819810867, "kl": 0.2271723933517933, "learning_rate": 7.999693463350253e-06, "loss": -0.077, "step": 1856, "step_time": 6.205076420024852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.31254399567842484, "epoch": 0.009285, "frac_reward_zero_std": 0.5, "grad_norm": 0.007794433273375034, "kl": 0.23798365890979767, "learning_rate": 7.999693126409994e-06, "loss": -0.0172, "num_tokens": 24961482.0, "reward": 1.3728636503219604, "reward_std": 1.0629278421401978, "rewards/rollout_reward_func/mean": 1.3728636503219604, "rewards/rollout_reward_func/std": 1.0629278421401978, "sampling/importance_sampling_ratio/max": 1.0425416231155396, "sampling/importance_sampling_ratio/mean": 0.9539591073989868, "sampling/importance_sampling_ratio/min": 0.009985461831092834, "sampling/sampling_logp_difference/max": 0.9459197521209717, "sampling/sampling_logp_difference/mean": 0.05624502897262573, "step": 1857, "step_time": 13.20431955299864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31418607011437416, "epoch": 0.00929, "grad_norm": 0.007690915372222662, "kl": 0.23751144483685493, "learning_rate": 7.999692789284667e-06, "loss": -0.0172, "step": 1858, "step_time": 5.709510812986991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7092590034008026, "epoch": 0.009295, "frac_reward_zero_std": 0.0, "grad_norm": 0.06962904334068298, "kl": 0.23400524258613586, "learning_rate": 7.999692451974272e-06, "loss": -0.0912, "num_tokens": 24992930.0, "reward": 0.49574965238571167, "reward_std": 1.2710777521133423, "rewards/rollout_reward_func/mean": 0.49574965238571167, "rewards/rollout_reward_func/std": 1.2710777521133423, "sampling/importance_sampling_ratio/max": 1.058805227279663, "sampling/importance_sampling_ratio/mean": 0.639685869216919, "sampling/importance_sampling_ratio/min": 1.4099097143116524e-06, "sampling/sampling_logp_difference/max": 1.8294109106063843, "sampling/sampling_logp_difference/mean": 0.35006389021873474, "step": 1859, "step_time": 15.190248076993157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7155812680721283, "epoch": 0.0093, "grad_norm": 0.10107864439487457, "kl": 0.23390062153339386, "learning_rate": 7.999692114478807e-06, "loss": -0.0915, "step": 1860, "step_time": 6.894272918012575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0313642676919699, "epoch": 0.009305, "frac_reward_zero_std": 0.0, "grad_norm": 0.17170485854148865, "kl": 0.39342448115348816, "learning_rate": 7.999691776798274e-06, "loss": -0.0625, "num_tokens": 25023031.0, "reward": 0.6485357284545898, "reward_std": 1.2838557958602905, "rewards/rollout_reward_func/mean": 0.6485357284545898, "rewards/rollout_reward_func/std": 1.2838557958602905, "sampling/importance_sampling_ratio/max": 1.0553053617477417, "sampling/importance_sampling_ratio/mean": 0.6772038340568542, "sampling/importance_sampling_ratio/min": 0.004911230411380529, "sampling/sampling_logp_difference/max": 1.0625274181365967, "sampling/sampling_logp_difference/mean": 0.15672540664672852, "step": 1861, "step_time": 15.790139826000086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0557359606027603, "epoch": 0.00931, "grad_norm": 0.15057210624217987, "kl": 0.4001532755792141, "learning_rate": 7.999691438932673e-06, "loss": -0.0633, "step": 1862, "step_time": 6.278536442987388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6311790347099304, "epoch": 0.009315, "frac_reward_zero_std": 0.5, "grad_norm": 0.10729769617319107, "kl": 0.3159576877951622, "learning_rate": 7.999691100882003e-06, "loss": -0.0312, "num_tokens": 25048886.0, "reward": 0.8371395468711853, "reward_std": 1.323609709739685, "rewards/rollout_reward_func/mean": 0.8371395468711853, "rewards/rollout_reward_func/std": 1.3236098289489746, "sampling/importance_sampling_ratio/max": 1.321638584136963, "sampling/importance_sampling_ratio/mean": 0.6551797389984131, "sampling/importance_sampling_ratio/min": 0.001298287184908986, "sampling/sampling_logp_difference/max": 2.234715461730957, "sampling/sampling_logp_difference/mean": 0.20083799958229065, "step": 1863, "step_time": 16.35724473600567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.6442469358444214, "epoch": 0.00932, "grad_norm": 0.03363751992583275, "kl": 0.31394577771425247, "learning_rate": 7.999690762646265e-06, "loss": -0.0316, "step": 1864, "step_time": 6.078341421001824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7364908829331398, "epoch": 0.009325, "frac_reward_zero_std": 0.0, "grad_norm": 0.06935600191354752, "kl": 0.18155174609273672, "learning_rate": 7.999690424225457e-06, "loss": -0.0689, "num_tokens": 25068223.0, "reward": 0.33536916971206665, "reward_std": 1.5344181060791016, "rewards/rollout_reward_func/mean": 0.33536916971206665, "rewards/rollout_reward_func/std": 1.5344181060791016, "sampling/importance_sampling_ratio/max": 1.0283080339431763, "sampling/importance_sampling_ratio/mean": 0.6960394382476807, "sampling/importance_sampling_ratio/min": 3.0259593586379196e-06, "sampling/sampling_logp_difference/max": 1.6881877183914185, "sampling/sampling_logp_difference/mean": 0.3001357316970825, "step": 1865, "step_time": 12.510073847006424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7365128882229328, "epoch": 0.00933, "grad_norm": 0.07067099213600159, "kl": 0.18052762746810913, "learning_rate": 7.999690085619583e-06, "loss": -0.0688, "step": 1866, "step_time": 5.291442506975727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.21303591132164, "epoch": 0.009335, "frac_reward_zero_std": 0.0, "grad_norm": 0.18896864354610443, "kl": 0.17905454337596893, "learning_rate": 7.99968974682864e-06, "loss": -0.1002, "num_tokens": 25098410.0, "reward": 0.32972586154937744, "reward_std": 1.2164689302444458, "rewards/rollout_reward_func/mean": 0.32972586154937744, "rewards/rollout_reward_func/std": 1.2164689302444458, "sampling/importance_sampling_ratio/max": 1.171403408050537, "sampling/importance_sampling_ratio/mean": 0.6120768785476685, "sampling/importance_sampling_ratio/min": 9.976950110512917e-08, "sampling/sampling_logp_difference/max": 2.0450000762939453, "sampling/sampling_logp_difference/mean": 0.4225196838378906, "step": 1867, "step_time": 16.403521205982543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.235423296689987, "epoch": 0.00934, "grad_norm": 0.21463648974895477, "kl": 0.178091736510396, "learning_rate": 7.999689407852628e-06, "loss": -0.101, "step": 1868, "step_time": 6.472962048981572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.390005186200142, "epoch": 0.009345, "frac_reward_zero_std": 0.0, "grad_norm": 0.05823919549584389, "kl": 0.3818998523056507, "learning_rate": 7.999689068691549e-06, "loss": -0.0862, "num_tokens": 25123347.0, "reward": 1.1165257692337036, "reward_std": 1.2422795295715332, "rewards/rollout_reward_func/mean": 1.1165257692337036, "rewards/rollout_reward_func/std": 1.2422795295715332, "sampling/importance_sampling_ratio/max": 1.0282639265060425, "sampling/importance_sampling_ratio/mean": 0.6579955816268921, "sampling/importance_sampling_ratio/min": 0.0006982153281569481, "sampling/sampling_logp_difference/max": 1.624117374420166, "sampling/sampling_logp_difference/mean": 0.20398716628551483, "step": 1869, "step_time": 16.442058893982903 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.005434782709926367, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005434782709926367, "entropy": 1.400485223159194, "epoch": 0.00935, "grad_norm": 0.05591844394803047, "kl": 0.37977883219718933, "learning_rate": 7.999688729345401e-06, "loss": -0.0862, "step": 1870, "step_time": 6.363749003998237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9579377323389053, "epoch": 0.009355, "frac_reward_zero_std": 0.5, "grad_norm": 0.2589958906173706, "kl": 0.24669232219457626, "learning_rate": 7.999688389814182e-06, "loss": -0.0091, "num_tokens": 25148502.0, "reward": 0.15906788408756256, "reward_std": 0.7577537298202515, "rewards/rollout_reward_func/mean": 0.15906788408756256, "rewards/rollout_reward_func/std": 0.7577537894248962, "sampling/importance_sampling_ratio/max": 1.0687639713287354, "sampling/importance_sampling_ratio/mean": 0.8125237226486206, "sampling/importance_sampling_ratio/min": 0.002179183065891266, "sampling/sampling_logp_difference/max": 1.3685708045959473, "sampling/sampling_logp_difference/mean": 0.11849845945835114, "step": 1871, "step_time": 15.1978982599976 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026988636702299118, "entropy": 1.0906124264001846, "epoch": 0.00936, "grad_norm": 0.08730097115039825, "kl": 0.2181481085717678, "learning_rate": 7.999688050097899e-06, "loss": -0.0098, "step": 1872, "step_time": 5.922623659018427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.5, "completions/mean_terminated_length": 4.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7355207800865173, "epoch": 0.009365, "frac_reward_zero_std": 0.0, "grad_norm": 0.02611834928393364, "kl": 0.24865062534809113, "learning_rate": 7.999687710196545e-06, "loss": -0.1008, "num_tokens": 25173895.0, "reward": 0.8875086307525635, "reward_std": 1.3990261554718018, "rewards/rollout_reward_func/mean": 0.8875086307525635, "rewards/rollout_reward_func/std": 1.3990262746810913, "sampling/importance_sampling_ratio/max": 1.0527406930923462, "sampling/importance_sampling_ratio/mean": 0.5137100219726562, "sampling/importance_sampling_ratio/min": 2.4356318135687616e-06, "sampling/sampling_logp_difference/max": 1.8878445625305176, "sampling/sampling_logp_difference/mean": 0.4602578282356262, "step": 1873, "step_time": 16.848827391993836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7437422573566437, "epoch": 0.00937, "grad_norm": 0.02695080265402794, "kl": 0.24808287620544434, "learning_rate": 7.999687370110125e-06, "loss": -0.1007, "step": 1874, "step_time": 6.081786089009256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.106277108192444, "epoch": 0.009375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2306901514530182, "kl": 0.3100177124142647, "learning_rate": 7.999687029838636e-06, "loss": -0.0769, "num_tokens": 25198770.0, "reward": 1.0302870273590088, "reward_std": 1.3066625595092773, "rewards/rollout_reward_func/mean": 1.0302870273590088, "rewards/rollout_reward_func/std": 1.3066625595092773, "sampling/importance_sampling_ratio/max": 1.047249436378479, "sampling/importance_sampling_ratio/mean": 0.6252357363700867, "sampling/importance_sampling_ratio/min": 5.742181770074239e-07, "sampling/sampling_logp_difference/max": 1.884994387626648, "sampling/sampling_logp_difference/mean": 0.41640806198120117, "step": 1875, "step_time": 15.9012920929963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.133833020925522, "epoch": 0.00938, "grad_norm": 0.25685086846351624, "kl": 0.311037790030241, "learning_rate": 7.999686689382078e-06, "loss": -0.0764, "step": 1876, "step_time": 5.817681457992876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6145816445350647, "epoch": 0.009385, "frac_reward_zero_std": 0.0, "grad_norm": 0.10010357201099396, "kl": 0.19327867403626442, "learning_rate": 7.999686348740451e-06, "loss": -0.0993, "num_tokens": 25231620.0, "reward": 0.5169005393981934, "reward_std": 1.1734116077423096, "rewards/rollout_reward_func/mean": 0.5169005393981934, "rewards/rollout_reward_func/std": 1.1734117269515991, "sampling/importance_sampling_ratio/max": 1.1023147106170654, "sampling/importance_sampling_ratio/mean": 0.4873161017894745, "sampling/importance_sampling_ratio/min": 4.440023076313082e-06, "sampling/sampling_logp_difference/max": 1.9622561931610107, "sampling/sampling_logp_difference/mean": 0.45304688811302185, "step": 1877, "step_time": 16.327047810002114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5924929827451706, "epoch": 0.00939, "grad_norm": 0.08665025234222412, "kl": 0.19371379166841507, "learning_rate": 7.999686007913758e-06, "loss": -0.0995, "step": 1878, "step_time": 7.042499879011302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.845916897058487, "epoch": 0.009395, "frac_reward_zero_std": 0.0, "grad_norm": 0.04643632471561432, "kl": 0.27622172236442566, "learning_rate": 7.999685666901995e-06, "loss": -0.0863, "num_tokens": 25258286.0, "reward": 0.8351253271102905, "reward_std": 1.4026241302490234, "rewards/rollout_reward_func/mean": 0.8351253271102905, "rewards/rollout_reward_func/std": 1.4026241302490234, "sampling/importance_sampling_ratio/max": 1.0323922634124756, "sampling/importance_sampling_ratio/mean": 0.593119740486145, "sampling/importance_sampling_ratio/min": 0.00035747495712712407, "sampling/sampling_logp_difference/max": 1.6237244606018066, "sampling/sampling_logp_difference/mean": 0.25656354427337646, "step": 1879, "step_time": 18.378242835009587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8370519876480103, "epoch": 0.0094, "grad_norm": 0.0447666235268116, "kl": 0.2738334462046623, "learning_rate": 7.999685325705167e-06, "loss": -0.0864, "step": 1880, "step_time": 7.419827004996478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5429647024720907, "epoch": 0.009405, "frac_reward_zero_std": 0.0, "grad_norm": 0.0713091492652893, "kl": 0.4528568647801876, "learning_rate": 7.999684984323268e-06, "loss": -0.0762, "num_tokens": 25284087.0, "reward": 1.3220546245574951, "reward_std": 1.0681594610214233, "rewards/rollout_reward_func/mean": 1.3220546245574951, "rewards/rollout_reward_func/std": 1.0681593418121338, "sampling/importance_sampling_ratio/max": 1.0976444482803345, "sampling/importance_sampling_ratio/mean": 0.6838893890380859, "sampling/importance_sampling_ratio/min": 7.103203824954107e-05, "sampling/sampling_logp_difference/max": 1.5542100667953491, "sampling/sampling_logp_difference/mean": 0.28063273429870605, "step": 1881, "step_time": 14.915949802001705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5104502700269222, "epoch": 0.00941, "grad_norm": 0.07322903722524643, "kl": 0.4857153668999672, "learning_rate": 7.999684642756302e-06, "loss": -0.0762, "step": 1882, "step_time": 6.274760406988207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7371676564216614, "epoch": 0.009415, "frac_reward_zero_std": 0.5, "grad_norm": 0.10156653076410294, "kl": 0.18345678970217705, "learning_rate": 7.999684301004268e-06, "loss": -0.0502, "num_tokens": 25305916.0, "reward": 0.7798815369606018, "reward_std": 1.5122826099395752, "rewards/rollout_reward_func/mean": 0.7798815369606018, "rewards/rollout_reward_func/std": 1.5122824907302856, "sampling/importance_sampling_ratio/max": 1.0938087701797485, "sampling/importance_sampling_ratio/mean": 0.6446586847305298, "sampling/importance_sampling_ratio/min": 1.8408667529001832e-05, "sampling/sampling_logp_difference/max": 2.492083787918091, "sampling/sampling_logp_difference/mean": 0.2129942774772644, "step": 1883, "step_time": 14.433834997995291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.727153718471527, "epoch": 0.00942, "grad_norm": 0.07720018923282623, "kl": 0.18491188064217567, "learning_rate": 7.999683959067165e-06, "loss": -0.0508, "step": 1884, "step_time": 5.781440289007151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1771245896816254, "epoch": 0.009425, "frac_reward_zero_std": 0.0, "grad_norm": 0.25934284925460815, "kl": 0.6504438370466232, "learning_rate": 7.999683616944995e-06, "loss": -0.0866, "num_tokens": 25328669.0, "reward": 0.8639087080955505, "reward_std": 1.4038403034210205, "rewards/rollout_reward_func/mean": 0.8639087080955505, "rewards/rollout_reward_func/std": 1.4038403034210205, "sampling/importance_sampling_ratio/max": 1.0418609380722046, "sampling/importance_sampling_ratio/mean": 0.6230235695838928, "sampling/importance_sampling_ratio/min": 1.8683952163200956e-08, "sampling/sampling_logp_difference/max": 2.1134133338928223, "sampling/sampling_logp_difference/mean": 0.3758852481842041, "step": 1885, "step_time": 14.636187242009328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1551933884620667, "epoch": 0.00943, "grad_norm": 0.14723213016986847, "kl": 0.5431696251034737, "learning_rate": 7.999683274637757e-06, "loss": -0.0888, "step": 1886, "step_time": 5.48620253901754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.1875, "completions/mean_terminated_length": 5.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.789247542619705, "epoch": 0.009435, "frac_reward_zero_std": 0.0, "grad_norm": 0.02158590778708458, "kl": 0.15494761988520622, "learning_rate": 7.999682932145452e-06, "loss": -0.093, "num_tokens": 25359829.0, "reward": 0.14087259769439697, "reward_std": 1.4201841354370117, "rewards/rollout_reward_func/mean": 0.14087259769439697, "rewards/rollout_reward_func/std": 1.4201842546463013, "sampling/importance_sampling_ratio/max": 1.017828345298767, "sampling/importance_sampling_ratio/mean": 0.4185132384300232, "sampling/importance_sampling_ratio/min": 8.653405249303692e-12, "sampling/sampling_logp_difference/max": 2.832807779312134, "sampling/sampling_logp_difference/mean": 0.5118069052696228, "step": 1887, "step_time": 17.649400822017924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7870382964611053, "epoch": 0.00944, "grad_norm": 0.02200804091989994, "kl": 0.1547222137451172, "learning_rate": 7.999682589468078e-06, "loss": -0.0931, "step": 1888, "step_time": 6.8575161390181165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4390370547771454, "epoch": 0.009445, "frac_reward_zero_std": 0.5, "grad_norm": 0.03730208799242973, "kl": 0.13719433546066284, "learning_rate": 7.999682246605636e-06, "loss": -0.0483, "num_tokens": 25386620.0, "reward": 1.0837233066558838, "reward_std": 1.2470664978027344, "rewards/rollout_reward_func/mean": 1.0837233066558838, "rewards/rollout_reward_func/std": 1.2470664978027344, "sampling/importance_sampling_ratio/max": 1.1513941287994385, "sampling/importance_sampling_ratio/mean": 0.7703701853752136, "sampling/importance_sampling_ratio/min": 0.000555739679839462, "sampling/sampling_logp_difference/max": 1.5081894397735596, "sampling/sampling_logp_difference/mean": 0.17857079207897186, "step": 1889, "step_time": 17.91668313203263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4386359453201294, "epoch": 0.00945, "grad_norm": 0.03505594655871391, "kl": 0.137682493776083, "learning_rate": 7.999681903558126e-06, "loss": -0.0484, "step": 1890, "step_time": 6.65631613098958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.376889646053314, "epoch": 0.009455, "frac_reward_zero_std": 0.0, "grad_norm": 0.15441827476024628, "kl": 0.18956918641924858, "learning_rate": 7.99968156032555e-06, "loss": -0.0881, "num_tokens": 25411328.0, "reward": 0.5662894248962402, "reward_std": 1.3610280752182007, "rewards/rollout_reward_func/mean": 0.5662894248962402, "rewards/rollout_reward_func/std": 1.3610280752182007, "sampling/importance_sampling_ratio/max": 1.2095301151275635, "sampling/importance_sampling_ratio/mean": 0.6130341291427612, "sampling/importance_sampling_ratio/min": 9.44786080481208e-08, "sampling/sampling_logp_difference/max": 1.919731855392456, "sampling/sampling_logp_difference/mean": 0.47832369804382324, "step": 1891, "step_time": 14.445778593988507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.343661218881607, "epoch": 0.00946, "grad_norm": 0.1375754326581955, "kl": 0.19274261966347694, "learning_rate": 7.999681216907903e-06, "loss": -0.089, "step": 1892, "step_time": 6.71767817500222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0578691996634007, "epoch": 0.009465, "frac_reward_zero_std": 0.0, "grad_norm": 0.0350506417453289, "kl": 0.5308796167373657, "learning_rate": 7.999680873305191e-06, "loss": -0.0686, "num_tokens": 25442898.0, "reward": 0.5780192017555237, "reward_std": 1.286276936531067, "rewards/rollout_reward_func/mean": 0.5780192017555237, "rewards/rollout_reward_func/std": 1.2862770557403564, "sampling/importance_sampling_ratio/max": 1.5281285047531128, "sampling/importance_sampling_ratio/mean": 0.6105658411979675, "sampling/importance_sampling_ratio/min": 3.7222858129126735e-09, "sampling/sampling_logp_difference/max": 2.3312649726867676, "sampling/sampling_logp_difference/mean": 0.42311346530914307, "step": 1893, "step_time": 15.686350890988251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.04338738322258, "epoch": 0.00947, "grad_norm": 0.03765390068292618, "kl": 0.5317147839814425, "learning_rate": 7.99968052951741e-06, "loss": -0.0686, "step": 1894, "step_time": 6.831444310009829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7304640412330627, "epoch": 0.009475, "frac_reward_zero_std": 0.0, "grad_norm": 0.13415928184986115, "kl": 0.1896733194589615, "learning_rate": 7.999680185544562e-06, "loss": -0.0747, "num_tokens": 25478035.0, "reward": 0.49843737483024597, "reward_std": 1.0148552656173706, "rewards/rollout_reward_func/mean": 0.49843737483024597, "rewards/rollout_reward_func/std": 1.0148552656173706, "sampling/importance_sampling_ratio/max": 1.0847374200820923, "sampling/importance_sampling_ratio/mean": 0.6429446339607239, "sampling/importance_sampling_ratio/min": 0.0007386973593384027, "sampling/sampling_logp_difference/max": 1.1635940074920654, "sampling/sampling_logp_difference/mean": 0.23860913515090942, "step": 1895, "step_time": 18.668117841996718 }, { "clip_ratio/high_max": 0.04545454680919647, "clip_ratio/high_mean": 0.022727273404598236, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022727273404598236, "entropy": 1.6920853853225708, "epoch": 0.00948, "grad_norm": 0.08328623324632645, "kl": 0.189423818141222, "learning_rate": 7.999679841386644e-06, "loss": -0.075, "step": 1896, "step_time": 7.7310969779937295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.9375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4800278590992093, "epoch": 0.009485, "frac_reward_zero_std": 0.5, "grad_norm": 0.10471446067094803, "kl": 0.2937850281596184, "learning_rate": 7.999679497043661e-06, "loss": -0.0321, "num_tokens": 25501062.0, "reward": 1.4217586517333984, "reward_std": 1.0148636102676392, "rewards/rollout_reward_func/mean": 1.4217586517333984, "rewards/rollout_reward_func/std": 1.0148636102676392, "sampling/importance_sampling_ratio/max": 1.0294362306594849, "sampling/importance_sampling_ratio/mean": 0.9071494936943054, "sampling/importance_sampling_ratio/min": 0.0012319338275119662, "sampling/sampling_logp_difference/max": 1.4997901916503906, "sampling/sampling_logp_difference/mean": 0.08877354860305786, "step": 1897, "step_time": 11.85065795799892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 0.46298970095813274, "epoch": 0.00949, "grad_norm": 0.06551680713891983, "kl": 0.31186243891716003, "learning_rate": 7.999679152515609e-06, "loss": -0.0322, "step": 1898, "step_time": 5.472867670992855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7064621588215232, "epoch": 0.009495, "frac_reward_zero_std": 0.0, "grad_norm": 0.013209991157054901, "kl": 0.22204676643013954, "learning_rate": 7.999678807802491e-06, "loss": -0.0902, "num_tokens": 25535573.0, "reward": 0.8302109241485596, "reward_std": 1.204721450805664, "rewards/rollout_reward_func/mean": 0.8302109241485596, "rewards/rollout_reward_func/std": 1.204721450805664, "sampling/importance_sampling_ratio/max": 1.0433515310287476, "sampling/importance_sampling_ratio/mean": 0.7012727856636047, "sampling/importance_sampling_ratio/min": 8.996021642815322e-05, "sampling/sampling_logp_difference/max": 1.600306510925293, "sampling/sampling_logp_difference/mean": 0.26661789417266846, "step": 1899, "step_time": 16.36337200998969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.708186324685812, "epoch": 0.0095, "grad_norm": 0.013396571390330791, "kl": 0.2215201035141945, "learning_rate": 7.999678462904302e-06, "loss": -0.0902, "step": 1900, "step_time": 7.632715408006334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 5.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8891431987285614, "epoch": 0.009505, "frac_reward_zero_std": 0.0, "grad_norm": 0.1707434356212616, "kl": 0.4312775582075119, "learning_rate": 7.999678117821048e-06, "loss": -0.0575, "num_tokens": 25563782.0, "reward": -0.6124098300933838, "reward_std": 0.534079909324646, "rewards/rollout_reward_func/mean": -0.6124098300933838, "rewards/rollout_reward_func/std": 0.534079909324646, "sampling/importance_sampling_ratio/max": 1.0598750114440918, "sampling/importance_sampling_ratio/mean": 0.4803006947040558, "sampling/importance_sampling_ratio/min": 2.8975553050258895e-06, "sampling/sampling_logp_difference/max": 1.9500290155410767, "sampling/sampling_logp_difference/mean": 0.487102746963501, "step": 1901, "step_time": 15.32402911799727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.8921889066696167, "epoch": 0.00951, "grad_norm": 0.1476602852344513, "kl": 0.4363022707402706, "learning_rate": 7.999677772552726e-06, "loss": -0.0576, "step": 1902, "step_time": 6.7635552220162936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5779008213430643, "epoch": 0.009515, "frac_reward_zero_std": 0.0, "grad_norm": 0.16458578407764435, "kl": 0.9314928986132145, "learning_rate": 7.999677427099336e-06, "loss": -0.0443, "num_tokens": 25593514.0, "reward": 0.30703118443489075, "reward_std": 1.2426173686981201, "rewards/rollout_reward_func/mean": 0.30703118443489075, "rewards/rollout_reward_func/std": 1.2426174879074097, "sampling/importance_sampling_ratio/max": 1.0751227140426636, "sampling/importance_sampling_ratio/mean": 0.6688700318336487, "sampling/importance_sampling_ratio/min": 1.8989434465765953e-05, "sampling/sampling_logp_difference/max": 1.7144935131072998, "sampling/sampling_logp_difference/mean": 0.2769579291343689, "step": 1903, "step_time": 14.537154016026761 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 1.5833294177427888, "epoch": 0.00952, "grad_norm": 0.10912808775901794, "kl": 0.7624071873724461, "learning_rate": 7.999677081460877e-06, "loss": -0.0457, "step": 1904, "step_time": 5.93338673497783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 5.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5841612219810486, "epoch": 0.009525, "frac_reward_zero_std": 0.5, "grad_norm": 0.004540037829428911, "kl": 0.20096294581890106, "learning_rate": 7.999676735637354e-06, "loss": -0.0393, "num_tokens": 25621496.0, "reward": 0.5885874629020691, "reward_std": 1.0566939115524292, "rewards/rollout_reward_func/mean": 0.5885874629020691, "rewards/rollout_reward_func/std": 1.0566940307617188, "sampling/importance_sampling_ratio/max": 1.0508387088775635, "sampling/importance_sampling_ratio/mean": 0.7032164335250854, "sampling/importance_sampling_ratio/min": 0.00010516963811824098, "sampling/sampling_logp_difference/max": 1.8481709957122803, "sampling/sampling_logp_difference/mean": 0.20129895210266113, "step": 1905, "step_time": 20.147374515028787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5877626836299896, "epoch": 0.00953, "grad_norm": 0.004905070178210735, "kl": 0.2011379934847355, "learning_rate": 7.99967638962876e-06, "loss": -0.0393, "step": 1906, "step_time": 7.7554572700028075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6486906111240387, "epoch": 0.009535, "frac_reward_zero_std": 0.0, "grad_norm": 0.14421464502811432, "kl": 0.2761779986321926, "learning_rate": 7.9996760434351e-06, "loss": -0.0359, "num_tokens": 25647769.0, "reward": -0.7412458062171936, "reward_std": 0.6403384208679199, "rewards/rollout_reward_func/mean": -0.7412458062171936, "rewards/rollout_reward_func/std": 0.6403384804725647, "sampling/importance_sampling_ratio/max": 1.093592882156372, "sampling/importance_sampling_ratio/mean": 0.7271214723587036, "sampling/importance_sampling_ratio/min": 1.4202768170434865e-06, "sampling/sampling_logp_difference/max": 1.656770944595337, "sampling/sampling_logp_difference/mean": 0.2548655867576599, "step": 1907, "step_time": 14.884673745997134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.643932282924652, "epoch": 0.00954, "grad_norm": 0.14222851395606995, "kl": 0.2781612388789654, "learning_rate": 7.999675697056373e-06, "loss": -0.0361, "step": 1908, "step_time": 6.454096168003161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7017546705901623, "epoch": 0.009545, "frac_reward_zero_std": 0.0, "grad_norm": 0.34438470005989075, "kl": 0.45767445862293243, "learning_rate": 7.999675350492578e-06, "loss": -0.0569, "num_tokens": 25669343.0, "reward": 0.9046894907951355, "reward_std": 1.0562455654144287, "rewards/rollout_reward_func/mean": 0.9046894907951355, "rewards/rollout_reward_func/std": 1.0562456846237183, "sampling/importance_sampling_ratio/max": 1.031018614768982, "sampling/importance_sampling_ratio/mean": 0.8150227069854736, "sampling/importance_sampling_ratio/min": 0.004751968197524548, "sampling/sampling_logp_difference/max": 0.985541820526123, "sampling/sampling_logp_difference/mean": 0.11243468523025513, "step": 1909, "step_time": 11.580684905013186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.053977273404598236, "clip_ratio/low_min": 0.022727273404598236, "clip_ratio/region_mean": 0.053977273404598236, "entropy": 0.7770413160324097, "epoch": 0.00955, "grad_norm": 0.15255139768123627, "kl": 0.48795320838689804, "learning_rate": 7.999675003743715e-06, "loss": -0.0588, "step": 1910, "step_time": 5.84464888900402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9566007470712066, "epoch": 0.009555, "frac_reward_zero_std": 0.0, "grad_norm": 0.013919156976044178, "kl": 0.17074349708855152, "learning_rate": 7.999674656809786e-06, "loss": -0.0837, "num_tokens": 25700089.0, "reward": 0.5958358645439148, "reward_std": 1.250234842300415, "rewards/rollout_reward_func/mean": 0.5958358645439148, "rewards/rollout_reward_func/std": 1.2502349615097046, "sampling/importance_sampling_ratio/max": 1.0379271507263184, "sampling/importance_sampling_ratio/mean": 0.6998050212860107, "sampling/importance_sampling_ratio/min": 4.033773439005017e-05, "sampling/sampling_logp_difference/max": 1.8338532447814941, "sampling/sampling_logp_difference/mean": 0.3329085111618042, "step": 1911, "step_time": 15.021193624983425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9530971823260188, "epoch": 0.00956, "grad_norm": 0.014133964665234089, "kl": 0.1710136979818344, "learning_rate": 7.999674309690788e-06, "loss": -0.0836, "step": 1912, "step_time": 6.164958214998478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9374580681324005, "epoch": 0.009565, "frac_reward_zero_std": 0.0, "grad_norm": 0.055719781666994095, "kl": 0.290583111345768, "learning_rate": 7.999673962386724e-06, "loss": -0.0922, "num_tokens": 25736893.0, "reward": 0.6398757696151733, "reward_std": 1.1611824035644531, "rewards/rollout_reward_func/mean": 0.6398757696151733, "rewards/rollout_reward_func/std": 1.1611824035644531, "sampling/importance_sampling_ratio/max": 1.0834945440292358, "sampling/importance_sampling_ratio/mean": 0.6509491205215454, "sampling/importance_sampling_ratio/min": 4.117251137358835e-06, "sampling/sampling_logp_difference/max": 1.9206123352050781, "sampling/sampling_logp_difference/mean": 0.3354295790195465, "step": 1913, "step_time": 21.25976191999507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9467166662216187, "epoch": 0.00957, "grad_norm": 0.06093373894691467, "kl": 0.2898306977003813, "learning_rate": 7.999673614897592e-06, "loss": -0.0921, "step": 1914, "step_time": 9.349527202008176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 4.909090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7360369712114334, "epoch": 0.009575, "frac_reward_zero_std": 0.0, "grad_norm": 0.025704732164740562, "kl": 0.19743519369512796, "learning_rate": 7.999673267223393e-06, "loss": -0.0817, "num_tokens": 25760365.0, "reward": 0.569545328617096, "reward_std": 1.3773819208145142, "rewards/rollout_reward_func/mean": 0.569545328617096, "rewards/rollout_reward_func/std": 1.3773820400238037, "sampling/importance_sampling_ratio/max": 1.0123099088668823, "sampling/importance_sampling_ratio/mean": 0.6153110265731812, "sampling/importance_sampling_ratio/min": 5.049911919741135e-07, "sampling/sampling_logp_difference/max": 2.6237125396728516, "sampling/sampling_logp_difference/mean": 0.31686121225357056, "step": 1915, "step_time": 15.025428781009396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7366198981180787, "epoch": 0.00958, "grad_norm": 0.02347138524055481, "kl": 0.18856210634112358, "learning_rate": 7.999672919364125e-06, "loss": -0.0818, "step": 1916, "step_time": 5.713415291000274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9580006897449493, "epoch": 0.009585, "frac_reward_zero_std": 0.0, "grad_norm": 0.09727369993925095, "kl": 0.3040565326809883, "learning_rate": 7.999672571319792e-06, "loss": -0.0768, "num_tokens": 25778588.0, "reward": 0.3240901827812195, "reward_std": 1.5313425064086914, "rewards/rollout_reward_func/mean": 0.3240901827812195, "rewards/rollout_reward_func/std": 1.531342625617981, "sampling/importance_sampling_ratio/max": 1.0334033966064453, "sampling/importance_sampling_ratio/mean": 0.8825654983520508, "sampling/importance_sampling_ratio/min": 0.0003206674591638148, "sampling/sampling_logp_difference/max": 1.3968546390533447, "sampling/sampling_logp_difference/mean": 0.18641528487205505, "step": 1917, "step_time": 9.440210486995056 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.9634263589978218, "epoch": 0.00959, "grad_norm": 0.04172930121421814, "kl": 0.2991451472043991, "learning_rate": 7.99967222309039e-06, "loss": -0.0769, "step": 1918, "step_time": 4.77517375802563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.053450871258974, "epoch": 0.009595, "frac_reward_zero_std": 0.0, "grad_norm": 0.2834090292453766, "kl": 2.6858073212206364, "learning_rate": 7.999671874675921e-06, "loss": -0.1031, "num_tokens": 25812437.0, "reward": 0.4376441538333893, "reward_std": 1.375924825668335, "rewards/rollout_reward_func/mean": 0.4376441538333893, "rewards/rollout_reward_func/std": 1.375924825668335, "sampling/importance_sampling_ratio/max": 1.0738927125930786, "sampling/importance_sampling_ratio/mean": 0.5182923078536987, "sampling/importance_sampling_ratio/min": 0.0005107950419187546, "sampling/sampling_logp_difference/max": 1.594818115234375, "sampling/sampling_logp_difference/mean": 0.3240894079208374, "step": 1919, "step_time": 18.52661179898132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.056920137256384, "epoch": 0.0096, "grad_norm": 0.21209366619586945, "kl": 2.0175225641578436, "learning_rate": 7.999671526076385e-06, "loss": -0.1047, "step": 1920, "step_time": 7.847917726001469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 5.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1588096618652344, "epoch": 0.009605, "frac_reward_zero_std": 0.0, "grad_norm": 0.11879859119653702, "kl": 0.13185034785419703, "learning_rate": 7.999671177291783e-06, "loss": -0.1122, "num_tokens": 25843137.0, "reward": 0.44985586404800415, "reward_std": 1.3657565116882324, "rewards/rollout_reward_func/mean": 0.44985586404800415, "rewards/rollout_reward_func/std": 1.365756630897522, "sampling/importance_sampling_ratio/max": 1.050628423690796, "sampling/importance_sampling_ratio/mean": 0.5105769634246826, "sampling/importance_sampling_ratio/min": 3.4456032153684646e-05, "sampling/sampling_logp_difference/max": 2.3272476196289062, "sampling/sampling_logp_difference/mean": 0.3378238379955292, "step": 1921, "step_time": 16.98389384301845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.162215620279312, "epoch": 0.00961, "grad_norm": 0.15147161483764648, "kl": 0.13218567799776793, "learning_rate": 7.999670828322113e-06, "loss": -0.112, "step": 1922, "step_time": 6.724171236011898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.937030702829361, "epoch": 0.009615, "frac_reward_zero_std": 0.0, "grad_norm": 0.015169781632721424, "kl": 0.19274761155247688, "learning_rate": 7.999670479167375e-06, "loss": -0.1041, "num_tokens": 25877249.0, "reward": 0.6806603670120239, "reward_std": 1.2328286170959473, "rewards/rollout_reward_func/mean": 0.6806603670120239, "rewards/rollout_reward_func/std": 1.2328286170959473, "sampling/importance_sampling_ratio/max": 1.0589724779129028, "sampling/importance_sampling_ratio/mean": 0.5756709575653076, "sampling/importance_sampling_ratio/min": 0.00018163937784265727, "sampling/sampling_logp_difference/max": 1.7846808433532715, "sampling/sampling_logp_difference/mean": 0.3242611885070801, "step": 1923, "step_time": 18.417495307032368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9428978562355042, "epoch": 0.00962, "grad_norm": 0.014749818481504917, "kl": 0.19296561367809772, "learning_rate": 7.999670129827571e-06, "loss": -0.1042, "step": 1924, "step_time": 7.2348296900017885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 4.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.947637617588043, "epoch": 0.009625, "frac_reward_zero_std": 0.0, "grad_norm": 0.07095424085855484, "kl": 0.18900659680366516, "learning_rate": 7.9996697803027e-06, "loss": -0.086, "num_tokens": 25909691.0, "reward": 0.11922566592693329, "reward_std": 1.0703699588775635, "rewards/rollout_reward_func/mean": 0.11922566592693329, "rewards/rollout_reward_func/std": 1.0703699588775635, "sampling/importance_sampling_ratio/max": 1.0650924444198608, "sampling/importance_sampling_ratio/mean": 0.489986777305603, "sampling/importance_sampling_ratio/min": 5.056272289749586e-09, "sampling/sampling_logp_difference/max": 2.0015170574188232, "sampling/sampling_logp_difference/mean": 0.5230871438980103, "step": 1925, "step_time": 17.71908152999822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9576019048690796, "epoch": 0.00963, "grad_norm": 0.07086395472288132, "kl": 0.1886768937110901, "learning_rate": 7.999669430592762e-06, "loss": -0.0861, "step": 1926, "step_time": 8.041369496000698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.42294442653656, "epoch": 0.009635, "frac_reward_zero_std": 0.0, "grad_norm": 0.17590484023094177, "kl": 0.24899828806519508, "learning_rate": 7.999669080697755e-06, "loss": -0.0482, "num_tokens": 25940178.0, "reward": 1.0610333681106567, "reward_std": 1.131681203842163, "rewards/rollout_reward_func/mean": 1.0610333681106567, "rewards/rollout_reward_func/std": 1.1316810846328735, "sampling/importance_sampling_ratio/max": 1.034318208694458, "sampling/importance_sampling_ratio/mean": 0.725278377532959, "sampling/importance_sampling_ratio/min": 3.443829336902127e-05, "sampling/sampling_logp_difference/max": 1.9861396551132202, "sampling/sampling_logp_difference/mean": 0.2393290400505066, "step": 1927, "step_time": 14.443784529998084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4164121448993683, "epoch": 0.00964, "grad_norm": 0.1809655874967575, "kl": 0.24803731963038445, "learning_rate": 7.999668730617682e-06, "loss": -0.0485, "step": 1928, "step_time": 6.796628195006633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 4.700000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2119148671627045, "epoch": 0.009645, "frac_reward_zero_std": 0.0, "grad_norm": 0.21543854475021362, "kl": 0.38798851519823074, "learning_rate": 7.999668380352543e-06, "loss": -0.0701, "num_tokens": 25967230.0, "reward": 0.3067888021469116, "reward_std": 1.5729387998580933, "rewards/rollout_reward_func/mean": 0.3067888021469116, "rewards/rollout_reward_func/std": 1.5729389190673828, "sampling/importance_sampling_ratio/max": 1.0138421058654785, "sampling/importance_sampling_ratio/mean": 0.529394268989563, "sampling/importance_sampling_ratio/min": 0.0008241962641477585, "sampling/sampling_logp_difference/max": 1.9160513877868652, "sampling/sampling_logp_difference/mean": 0.32021641731262207, "step": 1929, "step_time": 13.444683263995103 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.1941079199314117, "epoch": 0.00965, "grad_norm": 0.11911793798208237, "kl": 0.40412719920277596, "learning_rate": 7.999668029902336e-06, "loss": -0.0715, "step": 1930, "step_time": 5.695009784001741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7750077489763498, "epoch": 0.009655, "frac_reward_zero_std": 0.0, "grad_norm": 0.025269849225878716, "kl": 0.22816983237862587, "learning_rate": 7.999667679267063e-06, "loss": -0.103, "num_tokens": 25997676.0, "reward": 0.374394029378891, "reward_std": 1.4034438133239746, "rewards/rollout_reward_func/mean": 0.374394029378891, "rewards/rollout_reward_func/std": 1.4034438133239746, "sampling/importance_sampling_ratio/max": 1.0982800722122192, "sampling/importance_sampling_ratio/mean": 0.5787426233291626, "sampling/importance_sampling_ratio/min": 0.00015168088430073112, "sampling/sampling_logp_difference/max": 1.8883624076843262, "sampling/sampling_logp_difference/mean": 0.34257644414901733, "step": 1931, "step_time": 15.858653102011885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7703111302107573, "epoch": 0.00966, "grad_norm": 0.022790485993027687, "kl": 0.2264961525797844, "learning_rate": 7.99966732844672e-06, "loss": -0.103, "step": 1932, "step_time": 6.435762826004066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7157838456332684, "epoch": 0.009665, "frac_reward_zero_std": 0.0, "grad_norm": 0.08964837342500687, "kl": 0.2300345115363598, "learning_rate": 7.999666977441314e-06, "loss": -0.0792, "num_tokens": 26025414.0, "reward": -0.12960755825042725, "reward_std": 0.9424342513084412, "rewards/rollout_reward_func/mean": -0.12960755825042725, "rewards/rollout_reward_func/std": 0.9424342513084412, "sampling/importance_sampling_ratio/max": 1.0309563875198364, "sampling/importance_sampling_ratio/mean": 0.6934256553649902, "sampling/importance_sampling_ratio/min": 1.5027037079562433e-05, "sampling/sampling_logp_difference/max": 1.5299965143203735, "sampling/sampling_logp_difference/mean": 0.2987386882305145, "step": 1933, "step_time": 14.39742419499089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7122944556176662, "epoch": 0.00967, "grad_norm": 0.06001325696706772, "kl": 0.2312304861843586, "learning_rate": 7.99966662625084e-06, "loss": -0.0796, "step": 1934, "step_time": 6.37518849800108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6754686832427979, "epoch": 0.009675, "frac_reward_zero_std": 0.0, "grad_norm": 0.025173816829919815, "kl": 0.29784077405929565, "learning_rate": 7.999666274875298e-06, "loss": -0.0815, "num_tokens": 26057585.0, "reward": 1.0827505588531494, "reward_std": 1.0983678102493286, "rewards/rollout_reward_func/mean": 1.0827505588531494, "rewards/rollout_reward_func/std": 1.0983678102493286, "sampling/importance_sampling_ratio/max": 1.1122726202011108, "sampling/importance_sampling_ratio/mean": 0.7089661359786987, "sampling/importance_sampling_ratio/min": 1.1666985301417299e-05, "sampling/sampling_logp_difference/max": 1.5289839506149292, "sampling/sampling_logp_difference/mean": 0.2959802746772766, "step": 1935, "step_time": 15.61610308602394 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.005434782709926367, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005434782709926367, "entropy": 1.6694265305995941, "epoch": 0.00968, "grad_norm": 0.0287308506667614, "kl": 0.30737903341650963, "learning_rate": 7.999665923314688e-06, "loss": -0.0815, "step": 1936, "step_time": 6.532046194988652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4910370744764805, "epoch": 0.009685, "frac_reward_zero_std": 0.0, "grad_norm": 0.04488549754023552, "kl": 0.2514856271445751, "learning_rate": 7.999665571569013e-06, "loss": -0.064, "num_tokens": 26088307.0, "reward": 0.5520747900009155, "reward_std": 1.1945687532424927, "rewards/rollout_reward_func/mean": 0.5520747900009155, "rewards/rollout_reward_func/std": 1.1945688724517822, "sampling/importance_sampling_ratio/max": 1.0754982233047485, "sampling/importance_sampling_ratio/mean": 0.7326767444610596, "sampling/importance_sampling_ratio/min": 7.057657853692945e-07, "sampling/sampling_logp_difference/max": 1.714038610458374, "sampling/sampling_logp_difference/mean": 0.31825610995292664, "step": 1937, "step_time": 17.965139928011922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4855427648872137, "epoch": 0.00969, "grad_norm": 0.053642675280570984, "kl": 0.24898360669612885, "learning_rate": 7.999665219638272e-06, "loss": -0.0641, "step": 1938, "step_time": 7.899723955008085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7741452306509018, "epoch": 0.009695, "frac_reward_zero_std": 0.0, "grad_norm": 0.12735599279403687, "kl": 0.4092333987355232, "learning_rate": 7.999664867522463e-06, "loss": -0.0619, "num_tokens": 26112578.0, "reward": 0.9380888342857361, "reward_std": 1.3481593132019043, "rewards/rollout_reward_func/mean": 0.9380888342857361, "rewards/rollout_reward_func/std": 1.3481593132019043, "sampling/importance_sampling_ratio/max": 1.0348169803619385, "sampling/importance_sampling_ratio/mean": 0.7316994667053223, "sampling/importance_sampling_ratio/min": 2.6024434191640466e-05, "sampling/sampling_logp_difference/max": 1.8988322019577026, "sampling/sampling_logp_difference/mean": 0.34481552243232727, "step": 1939, "step_time": 13.191754144005245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7785268481820822, "epoch": 0.0097, "grad_norm": 0.12355875968933105, "kl": 0.41021475940942764, "learning_rate": 7.999664515221587e-06, "loss": -0.0618, "step": 1940, "step_time": 5.6467593319976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9427471039816737, "epoch": 0.009705, "frac_reward_zero_std": 0.5, "grad_norm": 0.03643106296658516, "kl": 0.24020874127745628, "learning_rate": 7.999664162735645e-06, "loss": -0.0412, "num_tokens": 26138636.0, "reward": 1.0555392503738403, "reward_std": 1.307839035987854, "rewards/rollout_reward_func/mean": 1.0555392503738403, "rewards/rollout_reward_func/std": 1.3078389167785645, "sampling/importance_sampling_ratio/max": 1.047936201095581, "sampling/importance_sampling_ratio/mean": 0.7807626128196716, "sampling/importance_sampling_ratio/min": 0.00015805111615918577, "sampling/sampling_logp_difference/max": 1.302614450454712, "sampling/sampling_logp_difference/mean": 0.16129399836063385, "step": 1941, "step_time": 15.764252627981477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.944157786667347, "epoch": 0.00971, "grad_norm": 0.03720393776893616, "kl": 0.2400825135409832, "learning_rate": 7.999663810064636e-06, "loss": -0.0412, "step": 1942, "step_time": 6.40124063201074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 6.083333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9395570904016495, "epoch": 0.009715, "frac_reward_zero_std": 0.0, "grad_norm": 0.07637787610292435, "kl": 0.6282093413174152, "learning_rate": 7.999663457208559e-06, "loss": -0.0806, "num_tokens": 26162931.0, "reward": 0.5371158123016357, "reward_std": 1.498436689376831, "rewards/rollout_reward_func/mean": 0.5371158123016357, "rewards/rollout_reward_func/std": 1.4984368085861206, "sampling/importance_sampling_ratio/max": 1.0357660055160522, "sampling/importance_sampling_ratio/mean": 0.531101644039154, "sampling/importance_sampling_ratio/min": 1.1639240256045014e-06, "sampling/sampling_logp_difference/max": 2.030106544494629, "sampling/sampling_logp_difference/mean": 0.3381829857826233, "step": 1943, "step_time": 14.798414487973787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.942364789545536, "epoch": 0.00972, "grad_norm": 0.06983170658349991, "kl": 0.5767340771853924, "learning_rate": 7.999663104167417e-06, "loss": -0.081, "step": 1944, "step_time": 5.832316910018562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.125, "completions/mean_terminated_length": 4.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5515681616961956, "epoch": 0.009725, "frac_reward_zero_std": 0.5, "grad_norm": 0.17632171511650085, "kl": 0.25010307505726814, "learning_rate": 7.999662750941207e-06, "loss": -0.0035, "num_tokens": 26187865.0, "reward": 1.2565953731536865, "reward_std": 1.1650152206420898, "rewards/rollout_reward_func/mean": 1.2565953731536865, "rewards/rollout_reward_func/std": 1.1650152206420898, "sampling/importance_sampling_ratio/max": 1.1494213342666626, "sampling/importance_sampling_ratio/mean": 0.931091845035553, "sampling/importance_sampling_ratio/min": 0.01628118008375168, "sampling/sampling_logp_difference/max": 1.2170625925064087, "sampling/sampling_logp_difference/mean": 0.07600993663072586, "step": 1945, "step_time": 14.664370604979922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.544635035097599, "epoch": 0.00973, "grad_norm": 0.18706031143665314, "kl": 0.2504380792379379, "learning_rate": 7.999662397529931e-06, "loss": -0.0039, "step": 1946, "step_time": 6.551192686994909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 4.6875, "completions/mean_terminated_length": 4.6875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4107564315199852, "epoch": 0.009735, "frac_reward_zero_std": 0.5, "grad_norm": 0.03920506685972214, "kl": 0.5158456191420555, "learning_rate": 7.999662043933588e-06, "loss": -0.0366, "num_tokens": 26211069.0, "reward": 1.760541558265686, "reward_std": 0.6267579197883606, "rewards/rollout_reward_func/mean": 1.760541558265686, "rewards/rollout_reward_func/std": 0.6267579793930054, "sampling/importance_sampling_ratio/max": 1.0196009874343872, "sampling/importance_sampling_ratio/mean": 0.9459416270256042, "sampling/importance_sampling_ratio/min": 0.0003438675485085696, "sampling/sampling_logp_difference/max": 1.530397653579712, "sampling/sampling_logp_difference/mean": 0.09470711648464203, "step": 1947, "step_time": 13.074138999989373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4068409241735935, "epoch": 0.00974, "grad_norm": 0.04495516046881676, "kl": 0.551925852894783, "learning_rate": 7.999661690152178e-06, "loss": -0.0363, "step": 1948, "step_time": 6.845085794993793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1024293527007103, "epoch": 0.009745, "frac_reward_zero_std": 0.5, "grad_norm": 0.052215371280908585, "kl": 0.2426232434809208, "learning_rate": 7.999661336185702e-06, "loss": -0.0362, "num_tokens": 26241113.0, "reward": 1.0325779914855957, "reward_std": 1.1803995370864868, "rewards/rollout_reward_func/mean": 1.0325779914855957, "rewards/rollout_reward_func/std": 1.1803996562957764, "sampling/importance_sampling_ratio/max": 1.0890936851501465, "sampling/importance_sampling_ratio/mean": 0.8345825672149658, "sampling/importance_sampling_ratio/min": 3.7111719848326175e-06, "sampling/sampling_logp_difference/max": 1.913425087928772, "sampling/sampling_logp_difference/mean": 0.16977286338806152, "step": 1949, "step_time": 18.859221620994504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1013146676123142, "epoch": 0.00975, "grad_norm": 0.043407391756772995, "kl": 0.24244758859276772, "learning_rate": 7.99966098203416e-06, "loss": -0.0361, "step": 1950, "step_time": 8.027428751986008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.24833167158067226, "epoch": 0.009755, "frac_reward_zero_std": 0.5, "grad_norm": 0.26711586117744446, "kl": 0.2979557439684868, "learning_rate": 7.999660627697551e-06, "loss": 0.0253, "num_tokens": 26259946.0, "reward": 0.4895516037940979, "reward_std": 1.4579116106033325, "rewards/rollout_reward_func/mean": 0.4895516037940979, "rewards/rollout_reward_func/std": 1.4579116106033325, "sampling/importance_sampling_ratio/max": 1.0328421592712402, "sampling/importance_sampling_ratio/mean": 0.980863630771637, "sampling/importance_sampling_ratio/min": 0.3857874274253845, "sampling/sampling_logp_difference/max": 0.32006990909576416, "sampling/sampling_logp_difference/mean": 0.020455386489629745, "step": 1951, "step_time": 6.168754966987763 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.24661420658230782, "epoch": 0.00976, "grad_norm": 0.11880913376808167, "kl": 0.2977295406162739, "learning_rate": 7.999660273175875e-06, "loss": 0.0239, "step": 1952, "step_time": 3.4155035330040846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1563077569007874, "epoch": 0.009765, "frac_reward_zero_std": 0.0, "grad_norm": 0.47827717661857605, "kl": 1.3264057748019695, "learning_rate": 7.999659918469135e-06, "loss": -0.0655, "num_tokens": 26288314.0, "reward": 0.09933306276798248, "reward_std": 1.261109471321106, "rewards/rollout_reward_func/mean": 0.09933306276798248, "rewards/rollout_reward_func/std": 1.261109471321106, "sampling/importance_sampling_ratio/max": 1.1317516565322876, "sampling/importance_sampling_ratio/mean": 0.5990923047065735, "sampling/importance_sampling_ratio/min": 3.265642760652554e-07, "sampling/sampling_logp_difference/max": 1.873070478439331, "sampling/sampling_logp_difference/mean": 0.3966194987297058, "step": 1953, "step_time": 15.300809358988772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.059375000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.059375000186264515, "entropy": 2.2050396725535393, "epoch": 0.00977, "grad_norm": 0.27754122018814087, "kl": 1.2145683728158474, "learning_rate": 7.999659563577324e-06, "loss": -0.0694, "step": 1954, "step_time": 6.43040711597132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4673971831798553, "epoch": 0.009775, "frac_reward_zero_std": 0.5, "grad_norm": 0.059458933770656586, "kl": 0.18795348331332207, "learning_rate": 7.99965920850045e-06, "loss": -0.0295, "num_tokens": 26311751.0, "reward": 1.0890088081359863, "reward_std": 1.2121580839157104, "rewards/rollout_reward_func/mean": 1.0890088081359863, "rewards/rollout_reward_func/std": 1.212157964706421, "sampling/importance_sampling_ratio/max": 1.1198196411132812, "sampling/importance_sampling_ratio/mean": 0.6915218234062195, "sampling/importance_sampling_ratio/min": 0.00017650079098530114, "sampling/sampling_logp_difference/max": 1.8317149877548218, "sampling/sampling_logp_difference/mean": 0.23823405802249908, "step": 1955, "step_time": 14.69229778299632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.490383394062519, "epoch": 0.00978, "grad_norm": 0.09362155944108963, "kl": 0.18950921297073364, "learning_rate": 7.99965885323851e-06, "loss": -0.0293, "step": 1956, "step_time": 6.0592077600013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.844205716624856, "epoch": 0.009785, "frac_reward_zero_std": 0.5, "grad_norm": 0.02454203926026821, "kl": 0.33398857340216637, "learning_rate": 7.999658497791501e-06, "loss": -0.0256, "num_tokens": 26337931.0, "reward": 0.8734192848205566, "reward_std": 1.319342851638794, "rewards/rollout_reward_func/mean": 0.8734192848205566, "rewards/rollout_reward_func/std": 1.319342851638794, "sampling/importance_sampling_ratio/max": 1.1420862674713135, "sampling/importance_sampling_ratio/mean": 0.8387904167175293, "sampling/importance_sampling_ratio/min": 0.0003150775737594813, "sampling/sampling_logp_difference/max": 1.5607984066009521, "sampling/sampling_logp_difference/mean": 0.13571155071258545, "step": 1957, "step_time": 16.766246308005066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.856356093659997, "epoch": 0.00979, "grad_norm": 0.06215677037835121, "kl": 0.32621343433856964, "learning_rate": 7.999658142159428e-06, "loss": -0.0258, "step": 1958, "step_time": 6.840239726996515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 6.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3365646600723267, "epoch": 0.009795, "frac_reward_zero_std": 0.0, "grad_norm": 0.32394590973854065, "kl": 0.7231881134212017, "learning_rate": 7.999657786342286e-06, "loss": -0.0732, "num_tokens": 26371329.0, "reward": -0.035404130816459656, "reward_std": 1.135019302368164, "rewards/rollout_reward_func/mean": -0.035404130816459656, "rewards/rollout_reward_func/std": 1.135019302368164, "sampling/importance_sampling_ratio/max": 1.0377428531646729, "sampling/importance_sampling_ratio/mean": 0.4405791759490967, "sampling/importance_sampling_ratio/min": 2.1775188319139716e-09, "sampling/sampling_logp_difference/max": 1.9476284980773926, "sampling/sampling_logp_difference/mean": 0.43092870712280273, "step": 1959, "step_time": 17.208597201984958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.373567968606949, "epoch": 0.0098, "grad_norm": 0.32580092549324036, "kl": 0.6545960009098053, "learning_rate": 7.99965743034008e-06, "loss": -0.0743, "step": 1960, "step_time": 6.89102590798575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8912755325436592, "epoch": 0.009805, "frac_reward_zero_std": 0.5, "grad_norm": 0.04116338863968849, "kl": 0.1762324944138527, "learning_rate": 7.999657074152804e-06, "loss": -0.0057, "num_tokens": 26390144.0, "reward": 0.5139963030815125, "reward_std": 1.5413120985031128, "rewards/rollout_reward_func/mean": 0.5139963030815125, "rewards/rollout_reward_func/std": 1.5413120985031128, "sampling/importance_sampling_ratio/max": 1.0152212381362915, "sampling/importance_sampling_ratio/mean": 0.799384355545044, "sampling/importance_sampling_ratio/min": 8.891299512470141e-05, "sampling/sampling_logp_difference/max": 2.0596418380737305, "sampling/sampling_logp_difference/mean": 0.16200056672096252, "step": 1961, "step_time": 11.52661560101842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8999962732195854, "epoch": 0.00981, "grad_norm": 0.04411986097693443, "kl": 0.1739225760102272, "learning_rate": 7.999656717780465e-06, "loss": -0.0056, "step": 1962, "step_time": 5.12658969599579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.216394357383251, "epoch": 0.009815, "frac_reward_zero_std": 0.0, "grad_norm": 0.05268179625272751, "kl": 0.15894510876387358, "learning_rate": 7.999656361223059e-06, "loss": -0.1038, "num_tokens": 26423323.0, "reward": 0.37024927139282227, "reward_std": 1.3797882795333862, "rewards/rollout_reward_func/mean": 0.37024927139282227, "rewards/rollout_reward_func/std": 1.3797882795333862, "sampling/importance_sampling_ratio/max": 1.0261237621307373, "sampling/importance_sampling_ratio/mean": 0.5089558362960815, "sampling/importance_sampling_ratio/min": 2.966430201922776e-06, "sampling/sampling_logp_difference/max": 1.4395761489868164, "sampling/sampling_logp_difference/mean": 0.4078528583049774, "step": 1963, "step_time": 18.730742168991128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.213285081088543, "epoch": 0.00982, "grad_norm": 0.05389200896024704, "kl": 0.15882336534559727, "learning_rate": 7.999656004480588e-06, "loss": -0.1038, "step": 1964, "step_time": 7.880690291975043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 5.099999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9218767881393433, "epoch": 0.009825, "frac_reward_zero_std": 0.0, "grad_norm": 0.16790036857128143, "kl": 1.111644845455885, "learning_rate": 7.999655647553049e-06, "loss": -0.0651, "num_tokens": 26449621.0, "reward": 0.40789568424224854, "reward_std": 1.3563047647476196, "rewards/rollout_reward_func/mean": 0.40789568424224854, "rewards/rollout_reward_func/std": 1.3563047647476196, "sampling/importance_sampling_ratio/max": 1.083624005317688, "sampling/importance_sampling_ratio/mean": 0.3975757658481598, "sampling/importance_sampling_ratio/min": 5.102428531245096e-06, "sampling/sampling_logp_difference/max": 1.939924955368042, "sampling/sampling_logp_difference/mean": 0.5035005807876587, "step": 1965, "step_time": 14.251772869989509 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 2.9315566420555115, "epoch": 0.00983, "grad_norm": 0.11469445377588272, "kl": 0.9001461416482925, "learning_rate": 7.999655290440444e-06, "loss": -0.0659, "step": 1966, "step_time": 6.198453517004964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.625, "completions/mean_terminated_length": 7.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.825958013534546, "epoch": 0.009835, "frac_reward_zero_std": 0.0, "grad_norm": 0.08697161078453064, "kl": 0.1649402752518654, "learning_rate": 7.999654933142772e-06, "loss": -0.0846, "num_tokens": 26478753.0, "reward": -0.589034378528595, "reward_std": 0.8748562335968018, "rewards/rollout_reward_func/mean": -0.589034378528595, "rewards/rollout_reward_func/std": 0.8748563528060913, "sampling/importance_sampling_ratio/max": 1.0594252347946167, "sampling/importance_sampling_ratio/mean": 0.3038296103477478, "sampling/importance_sampling_ratio/min": 7.869483056310855e-07, "sampling/sampling_logp_difference/max": 1.9914788007736206, "sampling/sampling_logp_difference/mean": 0.4433272182941437, "step": 1967, "step_time": 17.907889394002268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.819526582956314, "epoch": 0.00984, "grad_norm": 0.08811511099338531, "kl": 0.16033857222646475, "learning_rate": 7.999654575660037e-06, "loss": -0.0845, "step": 1968, "step_time": 6.784847142000217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 5.466667175292969, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9397158920764923, "epoch": 0.009845, "frac_reward_zero_std": 0.0, "grad_norm": 0.08301670104265213, "kl": 0.39117156341671944, "learning_rate": 7.999654217992233e-06, "loss": -0.0935, "num_tokens": 26509297.0, "reward": 0.8404189348220825, "reward_std": 1.1904878616333008, "rewards/rollout_reward_func/mean": 0.8404189348220825, "rewards/rollout_reward_func/std": 1.1904878616333008, "sampling/importance_sampling_ratio/max": 1.2159922122955322, "sampling/importance_sampling_ratio/mean": 0.7260074019432068, "sampling/importance_sampling_ratio/min": 1.4182202903612051e-05, "sampling/sampling_logp_difference/max": 2.374648332595825, "sampling/sampling_logp_difference/mean": 0.3965591788291931, "step": 1969, "step_time": 14.763191536985687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9416645281016827, "epoch": 0.00985, "grad_norm": 0.06463813781738281, "kl": 0.3942416310310364, "learning_rate": 7.999653860139363e-06, "loss": -0.0937, "step": 1970, "step_time": 6.735650088012335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6917117610573769, "epoch": 0.009855, "frac_reward_zero_std": 0.0, "grad_norm": 0.06496118009090424, "kl": 0.2097970936447382, "learning_rate": 7.999653502101428e-06, "loss": -0.0982, "num_tokens": 26538955.0, "reward": 0.5035857558250427, "reward_std": 1.3384568691253662, "rewards/rollout_reward_func/mean": 0.5035857558250427, "rewards/rollout_reward_func/std": 1.3384568691253662, "sampling/importance_sampling_ratio/max": 1.1148319244384766, "sampling/importance_sampling_ratio/mean": 0.6377079486846924, "sampling/importance_sampling_ratio/min": 6.029240921634482e-06, "sampling/sampling_logp_difference/max": 2.5813260078430176, "sampling/sampling_logp_difference/mean": 0.31062525510787964, "step": 1971, "step_time": 14.42426908899506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6947743892669678, "epoch": 0.00986, "grad_norm": 0.06307075172662735, "kl": 0.20907400641590357, "learning_rate": 7.999653143878426e-06, "loss": -0.0983, "step": 1972, "step_time": 6.208225674010464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0755086075514555, "epoch": 0.009865, "frac_reward_zero_std": 0.0, "grad_norm": 0.026057545095682144, "kl": 0.26638181507587433, "learning_rate": 7.999652785470358e-06, "loss": -0.0666, "num_tokens": 26570987.0, "reward": 0.49445390701293945, "reward_std": 1.15365469455719, "rewards/rollout_reward_func/mean": 0.49445390701293945, "rewards/rollout_reward_func/std": 1.15365469455719, "sampling/importance_sampling_ratio/max": 1.07926607131958, "sampling/importance_sampling_ratio/mean": 0.7753302454948425, "sampling/importance_sampling_ratio/min": 0.0008258359739556909, "sampling/sampling_logp_difference/max": 1.7740859985351562, "sampling/sampling_logp_difference/mean": 0.20471833646297455, "step": 1973, "step_time": 15.500031015981222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0742761585861444, "epoch": 0.00987, "grad_norm": 0.025324365124106407, "kl": 0.2667756527662277, "learning_rate": 7.999652426877224e-06, "loss": -0.0666, "step": 1974, "step_time": 6.46527966602298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8767929673194885, "epoch": 0.009875, "frac_reward_zero_std": 0.5, "grad_norm": 0.010868663899600506, "kl": 0.318644218146801, "learning_rate": 7.999652068099024e-06, "loss": -0.0191, "num_tokens": 26595778.0, "reward": -0.8331286907196045, "reward_std": 0.19923412799835205, "rewards/rollout_reward_func/mean": -0.8331286907196045, "rewards/rollout_reward_func/std": 0.19923411309719086, "sampling/importance_sampling_ratio/max": 1.0323774814605713, "sampling/importance_sampling_ratio/mean": 0.5755142569541931, "sampling/importance_sampling_ratio/min": 1.5829826224944554e-05, "sampling/sampling_logp_difference/max": 2.005430221557617, "sampling/sampling_logp_difference/mean": 0.28138357400894165, "step": 1975, "step_time": 16.84807354400982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8725037574768066, "epoch": 0.00988, "grad_norm": 0.010848336853086948, "kl": 0.32098815590143204, "learning_rate": 7.999651709135758e-06, "loss": -0.0191, "step": 1976, "step_time": 6.427765849992284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.435656815767288, "epoch": 0.009885, "frac_reward_zero_std": 0.0, "grad_norm": 0.1260477900505066, "kl": 0.3157920017838478, "learning_rate": 7.999651349987427e-06, "loss": -0.0897, "num_tokens": 26618059.0, "reward": 0.8356813192367554, "reward_std": 1.3520612716674805, "rewards/rollout_reward_func/mean": 0.8356813192367554, "rewards/rollout_reward_func/std": 1.35206139087677, "sampling/importance_sampling_ratio/max": 1.1225802898406982, "sampling/importance_sampling_ratio/mean": 0.7069730758666992, "sampling/importance_sampling_ratio/min": 3.161980544064136e-07, "sampling/sampling_logp_difference/max": 2.191824436187744, "sampling/sampling_logp_difference/mean": 0.45867401361465454, "step": 1977, "step_time": 15.861911471991334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.428146868944168, "epoch": 0.00989, "grad_norm": 0.11082587391138077, "kl": 0.28511475399136543, "learning_rate": 7.99965099065403e-06, "loss": -0.0903, "step": 1978, "step_time": 6.257712454011198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 8.333333969116211, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1437992453575134, "epoch": 0.009895, "frac_reward_zero_std": 0.0, "grad_norm": 0.020284991711378098, "kl": 0.15036046132445335, "learning_rate": 7.999650631135566e-06, "loss": -0.0933, "num_tokens": 26646447.0, "reward": -0.0559375062584877, "reward_std": 1.392457127571106, "rewards/rollout_reward_func/mean": -0.0559375062584877, "rewards/rollout_reward_func/std": 1.392457127571106, "sampling/importance_sampling_ratio/max": 1.0174405574798584, "sampling/importance_sampling_ratio/mean": 0.33532899618148804, "sampling/importance_sampling_ratio/min": 4.0676599155631266e-07, "sampling/sampling_logp_difference/max": 2.1550183296203613, "sampling/sampling_logp_difference/mean": 0.4785979986190796, "step": 1979, "step_time": 14.402634302023216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1468769907951355, "epoch": 0.0099, "grad_norm": 0.020403118804097176, "kl": 0.15198657289147377, "learning_rate": 7.999650271432035e-06, "loss": -0.0933, "step": 1980, "step_time": 6.423174465002376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.567755937576294, "epoch": 0.009905, "frac_reward_zero_std": 0.5, "grad_norm": 0.011158834211528301, "kl": 0.27161780931055546, "learning_rate": 7.999649911543442e-06, "loss": -0.0487, "num_tokens": 26673793.0, "reward": -0.5181494951248169, "reward_std": 0.9053601622581482, "rewards/rollout_reward_func/mean": -0.5181494951248169, "rewards/rollout_reward_func/std": 0.9053601622581482, "sampling/importance_sampling_ratio/max": 1.0320099592208862, "sampling/importance_sampling_ratio/mean": 0.6405209898948669, "sampling/importance_sampling_ratio/min": 2.587145718280226e-05, "sampling/sampling_logp_difference/max": 1.508873462677002, "sampling/sampling_logp_difference/mean": 0.22031770646572113, "step": 1981, "step_time": 18.239140940000652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.561266303062439, "epoch": 0.00991, "grad_norm": 0.010982475243508816, "kl": 0.2665325775742531, "learning_rate": 7.99964955146978e-06, "loss": -0.0488, "step": 1982, "step_time": 7.007825649983715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.679823875427246, "epoch": 0.009915, "frac_reward_zero_std": 0.5, "grad_norm": 0.013282149098813534, "kl": 0.2755502425134182, "learning_rate": 7.999649191211052e-06, "loss": -0.0268, "num_tokens": 26701787.0, "reward": 0.724331259727478, "reward_std": 1.5091071128845215, "rewards/rollout_reward_func/mean": 0.724331259727478, "rewards/rollout_reward_func/std": 1.5091071128845215, "sampling/importance_sampling_ratio/max": 1.0962111949920654, "sampling/importance_sampling_ratio/mean": 0.7004822492599487, "sampling/importance_sampling_ratio/min": 3.291202665423043e-05, "sampling/sampling_logp_difference/max": 1.9235106706619263, "sampling/sampling_logp_difference/mean": 0.23913873732089996, "step": 1983, "step_time": 18.81449171100394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6702111661434174, "epoch": 0.00992, "grad_norm": 0.011915960349142551, "kl": 0.26261430978775024, "learning_rate": 7.999648830767258e-06, "loss": -0.0269, "step": 1984, "step_time": 7.36914557601267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.025888055562973, "epoch": 0.009925, "frac_reward_zero_std": 0.0, "grad_norm": 0.031136754900217056, "kl": 0.38982383348047733, "learning_rate": 7.999648470138399e-06, "loss": -0.0738, "num_tokens": 26736403.0, "reward": 0.22467553615570068, "reward_std": 1.216208815574646, "rewards/rollout_reward_func/mean": 0.22467553615570068, "rewards/rollout_reward_func/std": 1.216208815574646, "sampling/importance_sampling_ratio/max": 1.0795150995254517, "sampling/importance_sampling_ratio/mean": 0.5938835144042969, "sampling/importance_sampling_ratio/min": 2.6302452624804573e-06, "sampling/sampling_logp_difference/max": 1.950096607208252, "sampling/sampling_logp_difference/mean": 0.35982924699783325, "step": 1985, "step_time": 19.774126253003487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.026140958070755, "epoch": 0.00993, "grad_norm": 0.031439974904060364, "kl": 0.38030524365603924, "learning_rate": 7.999648109324476e-06, "loss": -0.0739, "step": 1986, "step_time": 8.151311970024835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.579963356256485, "epoch": 0.009935, "frac_reward_zero_std": 0.5, "grad_norm": 0.02185310609638691, "kl": 0.3098388873040676, "learning_rate": 7.999647748325485e-06, "loss": -0.0325, "num_tokens": 26761135.0, "reward": 0.926506757736206, "reward_std": 1.3968991041183472, "rewards/rollout_reward_func/mean": 0.926506757736206, "rewards/rollout_reward_func/std": 1.3968991041183472, "sampling/importance_sampling_ratio/max": 1.0647499561309814, "sampling/importance_sampling_ratio/mean": 0.7594654560089111, "sampling/importance_sampling_ratio/min": 0.00026427971897646785, "sampling/sampling_logp_difference/max": 1.5809671878814697, "sampling/sampling_logp_difference/mean": 0.21426749229431152, "step": 1987, "step_time": 15.703806989986333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.583919495344162, "epoch": 0.00994, "grad_norm": 0.02480350062251091, "kl": 0.3309475965797901, "learning_rate": 7.999647387141429e-06, "loss": -0.0324, "step": 1988, "step_time": 6.408771260001231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 5.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.372610330581665, "epoch": 0.009945, "frac_reward_zero_std": 0.0, "grad_norm": 0.0365794338285923, "kl": 0.2877213656902313, "learning_rate": 7.999647025772306e-06, "loss": -0.0628, "num_tokens": 26790269.0, "reward": 0.5080071687698364, "reward_std": 1.2821323871612549, "rewards/rollout_reward_func/mean": 0.5080071687698364, "rewards/rollout_reward_func/std": 1.2821322679519653, "sampling/importance_sampling_ratio/max": 1.1035088300704956, "sampling/importance_sampling_ratio/mean": 0.7033225297927856, "sampling/importance_sampling_ratio/min": 5.1706059878142696e-08, "sampling/sampling_logp_difference/max": 1.8511302471160889, "sampling/sampling_logp_difference/mean": 0.48058074712753296, "step": 1989, "step_time": 14.606930162015487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3675405085086823, "epoch": 0.00995, "grad_norm": 0.0375748835504055, "kl": 0.28857339918613434, "learning_rate": 7.999646664218119e-06, "loss": -0.0628, "step": 1990, "step_time": 6.225939923009719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1711413841694593, "epoch": 0.009955, "frac_reward_zero_std": 0.0, "grad_norm": 0.04570819437503815, "kl": 0.3211113102734089, "learning_rate": 7.999646302478865e-06, "loss": -0.102, "num_tokens": 26822747.0, "reward": 0.4340655207633972, "reward_std": 1.2638766765594482, "rewards/rollout_reward_func/mean": 0.4340655207633972, "rewards/rollout_reward_func/std": 1.2638767957687378, "sampling/importance_sampling_ratio/max": 1.0915218591690063, "sampling/importance_sampling_ratio/mean": 0.5789909362792969, "sampling/importance_sampling_ratio/min": 2.2807585992268287e-05, "sampling/sampling_logp_difference/max": 2.010725498199463, "sampling/sampling_logp_difference/mean": 0.39925771951675415, "step": 1991, "step_time": 16.05994260800071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1768073067069054, "epoch": 0.00996, "grad_norm": 0.04560471326112747, "kl": 0.3261595033109188, "learning_rate": 7.999645940554546e-06, "loss": -0.1021, "step": 1992, "step_time": 7.1171169050066965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5288281738758087, "epoch": 0.009965, "frac_reward_zero_std": 0.0, "grad_norm": 0.10043086856603622, "kl": 0.23961328342556953, "learning_rate": 7.999645578445163e-06, "loss": -0.1008, "num_tokens": 26856233.0, "reward": 0.3895757496356964, "reward_std": 1.2428889274597168, "rewards/rollout_reward_func/mean": 0.3895757496356964, "rewards/rollout_reward_func/std": 1.2428890466690063, "sampling/importance_sampling_ratio/max": 1.092700481414795, "sampling/importance_sampling_ratio/mean": 0.5588033199310303, "sampling/importance_sampling_ratio/min": 7.968395010493623e-08, "sampling/sampling_logp_difference/max": 2.059264659881592, "sampling/sampling_logp_difference/mean": 0.43118155002593994, "step": 1993, "step_time": 17.27560289601388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.535926565527916, "epoch": 0.00997, "grad_norm": 0.09387068450450897, "kl": 0.2413116917014122, "learning_rate": 7.999645216150713e-06, "loss": -0.1008, "step": 1994, "step_time": 6.992278103003628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 8.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0718077421188354, "epoch": 0.009975, "frac_reward_zero_std": 0.0, "grad_norm": 0.01763138361275196, "kl": 0.23937151581048965, "learning_rate": 7.999644853671198e-06, "loss": -0.105, "num_tokens": 26885307.0, "reward": 0.034551724791526794, "reward_std": 1.4121334552764893, "rewards/rollout_reward_func/mean": 0.034551724791526794, "rewards/rollout_reward_func/std": 1.4121335744857788, "sampling/importance_sampling_ratio/max": 1.0146340131759644, "sampling/importance_sampling_ratio/mean": 0.37926509976387024, "sampling/importance_sampling_ratio/min": 2.784301614155993e-06, "sampling/sampling_logp_difference/max": 2.3350987434387207, "sampling/sampling_logp_difference/mean": 0.48994696140289307, "step": 1995, "step_time": 15.836185971988016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0764638781547546, "epoch": 0.00998, "grad_norm": 0.01740233041346073, "kl": 0.23692375607788563, "learning_rate": 7.999644491006616e-06, "loss": -0.1051, "step": 1996, "step_time": 6.331924361991696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2014675084501505, "epoch": 0.009985, "frac_reward_zero_std": 0.0, "grad_norm": 0.014697231352329254, "kl": 0.2753970557823777, "learning_rate": 7.999644128156968e-06, "loss": -0.0887, "num_tokens": 26917388.0, "reward": 0.8500995635986328, "reward_std": 1.2492644786834717, "rewards/rollout_reward_func/mean": 0.8500995635986328, "rewards/rollout_reward_func/std": 1.2492645978927612, "sampling/importance_sampling_ratio/max": 1.0705422163009644, "sampling/importance_sampling_ratio/mean": 0.7044427990913391, "sampling/importance_sampling_ratio/min": 0.00016356752894353122, "sampling/sampling_logp_difference/max": 1.6793508529663086, "sampling/sampling_logp_difference/mean": 0.2508302330970764, "step": 1997, "step_time": 16.227088153988007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2035385631024837, "epoch": 0.00999, "grad_norm": 0.014116755686700344, "kl": 0.27512999530881643, "learning_rate": 7.999643765122257e-06, "loss": -0.0887, "step": 1998, "step_time": 7.433452989978832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2676712572574615, "epoch": 0.009995, "frac_reward_zero_std": 0.0, "grad_norm": 0.08835691958665848, "kl": 0.25567711517214775, "learning_rate": 7.99964340190248e-06, "loss": -0.0976, "num_tokens": 26947536.0, "reward": 0.5534782409667969, "reward_std": 1.2533038854599, "rewards/rollout_reward_func/mean": 0.5534782409667969, "rewards/rollout_reward_func/std": 1.2533040046691895, "sampling/importance_sampling_ratio/max": 1.057546615600586, "sampling/importance_sampling_ratio/mean": 0.5846561789512634, "sampling/importance_sampling_ratio/min": 9.989923273678869e-05, "sampling/sampling_logp_difference/max": 2.059035301208496, "sampling/sampling_logp_difference/mean": 0.3266305923461914, "step": 1999, "step_time": 15.232567835992086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2720328271389008, "epoch": 0.01, "grad_norm": 0.08456602692604065, "kl": 0.2539852671325207, "learning_rate": 7.999643038497635e-06, "loss": -0.0981, "step": 2000, "step_time": 6.230605002027005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 7.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9137219190597534, "epoch": 0.010005, "frac_reward_zero_std": 0.0, "grad_norm": 0.4425029158592224, "kl": 0.09055280638858676, "learning_rate": 7.999642674907728e-06, "loss": -0.0627, "num_tokens": 26981153.0, "reward": -0.7647054195404053, "reward_std": 0.7663540244102478, "rewards/rollout_reward_func/mean": -0.7647054195404053, "rewards/rollout_reward_func/std": 0.7663540244102478, "sampling/importance_sampling_ratio/max": 1.0281950235366821, "sampling/importance_sampling_ratio/mean": 0.1855880618095398, "sampling/importance_sampling_ratio/min": 1.3621004654851276e-05, "sampling/sampling_logp_difference/max": 1.943491816520691, "sampling/sampling_logp_difference/mean": 0.4082169830799103, "step": 2001, "step_time": 18.504668592009693 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.9102298617362976, "epoch": 0.01001, "grad_norm": 0.06414932757616043, "kl": 0.09066157042980194, "learning_rate": 7.999642311132752e-06, "loss": -0.0656, "step": 2002, "step_time": 6.512262882999494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 3.8333334922790527, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.112980842590332, "epoch": 0.010015, "frac_reward_zero_std": 0.0, "grad_norm": 0.07509638369083405, "kl": 0.2518348731100559, "learning_rate": 7.999641947172714e-06, "loss": -0.0995, "num_tokens": 27013564.0, "reward": 0.4944525361061096, "reward_std": 1.2110716104507446, "rewards/rollout_reward_func/mean": 0.4944525361061096, "rewards/rollout_reward_func/std": 1.2110717296600342, "sampling/importance_sampling_ratio/max": 1.0274908542633057, "sampling/importance_sampling_ratio/mean": 0.6178637742996216, "sampling/importance_sampling_ratio/min": 5.489935119840084e-07, "sampling/sampling_logp_difference/max": 1.6984009742736816, "sampling/sampling_logp_difference/mean": 0.3863985240459442, "step": 2003, "step_time": 15.912380666995887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1140679121017456, "epoch": 0.01002, "grad_norm": 0.0752517506480217, "kl": 0.25798455998301506, "learning_rate": 7.999641583027608e-06, "loss": -0.0995, "step": 2004, "step_time": 6.917797217014595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.67152738571167, "epoch": 0.010025, "frac_reward_zero_std": 0.5, "grad_norm": 0.20642490684986115, "kl": 0.16600328125059605, "learning_rate": 7.999641218697438e-06, "loss": -0.0409, "num_tokens": 27039705.0, "reward": 0.9014264345169067, "reward_std": 1.3973088264465332, "rewards/rollout_reward_func/mean": 0.9014264345169067, "rewards/rollout_reward_func/std": 1.3973088264465332, "sampling/importance_sampling_ratio/max": 1.0500702857971191, "sampling/importance_sampling_ratio/mean": 0.6781163215637207, "sampling/importance_sampling_ratio/min": 3.269741137046367e-05, "sampling/sampling_logp_difference/max": 1.8734323978424072, "sampling/sampling_logp_difference/mean": 0.21960145235061646, "step": 2005, "step_time": 18.44197937498393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6703183054924011, "epoch": 0.01003, "grad_norm": 0.23025967180728912, "kl": 0.16579446382820606, "learning_rate": 7.999640854182202e-06, "loss": -0.0412, "step": 2006, "step_time": 6.3990383210039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.933333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1312974486500025, "epoch": 0.010035, "frac_reward_zero_std": 0.0, "grad_norm": 0.037116508930921555, "kl": 0.8387691006064415, "learning_rate": 7.999640489481902e-06, "loss": -0.0676, "num_tokens": 27063398.0, "reward": 1.3135619163513184, "reward_std": 1.0571224689483643, "rewards/rollout_reward_func/mean": 1.3135619163513184, "rewards/rollout_reward_func/std": 1.0571224689483643, "sampling/importance_sampling_ratio/max": 1.1021413803100586, "sampling/importance_sampling_ratio/mean": 0.7794100046157837, "sampling/importance_sampling_ratio/min": 3.238366844016127e-05, "sampling/sampling_logp_difference/max": 1.8486087322235107, "sampling/sampling_logp_difference/mean": 0.2567020356655121, "step": 2007, "step_time": 11.49818132299697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1340009775012732, "epoch": 0.01004, "grad_norm": 0.03163105249404907, "kl": 0.7824186310172081, "learning_rate": 7.999640124596536e-06, "loss": -0.0678, "step": 2008, "step_time": 6.350184742012061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.349129905924201, "epoch": 0.010045, "frac_reward_zero_std": 0.0, "grad_norm": 0.06079956144094467, "kl": 0.4620080329477787, "learning_rate": 7.999639759526104e-06, "loss": -0.0762, "num_tokens": 27089693.0, "reward": 0.8497394323348999, "reward_std": 1.2548892498016357, "rewards/rollout_reward_func/mean": 0.8497394323348999, "rewards/rollout_reward_func/std": 1.2548892498016357, "sampling/importance_sampling_ratio/max": 1.0579861402511597, "sampling/importance_sampling_ratio/mean": 0.6696557998657227, "sampling/importance_sampling_ratio/min": 0.0007306889165192842, "sampling/sampling_logp_difference/max": 1.5027246475219727, "sampling/sampling_logp_difference/mean": 0.24000366032123566, "step": 2009, "step_time": 17.77352832700126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3556218966841698, "epoch": 0.01005, "grad_norm": 0.06565281748771667, "kl": 0.4936801455914974, "learning_rate": 7.999639394270607e-06, "loss": -0.0763, "step": 2010, "step_time": 6.474123092018999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5954426359385252, "epoch": 0.010055, "frac_reward_zero_std": 0.5, "grad_norm": 0.02262902446091175, "kl": 0.3289976418018341, "learning_rate": 7.999639028830044e-06, "loss": -0.0379, "num_tokens": 27107109.0, "reward": 1.7392253875732422, "reward_std": 0.1728261560201645, "rewards/rollout_reward_func/mean": 1.7392253875732422, "rewards/rollout_reward_func/std": 0.17282617092132568, "sampling/importance_sampling_ratio/max": 1.0226688385009766, "sampling/importance_sampling_ratio/mean": 0.9519472122192383, "sampling/importance_sampling_ratio/min": 4.4391082809625004e-08, "sampling/sampling_logp_difference/max": 2.1999735832214355, "sampling/sampling_logp_difference/mean": 0.19590644538402557, "step": 2011, "step_time": 6.136100792980869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5970110148191452, "epoch": 0.01006, "grad_norm": 0.023139702156186104, "kl": 0.3289416953921318, "learning_rate": 7.999638663204418e-06, "loss": -0.0379, "step": 2012, "step_time": 3.295799484010786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.274223178625107, "epoch": 0.010065, "frac_reward_zero_std": 0.0, "grad_norm": 0.057789795100688934, "kl": 0.3716767840087414, "learning_rate": 7.999638297393725e-06, "loss": -0.0832, "num_tokens": 27129082.0, "reward": 0.713175892829895, "reward_std": 1.209760069847107, "rewards/rollout_reward_func/mean": 0.713175892829895, "rewards/rollout_reward_func/std": 1.209760069847107, "sampling/importance_sampling_ratio/max": 1.0643465518951416, "sampling/importance_sampling_ratio/mean": 0.5874100923538208, "sampling/importance_sampling_ratio/min": 4.079362980746737e-08, "sampling/sampling_logp_difference/max": 1.9939792156219482, "sampling/sampling_logp_difference/mean": 0.3669578433036804, "step": 2013, "step_time": 12.355613899009768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.271353453397751, "epoch": 0.01007, "grad_norm": 0.10727915912866592, "kl": 0.3452687971293926, "learning_rate": 7.999637931397968e-06, "loss": -0.0833, "step": 2014, "step_time": 5.6224518829985755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.5625, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.917992651462555, "epoch": 0.010075, "frac_reward_zero_std": 0.0, "grad_norm": 0.008710152469575405, "kl": 0.1055594990029931, "learning_rate": 7.999637565217146e-06, "loss": -0.1003, "num_tokens": 27160350.0, "reward": 0.33333301544189453, "reward_std": 1.197128415107727, "rewards/rollout_reward_func/mean": 0.33333301544189453, "rewards/rollout_reward_func/std": 1.197128415107727, "sampling/importance_sampling_ratio/max": 1.0242105722427368, "sampling/importance_sampling_ratio/mean": 0.5074735879898071, "sampling/importance_sampling_ratio/min": 8.74355635005486e-07, "sampling/sampling_logp_difference/max": 1.9014301300048828, "sampling/sampling_logp_difference/mean": 0.4419543147087097, "step": 2015, "step_time": 16.05689840602281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9155771732330322, "epoch": 0.01008, "grad_norm": 0.008378013037145138, "kl": 0.10526157543063164, "learning_rate": 7.999637198851257e-06, "loss": -0.1003, "step": 2016, "step_time": 7.3118056599923875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.367288613691926, "epoch": 0.010085, "frac_reward_zero_std": 0.0, "grad_norm": 0.1865423023700714, "kl": 0.24543160572648048, "learning_rate": 7.999636832300306e-06, "loss": -0.0873, "num_tokens": 27187650.0, "reward": 0.9388731718063354, "reward_std": 1.3814384937286377, "rewards/rollout_reward_func/mean": 0.9388731718063354, "rewards/rollout_reward_func/std": 1.3814384937286377, "sampling/importance_sampling_ratio/max": 1.0177165269851685, "sampling/importance_sampling_ratio/mean": 0.7287610173225403, "sampling/importance_sampling_ratio/min": 7.201176777016371e-05, "sampling/sampling_logp_difference/max": 1.6407921314239502, "sampling/sampling_logp_difference/mean": 0.279633104801178, "step": 2017, "step_time": 12.244183293005335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.3771695950999856, "epoch": 0.01009, "grad_norm": 0.11901033669710159, "kl": 0.25946881622076035, "learning_rate": 7.999636465564287e-06, "loss": -0.0882, "step": 2018, "step_time": 6.252606167996419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2791321594268084, "epoch": 0.010095, "frac_reward_zero_std": 0.5, "grad_norm": 0.005824780557304621, "kl": 0.1804359257221222, "learning_rate": 7.999636098643205e-06, "loss": -0.0367, "num_tokens": 27212907.0, "reward": 0.7911933660507202, "reward_std": 1.3561452627182007, "rewards/rollout_reward_func/mean": 0.7911933660507202, "rewards/rollout_reward_func/std": 1.3561453819274902, "sampling/importance_sampling_ratio/max": 1.0249590873718262, "sampling/importance_sampling_ratio/mean": 0.6975142955780029, "sampling/importance_sampling_ratio/min": 5.827654240420088e-05, "sampling/sampling_logp_difference/max": 1.9607532024383545, "sampling/sampling_logp_difference/mean": 0.2218884825706482, "step": 2019, "step_time": 15.996202371985419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2811166569590569, "epoch": 0.0101, "grad_norm": 0.005257678218185902, "kl": 0.18063324689865112, "learning_rate": 7.999635731537057e-06, "loss": -0.0368, "step": 2020, "step_time": 6.067340688998229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 5.500000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.605947233736515, "epoch": 0.010105, "frac_reward_zero_std": 0.0, "grad_norm": 0.18766477704048157, "kl": 0.5685914158821106, "learning_rate": 7.999635364245844e-06, "loss": -0.0206, "num_tokens": 27241001.0, "reward": -0.09286844730377197, "reward_std": 1.0295053720474243, "rewards/rollout_reward_func/mean": -0.09286844730377197, "rewards/rollout_reward_func/std": 1.0295053720474243, "sampling/importance_sampling_ratio/max": 1.047816276550293, "sampling/importance_sampling_ratio/mean": 0.6442845463752747, "sampling/importance_sampling_ratio/min": 4.023400470032357e-05, "sampling/sampling_logp_difference/max": 1.8181040287017822, "sampling/sampling_logp_difference/mean": 0.21943990886211395, "step": 2021, "step_time": 17.32814125600271 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 1.6238280236721039, "epoch": 0.01011, "grad_norm": 0.1892159879207611, "kl": 0.5137834884226322, "learning_rate": 7.999634996769565e-06, "loss": -0.022, "step": 2022, "step_time": 7.0993785679893335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3528144136071205, "epoch": 0.010115, "frac_reward_zero_std": 0.0, "grad_norm": 0.08633199334144592, "kl": 0.234460961073637, "learning_rate": 7.999634629108221e-06, "loss": -0.072, "num_tokens": 27272074.0, "reward": 0.6333838701248169, "reward_std": 1.143999695777893, "rewards/rollout_reward_func/mean": 0.6333838701248169, "rewards/rollout_reward_func/std": 1.1439999341964722, "sampling/importance_sampling_ratio/max": 1.069809913635254, "sampling/importance_sampling_ratio/mean": 0.6998066902160645, "sampling/importance_sampling_ratio/min": 1.9753317701542983e-06, "sampling/sampling_logp_difference/max": 1.9023010730743408, "sampling/sampling_logp_difference/mean": 0.3285924196243286, "step": 2023, "step_time": 15.909913487994345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3699028808623552, "epoch": 0.01012, "grad_norm": 0.12338866293430328, "kl": 0.22916167229413986, "learning_rate": 7.999634261261815e-06, "loss": -0.0724, "step": 2024, "step_time": 6.282809407013701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 5.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0881005004048347, "epoch": 0.010125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3216087818145752, "kl": 0.37324296310544014, "learning_rate": 7.999633893230342e-06, "loss": -0.0923, "num_tokens": 27299588.0, "reward": -0.024928361177444458, "reward_std": 1.1686149835586548, "rewards/rollout_reward_func/mean": -0.024928361177444458, "rewards/rollout_reward_func/std": 1.1686149835586548, "sampling/importance_sampling_ratio/max": 1.0255669355392456, "sampling/importance_sampling_ratio/mean": 0.6925466060638428, "sampling/importance_sampling_ratio/min": 3.0732035156688653e-06, "sampling/sampling_logp_difference/max": 1.7874003648757935, "sampling/sampling_logp_difference/mean": 0.39973729848861694, "step": 2025, "step_time": 14.86387075499806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.081950131803751, "epoch": 0.01013, "grad_norm": 0.3187312185764313, "kl": 0.3728157728910446, "learning_rate": 7.999633525013805e-06, "loss": -0.0925, "step": 2026, "step_time": 6.945542011017096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 5.133333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0919116027653217, "epoch": 0.010135, "frac_reward_zero_std": 0.0, "grad_norm": 0.3423958420753479, "kl": 0.276270367205143, "learning_rate": 7.999633156612202e-06, "loss": -0.0889, "num_tokens": 27318456.0, "reward": 1.6194472312927246, "reward_std": 0.9818475246429443, "rewards/rollout_reward_func/mean": 1.6194472312927246, "rewards/rollout_reward_func/std": 0.9818475842475891, "sampling/importance_sampling_ratio/max": 1.075437068939209, "sampling/importance_sampling_ratio/mean": 0.8445464372634888, "sampling/importance_sampling_ratio/min": 2.3169842222614534e-07, "sampling/sampling_logp_difference/max": 1.880323052406311, "sampling/sampling_logp_difference/mean": 0.2890753746032715, "step": 2027, "step_time": 10.123591920986655 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.0633195415139198, "epoch": 0.01014, "grad_norm": 0.12112101912498474, "kl": 0.28884419798851013, "learning_rate": 7.999632788025535e-06, "loss": -0.0914, "step": 2028, "step_time": 5.169016258980264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.9375, "completions/mean_terminated_length": 9.40000057220459, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.348798096179962, "epoch": 0.010145, "frac_reward_zero_std": 0.0, "grad_norm": 0.5189916491508484, "kl": 0.14966512797400355, "learning_rate": 7.999632419253804e-06, "loss": -0.0354, "num_tokens": 27351518.0, "reward": -0.5172141790390015, "reward_std": 0.8607239723205566, "rewards/rollout_reward_func/mean": -0.5172141790390015, "rewards/rollout_reward_func/std": 0.8607239723205566, "sampling/importance_sampling_ratio/max": 0.7019119262695312, "sampling/importance_sampling_ratio/mean": 0.056626398116350174, "sampling/importance_sampling_ratio/min": 3.785229637287557e-05, "sampling/sampling_logp_difference/max": 1.9754483699798584, "sampling/sampling_logp_difference/mean": 0.4335261583328247, "step": 2029, "step_time": 20.520160180982202 }, { "clip_ratio/high_max": 0.03750000149011612, "clip_ratio/high_mean": 0.01875000074505806, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01875000074505806, "entropy": 3.3319199085235596, "epoch": 0.01015, "grad_norm": 0.11757352203130722, "kl": 0.14956212602555752, "learning_rate": 7.999632050297006e-06, "loss": -0.0377, "step": 2030, "step_time": 7.003516818993376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6867158114910126, "epoch": 0.010155, "frac_reward_zero_std": 0.0, "grad_norm": 0.04169278219342232, "kl": 0.1976894773542881, "learning_rate": 7.999631681155145e-06, "loss": -0.0942, "num_tokens": 27375809.0, "reward": 1.096759557723999, "reward_std": 1.2624640464782715, "rewards/rollout_reward_func/mean": 1.096759557723999, "rewards/rollout_reward_func/std": 1.2624640464782715, "sampling/importance_sampling_ratio/max": 1.057288646697998, "sampling/importance_sampling_ratio/mean": 0.7643686532974243, "sampling/importance_sampling_ratio/min": 1.3480615734806634e-08, "sampling/sampling_logp_difference/max": 2.319810390472412, "sampling/sampling_logp_difference/mean": 0.3238687217235565, "step": 2031, "step_time": 15.186275934014702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6840541362762451, "epoch": 0.01016, "grad_norm": 0.040274955332279205, "kl": 0.19695395976305008, "learning_rate": 7.999631311828217e-06, "loss": -0.0943, "step": 2032, "step_time": 6.409773943974869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 4.727272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2088285386562347, "epoch": 0.010165, "frac_reward_zero_std": 0.0, "grad_norm": 0.04478410258889198, "kl": 0.22102505341172218, "learning_rate": 7.999630942316226e-06, "loss": -0.1004, "num_tokens": 27410083.0, "reward": 0.688590407371521, "reward_std": 1.177897572517395, "rewards/rollout_reward_func/mean": 0.688590407371521, "rewards/rollout_reward_func/std": 1.177897572517395, "sampling/importance_sampling_ratio/max": 1.0570721626281738, "sampling/importance_sampling_ratio/mean": 0.5655986666679382, "sampling/importance_sampling_ratio/min": 1.9918337784474716e-05, "sampling/sampling_logp_difference/max": 1.593571424484253, "sampling/sampling_logp_difference/mean": 0.38648155331611633, "step": 2033, "step_time": 18.950641375995474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2014488726854324, "epoch": 0.01017, "grad_norm": 0.04239055886864662, "kl": 0.2183178160339594, "learning_rate": 7.999630572619171e-06, "loss": -0.1005, "step": 2034, "step_time": 8.616544189004344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.274663746356964, "epoch": 0.010175, "frac_reward_zero_std": 0.0, "grad_norm": 0.11353640258312225, "kl": 0.22263218462467194, "learning_rate": 7.99963020273705e-06, "loss": -0.053, "num_tokens": 27442412.0, "reward": 0.16604971885681152, "reward_std": 1.293393850326538, "rewards/rollout_reward_func/mean": 0.16604971885681152, "rewards/rollout_reward_func/std": 1.2933939695358276, "sampling/importance_sampling_ratio/max": 1.2864142656326294, "sampling/importance_sampling_ratio/mean": 0.5884131789207458, "sampling/importance_sampling_ratio/min": 6.979436875553802e-06, "sampling/sampling_logp_difference/max": 1.734560489654541, "sampling/sampling_logp_difference/mean": 0.3588055968284607, "step": 2035, "step_time": 16.146148875996005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.258184462785721, "epoch": 0.01018, "grad_norm": 0.10933618992567062, "kl": 0.2245282344520092, "learning_rate": 7.999629832669865e-06, "loss": -0.0531, "step": 2036, "step_time": 6.809713415976148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 4.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8751869462430477, "epoch": 0.010185, "frac_reward_zero_std": 0.0, "grad_norm": 0.27975019812583923, "kl": 0.29544298350811005, "learning_rate": 7.999629462417615e-06, "loss": -0.0977, "num_tokens": 27470868.0, "reward": -0.1961759328842163, "reward_std": 1.0544246435165405, "rewards/rollout_reward_func/mean": -0.1961759328842163, "rewards/rollout_reward_func/std": 1.0544246435165405, "sampling/importance_sampling_ratio/max": 1.2957412004470825, "sampling/importance_sampling_ratio/mean": 0.7088513374328613, "sampling/importance_sampling_ratio/min": 1.918490255548022e-08, "sampling/sampling_logp_difference/max": 2.0919952392578125, "sampling/sampling_logp_difference/mean": 0.43616539239883423, "step": 2037, "step_time": 18.622616666019894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.871911022812128, "epoch": 0.01019, "grad_norm": 0.08991090208292007, "kl": 0.3113831914961338, "learning_rate": 7.9996290919803e-06, "loss": -0.0979, "step": 2038, "step_time": 8.231389737993595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4871392846107483, "epoch": 0.010195, "frac_reward_zero_std": 0.5, "grad_norm": 0.05496927350759506, "kl": 0.2443631924688816, "learning_rate": 7.99962872135792e-06, "loss": -0.029, "num_tokens": 27495945.0, "reward": 0.6596056222915649, "reward_std": 1.1560204029083252, "rewards/rollout_reward_func/mean": 0.6596056222915649, "rewards/rollout_reward_func/std": 1.1560204029083252, "sampling/importance_sampling_ratio/max": 1.0303761959075928, "sampling/importance_sampling_ratio/mean": 0.6384891867637634, "sampling/importance_sampling_ratio/min": 4.6721220314793754e-06, "sampling/sampling_logp_difference/max": 2.4514551162719727, "sampling/sampling_logp_difference/mean": 0.23385468125343323, "step": 2039, "step_time": 13.355849776999094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4720997512340546, "epoch": 0.0102, "grad_norm": 0.056478358805179596, "kl": 0.24725273996591568, "learning_rate": 7.999628350550478e-06, "loss": -0.029, "step": 2040, "step_time": 5.910256798029877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 5.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.853076010942459, "epoch": 0.010205, "frac_reward_zero_std": 0.0, "grad_norm": 0.173702672123909, "kl": 0.38547978922724724, "learning_rate": 7.99962797955797e-06, "loss": -0.0483, "num_tokens": 27525059.0, "reward": 1.017207145690918, "reward_std": 1.1598759889602661, "rewards/rollout_reward_func/mean": 1.017207145690918, "rewards/rollout_reward_func/std": 1.1598759889602661, "sampling/importance_sampling_ratio/max": 1.0766053199768066, "sampling/importance_sampling_ratio/mean": 0.6662771701812744, "sampling/importance_sampling_ratio/min": 1.6272920220217202e-06, "sampling/sampling_logp_difference/max": 2.0357789993286133, "sampling/sampling_logp_difference/mean": 0.2929690182209015, "step": 2041, "step_time": 13.398680963000515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8439931571483612, "epoch": 0.01021, "grad_norm": 0.14900575578212738, "kl": 0.29361360520124435, "learning_rate": 7.999627608380397e-06, "loss": -0.049, "step": 2042, "step_time": 6.349187772008008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1018485901877284, "epoch": 0.010215, "frac_reward_zero_std": 0.0, "grad_norm": 0.02551746927201748, "kl": 0.21153846010565758, "learning_rate": 7.99962723701776e-06, "loss": -0.0858, "num_tokens": 27548995.0, "reward": 0.23261219263076782, "reward_std": 1.3438336849212646, "rewards/rollout_reward_func/mean": 0.23261219263076782, "rewards/rollout_reward_func/std": 1.3438336849212646, "sampling/importance_sampling_ratio/max": 1.0212634801864624, "sampling/importance_sampling_ratio/mean": 0.8249151706695557, "sampling/importance_sampling_ratio/min": 1.7078295400096977e-07, "sampling/sampling_logp_difference/max": 1.7843800783157349, "sampling/sampling_logp_difference/mean": 0.24018684029579163, "step": 2043, "step_time": 13.430699753997033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1045070830732584, "epoch": 0.01022, "grad_norm": 0.02855520509183407, "kl": 0.21022790670394897, "learning_rate": 7.999626865470058e-06, "loss": -0.0857, "step": 2044, "step_time": 6.918479398009367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.081620367243886, "epoch": 0.010225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003620992647483945, "kl": 0.27694781869649887, "learning_rate": 7.999626493737292e-06, "loss": 0.0007, "num_tokens": 27566199.0, "reward": 1.751725435256958, "reward_std": 0.1530284583568573, "rewards/rollout_reward_func/mean": 1.751725435256958, "rewards/rollout_reward_func/std": 0.1530284583568573, "sampling/importance_sampling_ratio/max": 1.0252058506011963, "sampling/importance_sampling_ratio/mean": 1.0121197700500488, "sampling/importance_sampling_ratio/min": 1.0014536380767822, "sampling/sampling_logp_difference/max": 0.02287454903125763, "sampling/sampling_logp_difference/mean": 0.003105913056060672, "step": 2045, "step_time": 6.114952294010436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08074096590280533, "epoch": 0.01023, "grad_norm": 0.0003647575213108212, "kl": 0.2769903838634491, "learning_rate": 7.999626121819462e-06, "loss": 0.0007, "step": 2046, "step_time": 3.9149732679943554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4746064692735672, "epoch": 0.010235, "frac_reward_zero_std": 0.0, "grad_norm": 0.27163538336753845, "kl": 0.24764010310173035, "learning_rate": 7.999625749716567e-06, "loss": -0.0876, "num_tokens": 27604562.0, "reward": 0.8692636489868164, "reward_std": 1.0457775592803955, "rewards/rollout_reward_func/mean": 0.8692636489868164, "rewards/rollout_reward_func/std": 1.0457775592803955, "sampling/importance_sampling_ratio/max": 1.1677122116088867, "sampling/importance_sampling_ratio/mean": 0.6776871681213379, "sampling/importance_sampling_ratio/min": 0.000160194409545511, "sampling/sampling_logp_difference/max": 1.5404243469238281, "sampling/sampling_logp_difference/mean": 0.2406320571899414, "step": 2047, "step_time": 16.17173804201593 }, { "clip_ratio/high_max": 0.07629870437085629, "clip_ratio/high_mean": 0.03814935218542814, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03814935218542814, "entropy": 1.4691183790564537, "epoch": 0.01024, "grad_norm": 0.1510109305381775, "kl": 0.2521835081279278, "learning_rate": 7.999625377428607e-06, "loss": -0.0891, "step": 2048, "step_time": 7.108238723987597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 5.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9820051901042461, "epoch": 0.010245, "frac_reward_zero_std": 0.0, "grad_norm": 0.4573865532875061, "kl": 0.26413819193840027, "learning_rate": 7.999625004955584e-06, "loss": -0.0118, "num_tokens": 27633836.0, "reward": 0.34442636370658875, "reward_std": 1.2980645895004272, "rewards/rollout_reward_func/mean": 0.34442636370658875, "rewards/rollout_reward_func/std": 1.2980645895004272, "sampling/importance_sampling_ratio/max": 1.0174309015274048, "sampling/importance_sampling_ratio/mean": 0.8070693612098694, "sampling/importance_sampling_ratio/min": 1.018702005239902e-05, "sampling/sampling_logp_difference/max": 2.3101749420166016, "sampling/sampling_logp_difference/mean": 0.17577318847179413, "step": 2049, "step_time": 12.663219122987357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.034090910106897354, "clip_ratio/low_min": 0.022727273404598236, "clip_ratio/region_mean": 0.034090910106897354, "entropy": 1.020973764359951, "epoch": 0.01025, "grad_norm": 0.2789141833782196, "kl": 0.2822794020175934, "learning_rate": 7.999624632297496e-06, "loss": -0.0143, "step": 2050, "step_time": 6.822766188983223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.130188897252083, "epoch": 0.010255, "frac_reward_zero_std": 0.0, "grad_norm": 0.029302259907126427, "kl": 0.21920793503522873, "learning_rate": 7.999624259454343e-06, "loss": -0.0735, "num_tokens": 27660015.0, "reward": 0.6206544041633606, "reward_std": 1.1887965202331543, "rewards/rollout_reward_func/mean": 0.6206544041633606, "rewards/rollout_reward_func/std": 1.1887966394424438, "sampling/importance_sampling_ratio/max": 1.0175294876098633, "sampling/importance_sampling_ratio/mean": 0.5357536673545837, "sampling/importance_sampling_ratio/min": 3.551098473053571e-07, "sampling/sampling_logp_difference/max": 1.7930785417556763, "sampling/sampling_logp_difference/mean": 0.3765804171562195, "step": 2051, "step_time": 14.161101893012528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017613636795431376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017613636795431376, "entropy": 2.1989458799362183, "epoch": 0.01026, "grad_norm": 0.024636488407850266, "kl": 0.2610999085009098, "learning_rate": 7.999623886426127e-06, "loss": -0.0736, "step": 2052, "step_time": 6.4871336779906414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.319010704755783, "epoch": 0.010265, "frac_reward_zero_std": 0.0, "grad_norm": 0.23162712156772614, "kl": 0.7035419307649136, "learning_rate": 7.999623513212846e-06, "loss": -0.0689, "num_tokens": 27688615.0, "reward": 1.0592303276062012, "reward_std": 1.1514739990234375, "rewards/rollout_reward_func/mean": 1.0592303276062012, "rewards/rollout_reward_func/std": 1.1514739990234375, "sampling/importance_sampling_ratio/max": 1.0720512866973877, "sampling/importance_sampling_ratio/mean": 0.6837140321731567, "sampling/importance_sampling_ratio/min": 1.1239328159717843e-05, "sampling/sampling_logp_difference/max": 2.018798828125, "sampling/sampling_logp_difference/mean": 0.26377594470977783, "step": 2053, "step_time": 16.721692322025774 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.01785714365541935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02678571455180645, "entropy": 1.3540110364556313, "epoch": 0.01027, "grad_norm": 0.10680385679006577, "kl": 0.6612467840313911, "learning_rate": 7.9996231398145e-06, "loss": -0.0709, "step": 2054, "step_time": 7.255590397006017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.625, "completions/mean_terminated_length": 4.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.39999587647616863, "epoch": 0.010275, "frac_reward_zero_std": 0.5, "grad_norm": 0.3065043091773987, "kl": 0.2695881873369217, "learning_rate": 7.999622766231092e-06, "loss": -0.0093, "num_tokens": 27712716.0, "reward": 1.0585434436798096, "reward_std": 1.2219024896621704, "rewards/rollout_reward_func/mean": 1.0585434436798096, "rewards/rollout_reward_func/std": 1.22190260887146, "sampling/importance_sampling_ratio/max": 1.0186470746994019, "sampling/importance_sampling_ratio/mean": 0.8899887204170227, "sampling/importance_sampling_ratio/min": 0.026336941868066788, "sampling/sampling_logp_difference/max": 1.6205010414123535, "sampling/sampling_logp_difference/mean": 0.061419978737831116, "step": 2055, "step_time": 11.94751901399286 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.057291666977107525, "entropy": 0.41299193538725376, "epoch": 0.01028, "grad_norm": 0.032794494181871414, "kl": 0.25730181485414505, "learning_rate": 7.999622392462618e-06, "loss": -0.011, "step": 2056, "step_time": 6.262298453992116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.752302199602127, "epoch": 0.010285, "frac_reward_zero_std": 0.0, "grad_norm": 0.09838440269231796, "kl": 0.14295385032892227, "learning_rate": 7.99962201850908e-06, "loss": -0.0961, "num_tokens": 27742280.0, "reward": 1.0927700996398926, "reward_std": 1.2609163522720337, "rewards/rollout_reward_func/mean": 1.0927700996398926, "rewards/rollout_reward_func/std": 1.2609163522720337, "sampling/importance_sampling_ratio/max": 1.0139875411987305, "sampling/importance_sampling_ratio/mean": 0.6691504716873169, "sampling/importance_sampling_ratio/min": 4.797495876118774e-06, "sampling/sampling_logp_difference/max": 2.1282577514648438, "sampling/sampling_logp_difference/mean": 0.28031477332115173, "step": 2057, "step_time": 16.58039329299936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7568953335285187, "epoch": 0.01029, "grad_norm": 0.11394774913787842, "kl": 0.14124585315585136, "learning_rate": 7.999621644370478e-06, "loss": -0.0957, "step": 2058, "step_time": 6.891267267012154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9328110218048096, "epoch": 0.010295, "frac_reward_zero_std": 0.5, "grad_norm": 0.05013418197631836, "kl": 0.19628233276307583, "learning_rate": 7.999621270046812e-06, "loss": -0.023, "num_tokens": 27768804.0, "reward": 0.8882172703742981, "reward_std": 1.3887934684753418, "rewards/rollout_reward_func/mean": 0.8882172703742981, "rewards/rollout_reward_func/std": 1.388793706893921, "sampling/importance_sampling_ratio/max": 1.0126270055770874, "sampling/importance_sampling_ratio/mean": 0.5877665877342224, "sampling/importance_sampling_ratio/min": 3.1249264793586917e-06, "sampling/sampling_logp_difference/max": 2.5903544425964355, "sampling/sampling_logp_difference/mean": 0.27303430438041687, "step": 2059, "step_time": 16.828148918997613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9431272745132446, "epoch": 0.0103, "grad_norm": 0.04904012009501457, "kl": 0.19420571438968182, "learning_rate": 7.999620895538082e-06, "loss": -0.0231, "step": 2060, "step_time": 6.9529771210072795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.733333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4332362487912178, "epoch": 0.010305, "frac_reward_zero_std": 0.0, "grad_norm": 0.18628714978694916, "kl": 1.0084465183317661, "learning_rate": 7.999620520844288e-06, "loss": -0.0478, "num_tokens": 27787269.0, "reward": 0.3521309494972229, "reward_std": 1.4938862323760986, "rewards/rollout_reward_func/mean": 0.3521309494972229, "rewards/rollout_reward_func/std": 1.493886113166809, "sampling/importance_sampling_ratio/max": 1.0058488845825195, "sampling/importance_sampling_ratio/mean": 0.6933152079582214, "sampling/importance_sampling_ratio/min": 3.35978256771341e-05, "sampling/sampling_logp_difference/max": 2.0881829261779785, "sampling/sampling_logp_difference/mean": 0.26569706201553345, "step": 2061, "step_time": 6.426947374988231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017613636795431376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017613636795431376, "entropy": 1.4453990310430527, "epoch": 0.01031, "grad_norm": 0.1886408030986786, "kl": 1.000132530927658, "learning_rate": 7.99962014596543e-06, "loss": -0.0492, "step": 2062, "step_time": 3.359129047006718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 5.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4361969605088234, "epoch": 0.010315, "frac_reward_zero_std": 0.0, "grad_norm": 0.19140715897083282, "kl": 0.6330986768007278, "learning_rate": 7.999619770901508e-06, "loss": -0.0473, "num_tokens": 27817374.0, "reward": 0.842769980430603, "reward_std": 1.013716220855713, "rewards/rollout_reward_func/mean": 0.842769980430603, "rewards/rollout_reward_func/std": 1.0137163400650024, "sampling/importance_sampling_ratio/max": 1.0571179389953613, "sampling/importance_sampling_ratio/mean": 0.5494910478591919, "sampling/importance_sampling_ratio/min": 0.0009777754312381148, "sampling/sampling_logp_difference/max": 1.6397407054901123, "sampling/sampling_logp_difference/mean": 0.25507503747940063, "step": 2063, "step_time": 14.975720379996346 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.005434782709926367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021059782709926367, "entropy": 1.4369132444262505, "epoch": 0.01032, "grad_norm": 0.08949834108352661, "kl": 0.640645757317543, "learning_rate": 7.999619395652523e-06, "loss": -0.0479, "step": 2064, "step_time": 6.8499027339858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.1875, "completions/mean_terminated_length": 5.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.129182904958725, "epoch": 0.010325, "frac_reward_zero_std": 0.0, "grad_norm": 0.20000503957271576, "kl": 0.23486948385834694, "learning_rate": 7.999619020218472e-06, "loss": -0.0458, "num_tokens": 27851812.0, "reward": -0.18672658503055573, "reward_std": 1.0696871280670166, "rewards/rollout_reward_func/mean": -0.18672658503055573, "rewards/rollout_reward_func/std": 1.0696872472763062, "sampling/importance_sampling_ratio/max": 1.0421704053878784, "sampling/importance_sampling_ratio/mean": 0.38656237721443176, "sampling/importance_sampling_ratio/min": 2.5123413766037217e-11, "sampling/sampling_logp_difference/max": 2.423271417617798, "sampling/sampling_logp_difference/mean": 0.5635271072387695, "step": 2065, "step_time": 17.677652269005193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1329326033592224, "epoch": 0.01033, "grad_norm": 0.2034543752670288, "kl": 0.23492345586419106, "learning_rate": 7.999618644599357e-06, "loss": -0.0458, "step": 2066, "step_time": 7.048111304990016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 6.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8618580996990204, "epoch": 0.010335, "frac_reward_zero_std": 0.0, "grad_norm": 0.13593220710754395, "kl": 0.23088070191442966, "learning_rate": 7.999618268795179e-06, "loss": -0.0592, "num_tokens": 27880107.0, "reward": -0.4728662073612213, "reward_std": 0.9568214416503906, "rewards/rollout_reward_func/mean": -0.4728662073612213, "rewards/rollout_reward_func/std": 0.9568215608596802, "sampling/importance_sampling_ratio/max": 1.0468934774398804, "sampling/importance_sampling_ratio/mean": 0.31698736548423767, "sampling/importance_sampling_ratio/min": 3.027433592706075e-08, "sampling/sampling_logp_difference/max": 2.4530656337738037, "sampling/sampling_logp_difference/mean": 0.4943033754825592, "step": 2067, "step_time": 15.209933858990553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8581965267658234, "epoch": 0.01034, "grad_norm": 0.13992738723754883, "kl": 0.2298563588410616, "learning_rate": 7.999617892805937e-06, "loss": -0.0592, "step": 2068, "step_time": 6.2709260169940535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 5.363636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0855613946914673, "epoch": 0.010345, "frac_reward_zero_std": 0.0, "grad_norm": 0.16189423203468323, "kl": 0.1712217852473259, "learning_rate": 7.999617516631633e-06, "loss": -0.0908, "num_tokens": 27908260.0, "reward": 0.6007108688354492, "reward_std": 1.2584917545318604, "rewards/rollout_reward_func/mean": 0.6007108688354492, "rewards/rollout_reward_func/std": 1.2584917545318604, "sampling/importance_sampling_ratio/max": 1.0551300048828125, "sampling/importance_sampling_ratio/mean": 0.5947785973548889, "sampling/importance_sampling_ratio/min": 1.4968194591347128e-05, "sampling/sampling_logp_difference/max": 1.7774683237075806, "sampling/sampling_logp_difference/mean": 0.30674102902412415, "step": 2069, "step_time": 18.142228425000212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.082814246416092, "epoch": 0.01035, "grad_norm": 0.1507268100976944, "kl": 0.17066417448222637, "learning_rate": 7.999617140272261e-06, "loss": -0.0913, "step": 2070, "step_time": 6.929118389001815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.9166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.250056654214859, "epoch": 0.010355, "frac_reward_zero_std": 0.0, "grad_norm": 0.07679012417793274, "kl": 0.2720710523426533, "learning_rate": 7.999616763727827e-06, "loss": -0.065, "num_tokens": 27939486.0, "reward": 0.42443329095840454, "reward_std": 1.3652536869049072, "rewards/rollout_reward_func/mean": 0.42443329095840454, "rewards/rollout_reward_func/std": 1.3652536869049072, "sampling/importance_sampling_ratio/max": 1.055511236190796, "sampling/importance_sampling_ratio/mean": 0.5346593856811523, "sampling/importance_sampling_ratio/min": 1.1417758827292346e-07, "sampling/sampling_logp_difference/max": 2.271653175354004, "sampling/sampling_logp_difference/mean": 0.3492225408554077, "step": 2071, "step_time": 15.739239595990512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.22813618183136, "epoch": 0.01036, "grad_norm": 0.06267759948968887, "kl": 0.26923519745469093, "learning_rate": 7.99961638699833e-06, "loss": -0.0655, "step": 2072, "step_time": 6.898218913003802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4878270924091339, "epoch": 0.010365, "frac_reward_zero_std": 0.5, "grad_norm": 0.21955034136772156, "kl": 0.21575549617409706, "learning_rate": 7.999616010083769e-06, "loss": -0.0317, "num_tokens": 27962583.0, "reward": 0.4439062476158142, "reward_std": 1.1848993301391602, "rewards/rollout_reward_func/mean": 0.4439062476158142, "rewards/rollout_reward_func/std": 1.1848993301391602, "sampling/importance_sampling_ratio/max": 1.0215373039245605, "sampling/importance_sampling_ratio/mean": 0.9284560680389404, "sampling/importance_sampling_ratio/min": 0.0008221825701184571, "sampling/sampling_logp_difference/max": 1.3542919158935547, "sampling/sampling_logp_difference/mean": 0.08914010971784592, "step": 2073, "step_time": 12.64838861800672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4991333242505789, "epoch": 0.01037, "grad_norm": 0.2544284760951996, "kl": 0.2151993364095688, "learning_rate": 7.999615632984143e-06, "loss": -0.0322, "step": 2074, "step_time": 5.9844522869971115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2616597414016724, "epoch": 0.010375, "frac_reward_zero_std": 0.5, "grad_norm": 0.027400977909564972, "kl": 0.18887123093008995, "learning_rate": 7.999615255699455e-06, "loss": -0.0493, "num_tokens": 27988284.0, "reward": 0.740524172782898, "reward_std": 1.2937273979187012, "rewards/rollout_reward_func/mean": 0.740524172782898, "rewards/rollout_reward_func/std": 1.2937275171279907, "sampling/importance_sampling_ratio/max": 1.0473722219467163, "sampling/importance_sampling_ratio/mean": 0.81036376953125, "sampling/importance_sampling_ratio/min": 1.2109598173992708e-05, "sampling/sampling_logp_difference/max": 1.8040732145309448, "sampling/sampling_logp_difference/mean": 0.1756923347711563, "step": 2075, "step_time": 14.786424512014491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.290751501917839, "epoch": 0.01038, "grad_norm": 0.030691159889101982, "kl": 0.18800029531121254, "learning_rate": 7.999614878229703e-06, "loss": -0.0494, "step": 2076, "step_time": 6.820036987002823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 5.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.372219517827034, "epoch": 0.010385, "frac_reward_zero_std": 0.0, "grad_norm": 0.18661987781524658, "kl": 0.7862659506499767, "learning_rate": 7.999614500574885e-06, "loss": -0.0686, "num_tokens": 28020477.0, "reward": -0.18092653155326843, "reward_std": 0.7261516451835632, "rewards/rollout_reward_func/mean": -0.18092653155326843, "rewards/rollout_reward_func/std": 0.726151704788208, "sampling/importance_sampling_ratio/max": 1.055160403251648, "sampling/importance_sampling_ratio/mean": 0.5355631709098816, "sampling/importance_sampling_ratio/min": 1.477739743904749e-07, "sampling/sampling_logp_difference/max": 1.9614872932434082, "sampling/sampling_logp_difference/mean": 0.4475361704826355, "step": 2077, "step_time": 16.472511999993003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.374400317668915, "epoch": 0.01039, "grad_norm": 0.17404115200042725, "kl": 0.766303576529026, "learning_rate": 7.999614122735007e-06, "loss": -0.0682, "step": 2078, "step_time": 6.914181587999337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5588503777980804, "epoch": 0.010395, "frac_reward_zero_std": 0.0, "grad_norm": 0.5244823098182678, "kl": 0.28220806270837784, "learning_rate": 7.999613744710062e-06, "loss": -0.0642, "num_tokens": 28049429.0, "reward": 0.9293780326843262, "reward_std": 1.261358618736267, "rewards/rollout_reward_func/mean": 0.9293780326843262, "rewards/rollout_reward_func/std": 1.261358618736267, "sampling/importance_sampling_ratio/max": 1.0197089910507202, "sampling/importance_sampling_ratio/mean": 0.7209632396697998, "sampling/importance_sampling_ratio/min": 5.497686743183294e-06, "sampling/sampling_logp_difference/max": 1.8554378747940063, "sampling/sampling_logp_difference/mean": 0.21583563089370728, "step": 2079, "step_time": 15.127547735974076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 1.603232353925705, "epoch": 0.0104, "grad_norm": 0.05382005125284195, "kl": 0.2807326540350914, "learning_rate": 7.999613366500054e-06, "loss": -0.0669, "step": 2080, "step_time": 7.363762629000121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.9375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.38873238302767277, "epoch": 0.010405, "frac_reward_zero_std": 0.5, "grad_norm": 0.023251308128237724, "kl": 0.27796709164977074, "learning_rate": 7.999612988104984e-06, "loss": -0.0374, "num_tokens": 28071445.0, "reward": 1.7895119190216064, "reward_std": 0.6094258427619934, "rewards/rollout_reward_func/mean": 1.7895119190216064, "rewards/rollout_reward_func/std": 0.6094259023666382, "sampling/importance_sampling_ratio/max": 1.036804437637329, "sampling/importance_sampling_ratio/mean": 0.9161384105682373, "sampling/importance_sampling_ratio/min": 0.0021642534993588924, "sampling/sampling_logp_difference/max": 1.0842539072036743, "sampling/sampling_logp_difference/mean": 0.07750333845615387, "step": 2081, "step_time": 11.881069562979974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39113436080515385, "epoch": 0.01041, "grad_norm": 0.02348555065691471, "kl": 0.268818449229002, "learning_rate": 7.99961260952485e-06, "loss": -0.0375, "step": 2082, "step_time": 5.772123866001493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1544828191399574, "epoch": 0.010415, "frac_reward_zero_std": 0.0, "grad_norm": 0.08411722630262375, "kl": 0.27040527015924454, "learning_rate": 7.999612230759652e-06, "loss": -0.0966, "num_tokens": 28100180.0, "reward": 1.112231731414795, "reward_std": 1.2579108476638794, "rewards/rollout_reward_func/mean": 1.112231731414795, "rewards/rollout_reward_func/std": 1.2579108476638794, "sampling/importance_sampling_ratio/max": 1.029089093208313, "sampling/importance_sampling_ratio/mean": 0.7599185705184937, "sampling/importance_sampling_ratio/min": 1.4229565749701578e-05, "sampling/sampling_logp_difference/max": 1.7852482795715332, "sampling/sampling_logp_difference/mean": 0.25227922201156616, "step": 2083, "step_time": 13.590813012007857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1713418662548065, "epoch": 0.01042, "grad_norm": 0.09300531446933746, "kl": 0.26671571284532547, "learning_rate": 7.99961185180939e-06, "loss": -0.0964, "step": 2084, "step_time": 6.080398179998156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.827682912349701, "epoch": 0.010425, "frac_reward_zero_std": 0.0, "grad_norm": 0.03940047696232796, "kl": 0.14382084272801876, "learning_rate": 7.999611472674065e-06, "loss": -0.0801, "num_tokens": 28131235.0, "reward": 0.42119061946868896, "reward_std": 1.3454540967941284, "rewards/rollout_reward_func/mean": 0.42119061946868896, "rewards/rollout_reward_func/std": 1.3454539775848389, "sampling/importance_sampling_ratio/max": 1.0497469902038574, "sampling/importance_sampling_ratio/mean": 0.44510799646377563, "sampling/importance_sampling_ratio/min": 3.737310461104926e-09, "sampling/sampling_logp_difference/max": 1.9787929058074951, "sampling/sampling_logp_difference/mean": 0.4939696490764618, "step": 2085, "step_time": 17.7573630400002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8326918482780457, "epoch": 0.01043, "grad_norm": 0.04304240271449089, "kl": 0.1413552388548851, "learning_rate": 7.999611093353675e-06, "loss": -0.0801, "step": 2086, "step_time": 7.282208760996582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 5.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.621145486831665, "epoch": 0.010435, "frac_reward_zero_std": 0.0, "grad_norm": 0.09016279131174088, "kl": 0.28334246948361397, "learning_rate": 7.999610713848224e-06, "loss": -0.0237, "num_tokens": 28156503.0, "reward": -0.4127974510192871, "reward_std": 0.9944584369659424, "rewards/rollout_reward_func/mean": -0.4127974510192871, "rewards/rollout_reward_func/std": 0.9944584369659424, "sampling/importance_sampling_ratio/max": 1.042956829071045, "sampling/importance_sampling_ratio/mean": 0.40529510378837585, "sampling/importance_sampling_ratio/min": 2.8058664156560553e-06, "sampling/sampling_logp_difference/max": 1.7101266384124756, "sampling/sampling_logp_difference/mean": 0.44707852602005005, "step": 2087, "step_time": 12.612880626998958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.639878064393997, "epoch": 0.01044, "grad_norm": 0.09187908470630646, "kl": 0.2845311313867569, "learning_rate": 7.999610334157707e-06, "loss": -0.0241, "step": 2088, "step_time": 5.960251644006348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 5.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5787109360098839, "epoch": 0.010445, "frac_reward_zero_std": 0.0, "grad_norm": 0.03462134674191475, "kl": 0.2962592542171478, "learning_rate": 7.99960995428213e-06, "loss": -0.0893, "num_tokens": 28181965.0, "reward": 1.196545124053955, "reward_std": 1.2856941223144531, "rewards/rollout_reward_func/mean": 1.196545124053955, "rewards/rollout_reward_func/std": 1.2856941223144531, "sampling/importance_sampling_ratio/max": 1.0474584102630615, "sampling/importance_sampling_ratio/mean": 0.7117835283279419, "sampling/importance_sampling_ratio/min": 8.883637201506644e-05, "sampling/sampling_logp_difference/max": 2.0735104084014893, "sampling/sampling_logp_difference/mean": 0.28091293573379517, "step": 2089, "step_time": 15.820329057984054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5780225917696953, "epoch": 0.01045, "grad_norm": 0.037839606404304504, "kl": 0.29854006692767143, "learning_rate": 7.999609574221487e-06, "loss": -0.0894, "step": 2090, "step_time": 7.100844636006514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 5.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.2538663744926453, "epoch": 0.010455, "frac_reward_zero_std": 0.0, "grad_norm": 0.11197540909051895, "kl": 0.5068597830832005, "learning_rate": 7.99960919397578e-06, "loss": -0.0805, "num_tokens": 28215771.0, "reward": -0.24472856521606445, "reward_std": 0.9736312031745911, "rewards/rollout_reward_func/mean": -0.24472856521606445, "rewards/rollout_reward_func/std": 0.9736312627792358, "sampling/importance_sampling_ratio/max": 1.1621649265289307, "sampling/importance_sampling_ratio/mean": 0.3454170823097229, "sampling/importance_sampling_ratio/min": 7.209521157847121e-08, "sampling/sampling_logp_difference/max": 2.1289525032043457, "sampling/sampling_logp_difference/mean": 0.5598070025444031, "step": 2091, "step_time": 15.988523278996581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2496700286865234, "epoch": 0.01046, "grad_norm": 0.09484535455703735, "kl": 0.5123320752754807, "learning_rate": 7.999608813545012e-06, "loss": -0.081, "step": 2092, "step_time": 6.954365668017999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7401920557022095, "epoch": 0.010465, "frac_reward_zero_std": 0.0, "grad_norm": 0.061954546719789505, "kl": 0.18787337839603424, "learning_rate": 7.999608432929179e-06, "loss": -0.0443, "num_tokens": 28240519.0, "reward": 0.30766722559928894, "reward_std": 1.4689769744873047, "rewards/rollout_reward_func/mean": 0.30766722559928894, "rewards/rollout_reward_func/std": 1.4689769744873047, "sampling/importance_sampling_ratio/max": 1.037629246711731, "sampling/importance_sampling_ratio/mean": 0.49096524715423584, "sampling/importance_sampling_ratio/min": 6.380466288646858e-07, "sampling/sampling_logp_difference/max": 2.241513967514038, "sampling/sampling_logp_difference/mean": 0.49092429876327515, "step": 2093, "step_time": 16.477744107003673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.73090660572052, "epoch": 0.01047, "grad_norm": 0.0569520965218544, "kl": 0.18427076190710068, "learning_rate": 7.999608052128284e-06, "loss": -0.0444, "step": 2094, "step_time": 6.07789877903997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 6.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.462480813264847, "epoch": 0.010475, "frac_reward_zero_std": 0.0, "grad_norm": 0.0511026568710804, "kl": 0.4403271805495024, "learning_rate": 7.999607671142324e-06, "loss": -0.045, "num_tokens": 28270426.0, "reward": 0.299608051776886, "reward_std": 1.357002854347229, "rewards/rollout_reward_func/mean": 0.299608051776886, "rewards/rollout_reward_func/std": 1.3570029735565186, "sampling/importance_sampling_ratio/max": 1.0557165145874023, "sampling/importance_sampling_ratio/mean": 0.4271189272403717, "sampling/importance_sampling_ratio/min": 4.687240107159596e-07, "sampling/sampling_logp_difference/max": 2.244680404663086, "sampling/sampling_logp_difference/mean": 0.373869925737381, "step": 2095, "step_time": 16.817962894012453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4605919122695923, "epoch": 0.01048, "grad_norm": 0.04487232863903046, "kl": 0.43798472359776497, "learning_rate": 7.999607289971302e-06, "loss": -0.0453, "step": 2096, "step_time": 6.693583650005166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 5.222222328186035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.602547287940979, "epoch": 0.010485, "frac_reward_zero_std": 0.0, "grad_norm": 0.3491950035095215, "kl": 0.14576123654842377, "learning_rate": 7.999606908615216e-06, "loss": -0.0688, "num_tokens": 28305078.0, "reward": -0.4422193169593811, "reward_std": 0.935975968837738, "rewards/rollout_reward_func/mean": -0.4422193169593811, "rewards/rollout_reward_func/std": 0.9359760880470276, "sampling/importance_sampling_ratio/max": 1.0799221992492676, "sampling/importance_sampling_ratio/mean": 0.3880496621131897, "sampling/importance_sampling_ratio/min": 4.186164369457401e-05, "sampling/sampling_logp_difference/max": 1.5279817581176758, "sampling/sampling_logp_difference/mean": 0.3777514100074768, "step": 2097, "step_time": 15.675793654008885 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.005434782709926367, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005434782709926367, "entropy": 2.5966142416000366, "epoch": 0.01049, "grad_norm": 0.021720020100474358, "kl": 0.1403541285544634, "learning_rate": 7.999606527074068e-06, "loss": -0.0704, "step": 2098, "step_time": 6.989068993018009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4926823396235704, "epoch": 0.010495, "frac_reward_zero_std": 0.5, "grad_norm": 0.15100637078285217, "kl": 0.29828327521681786, "learning_rate": 7.999606145347855e-06, "loss": -0.0516, "num_tokens": 28328666.0, "reward": 1.290783405303955, "reward_std": 1.1795021295547485, "rewards/rollout_reward_func/mean": 1.290783405303955, "rewards/rollout_reward_func/std": 1.179502010345459, "sampling/importance_sampling_ratio/max": 1.1054519414901733, "sampling/importance_sampling_ratio/mean": 0.7275000810623169, "sampling/importance_sampling_ratio/min": 1.932336459731232e-07, "sampling/sampling_logp_difference/max": 1.9142879247665405, "sampling/sampling_logp_difference/mean": 0.26913321018218994, "step": 2099, "step_time": 14.272091343984357 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.4908104464411736, "epoch": 0.0105, "grad_norm": 0.08256887644529343, "kl": 0.2992357201874256, "learning_rate": 7.99960576343658e-06, "loss": -0.0517, "step": 2100, "step_time": 7.384515735000605 } ], "logging_steps": 1.0, "max_steps": 400000, "num_input_tokens_seen": 28328666, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }