{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.006, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 2062.46875, "completions/mean_terminated_length": 2062.46875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.1340037016198039, "epoch": 8e-05, "frac_reward_zero_std": 0.625, "grad_norm": 0.9753277897834778, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0097, "num_tokens": 78941.0, "reward": 0.5106250047683716, "reward_std": 0.16591878235340118, "rewards/rollout_reward_func/mean": 0.5106250047683716, "rewards/rollout_reward_func/std": 0.38574549555778503, "sampling/importance_sampling_ratio/max": 1.89468514919281, "sampling/importance_sampling_ratio/mean": 0.917938768863678, "sampling/importance_sampling_ratio/min": 0.26035696268081665, "sampling/sampling_logp_difference/max": 1.035329818725586, "sampling/sampling_logp_difference/mean": 0.020964600145816803, "step": 1, "step_time": 18.408817325000086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2795.0, "completions/max_terminated_length": 2795.0, "completions/mean_length": 2091.09375, "completions/mean_terminated_length": 2091.09375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.1304742144420743, "epoch": 0.00016, "frac_reward_zero_std": 0.625, "grad_norm": 1.355797290802002, "kl": 0.0, "learning_rate": 2.2857142857142855e-07, "loss": -0.0694, "num_tokens": 158774.0, "reward": 0.38593748211860657, "reward_std": 0.15246255695819855, "rewards/rollout_reward_func/mean": 0.38593748211860657, "rewards/rollout_reward_func/std": 0.3016391694545746, "sampling/importance_sampling_ratio/max": 2.747450828552246, "sampling/importance_sampling_ratio/mean": 0.995655357837677, "sampling/importance_sampling_ratio/min": 0.30046260356903076, "sampling/sampling_logp_difference/max": 1.1795392036437988, "sampling/sampling_logp_difference/mean": 0.022426610812544823, "step": 2, "step_time": 17.007036188999905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 1881.75, "completions/mean_terminated_length": 1881.75, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.12374210823327303, "epoch": 0.00024, "frac_reward_zero_std": 0.625, "grad_norm": 0.6671572327613831, "kl": 0.0014705628045703634, "learning_rate": 4.571428571428571e-07, "loss": -0.0235, "num_tokens": 231260.0, "reward": 0.4012500047683716, "reward_std": 0.20683754980564117, "rewards/rollout_reward_func/mean": 0.4012500047683716, "rewards/rollout_reward_func/std": 0.33187392354011536, "sampling/importance_sampling_ratio/max": 1.3709925413131714, "sampling/importance_sampling_ratio/mean": 0.8955257534980774, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9760880470275879, "sampling/sampling_logp_difference/mean": 0.020091338083148003, "step": 3, "step_time": 16.527492177000227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2795.0, "completions/max_terminated_length": 2795.0, "completions/mean_length": 2263.03125, "completions/mean_terminated_length": 2263.03125, "completions/min_length": 1569.0, "completions/min_terminated_length": 1569.0, "entropy": 0.15253359219059348, "epoch": 0.00032, "frac_reward_zero_std": 0.875, "grad_norm": 0.9312232136726379, "kl": 0.0016055026353569701, "learning_rate": 6.857142857142857e-07, "loss": -0.0241, "num_tokens": 316879.0, "reward": 0.3787500262260437, "reward_std": 0.0624999962747097, "rewards/rollout_reward_func/mean": 0.3787500262260437, "rewards/rollout_reward_func/std": 0.26268768310546875, "sampling/importance_sampling_ratio/max": 1.9853609800338745, "sampling/importance_sampling_ratio/mean": 0.9613277912139893, "sampling/importance_sampling_ratio/min": 0.4037262201309204, "sampling/sampling_logp_difference/max": 0.6126779317855835, "sampling/sampling_logp_difference/mean": 0.02291969209909439, "step": 4, "step_time": 18.10109623099993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "completions/clipped_ratio": 0.0, "completions/max_length": 2807.0, "completions/max_terminated_length": 2807.0, "completions/mean_length": 2197.1875, "completions/mean_terminated_length": 2197.1875, "completions/min_length": 1570.0, "completions/min_terminated_length": 1570.0, "entropy": 0.14702600054442883, "epoch": 0.0004, "frac_reward_zero_std": 0.5, "grad_norm": 2.2064177989959717, "kl": 0.002026251120696543, "learning_rate": 9.142857142857142e-07, "loss": -0.0169, "num_tokens": 400370.0, "reward": 0.4140625, "reward_std": 0.15217570960521698, "rewards/rollout_reward_func/mean": 0.4140625, "rewards/rollout_reward_func/std": 0.33838188648223877, "sampling/importance_sampling_ratio/max": 2.348391532897949, "sampling/importance_sampling_ratio/mean": 1.0998704433441162, "sampling/importance_sampling_ratio/min": 0.6395935416221619, "sampling/sampling_logp_difference/max": 0.6357507705688477, "sampling/sampling_logp_difference/mean": 0.020555175840854645, "step": 5, "step_time": 16.967482953000058 }, { "clip_ratio/high_max": 0.009868421126157045, "clip_ratio/high_mean": 0.004934210563078523, "clip_ratio/low_mean": 0.0034829721553251147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008417182718403637, "completions/clipped_ratio": 0.0, "completions/max_length": 2449.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 1818.28125, "completions/mean_terminated_length": 1818.28125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.0911772302351892, "epoch": 0.00048, "frac_reward_zero_std": 0.625, "grad_norm": 0.8283352255821228, "kl": 0.001795254382159328, "learning_rate": 1.1428571428571428e-06, "loss": -0.0209, "num_tokens": 470216.0, "reward": 0.5893750190734863, "reward_std": 0.1562499850988388, "rewards/rollout_reward_func/mean": 0.5893750190734863, "rewards/rollout_reward_func/std": 0.4223737418651581, "sampling/importance_sampling_ratio/max": 1.7877978086471558, "sampling/importance_sampling_ratio/mean": 0.9894595146179199, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9945626258850098, "sampling/sampling_logp_difference/mean": 0.01755390875041485, "step": 6, "step_time": 15.150656265999714 }, { "clip_ratio/high_max": 0.009375000139698386, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.006411405862309039, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012661405955441296, "completions/clipped_ratio": 0.0, "completions/max_length": 2791.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 2104.28125, "completions/mean_terminated_length": 2104.28125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.16090343240648508, "epoch": 0.00056, "frac_reward_zero_std": 0.5, "grad_norm": 1.4131731986999512, "kl": 0.002872211887734011, "learning_rate": 1.3714285714285715e-06, "loss": -0.03, "num_tokens": 550373.0, "reward": 0.29500001668930054, "reward_std": 0.0949999988079071, "rewards/rollout_reward_func/mean": 0.29500001668930054, "rewards/rollout_reward_func/std": 0.1687716543674469, "sampling/importance_sampling_ratio/max": 1.3797976970672607, "sampling/importance_sampling_ratio/mean": 0.9415616989135742, "sampling/importance_sampling_ratio/min": 0.29769256711006165, "sampling/sampling_logp_difference/max": 0.9464168548583984, "sampling/sampling_logp_difference/mean": 0.02406277321279049, "step": 7, "step_time": 16.95326116199999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016447368543595076, "completions/clipped_ratio": 0.0, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 2265.5625, "completions/mean_terminated_length": 2265.5625, "completions/min_length": 1569.0, "completions/min_terminated_length": 1569.0, "entropy": 0.20542557537555695, "epoch": 0.00064, "frac_reward_zero_std": 0.75, "grad_norm": 1.0486057996749878, "kl": 0.00249500987411011, "learning_rate": 1.6e-06, "loss": 0.0307, "num_tokens": 636318.0, "reward": 0.3434374928474426, "reward_std": 0.08029377460479736, "rewards/rollout_reward_func/mean": 0.3434374928474426, "rewards/rollout_reward_func/std": 0.22245851159095764, "sampling/importance_sampling_ratio/max": 2.239882230758667, "sampling/importance_sampling_ratio/mean": 0.9191794395446777, "sampling/importance_sampling_ratio/min": 0.3405879735946655, "sampling/sampling_logp_difference/max": 1.0898922681808472, "sampling/sampling_logp_difference/mean": 0.023122236132621765, "step": 8, "step_time": 18.007336240000086 }, { "clip_ratio/high_max": 0.010620915098115802, "clip_ratio/high_mean": 0.005310457549057901, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069551944034174085, "completions/clipped_ratio": 0.0, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 1646.03125, "completions/mean_terminated_length": 1646.03125, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.13724102126434445, "epoch": 0.00072, "frac_reward_zero_std": 0.25, "grad_norm": 1.481195092201233, "kl": 0.0015346993204730097, "learning_rate": 1.8285714285714284e-06, "loss": -0.0189, "num_tokens": 701005.0, "reward": 0.6278125047683716, "reward_std": 0.30871257185935974, "rewards/rollout_reward_func/mean": 0.6278125047683716, "rewards/rollout_reward_func/std": 0.4518972933292389, "sampling/importance_sampling_ratio/max": 1.9988352060317993, "sampling/importance_sampling_ratio/mean": 0.9468162059783936, "sampling/importance_sampling_ratio/min": 0.4476884603500366, "sampling/sampling_logp_difference/max": 0.773470401763916, "sampling/sampling_logp_difference/mean": 0.02107076346874237, "step": 9, "step_time": 16.103736942000182 }, { "clip_ratio/high_max": 0.008938953513279557, "clip_ratio/high_mean": 0.004469476756639779, "clip_ratio/low_mean": 0.0015243901871144772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005993866943754256, "completions/clipped_ratio": 0.0, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 2226.4375, "completions/mean_terminated_length": 2226.4375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.18680323101580143, "epoch": 0.0008, "frac_reward_zero_std": 0.625, "grad_norm": 2.0065183639526367, "kl": 0.002535051797167398, "learning_rate": 2.057142857142857e-06, "loss": 0.0495, "num_tokens": 785722.0, "reward": 0.4443749785423279, "reward_std": 0.08841878175735474, "rewards/rollout_reward_func/mean": 0.4443749785423279, "rewards/rollout_reward_func/std": 0.35590803623199463, "sampling/importance_sampling_ratio/max": 2.790905714035034, "sampling/importance_sampling_ratio/mean": 0.940368115901947, "sampling/importance_sampling_ratio/min": 0.22617992758750916, "sampling/sampling_logp_difference/max": 0.6851506233215332, "sampling/sampling_logp_difference/mean": 0.025203729048371315, "step": 10, "step_time": 17.030096009999966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0015625000232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015625000232830644, "completions/clipped_ratio": 0.0, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 1966.875, "completions/mean_terminated_length": 1966.875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.1440325272269547, "epoch": 0.00088, "frac_reward_zero_std": 0.625, "grad_norm": 1.0474733114242554, "kl": 0.0011928395033464767, "learning_rate": 2.2857142857142856e-06, "loss": -0.0121, "num_tokens": 861366.0, "reward": 0.4596875011920929, "reward_std": 0.14279377460479736, "rewards/rollout_reward_func/mean": 0.4596875011920929, "rewards/rollout_reward_func/std": 0.38282889127731323, "sampling/importance_sampling_ratio/max": 1.9087510108947754, "sampling/importance_sampling_ratio/mean": 0.9341506361961365, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9133691787719727, "sampling/sampling_logp_difference/mean": 0.02222413383424282, "step": 11, "step_time": 16.685985845999994 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0055147059028968215, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009191176504828036, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 1857.96875, "completions/mean_terminated_length": 1857.96875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.17092999629676342, "epoch": 0.00096, "frac_reward_zero_std": 0.5, "grad_norm": 2.0095906257629395, "kl": 0.0024750066513661295, "learning_rate": 2.5142857142857142e-06, "loss": -0.0228, "num_tokens": 933262.0, "reward": 0.48593753576278687, "reward_std": 0.20529377460479736, "rewards/rollout_reward_func/mean": 0.48593753576278687, "rewards/rollout_reward_func/std": 0.38762184977531433, "sampling/importance_sampling_ratio/max": 1.6060364246368408, "sampling/importance_sampling_ratio/mean": 0.9451028108596802, "sampling/importance_sampling_ratio/min": 0.34411877393722534, "sampling/sampling_logp_difference/max": 1.0912601947784424, "sampling/sampling_logp_difference/mean": 0.019420120865106583, "step": 12, "step_time": 17.06754343600005 }, { "clip_ratio/high_max": 0.007936508161947131, "clip_ratio/high_mean": 0.003968254080973566, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0057043652050197124, "completions/clipped_ratio": 0.0, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 1741.5, "completions/mean_terminated_length": 1741.5, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.11786343855783343, "epoch": 0.00104, "frac_reward_zero_std": 0.5, "grad_norm": 2.0365312099456787, "kl": 0.0034534272163000423, "learning_rate": 2.742857142857143e-06, "loss": -0.0389, "num_tokens": 1001046.0, "reward": 0.6475000381469727, "reward_std": 0.24292194843292236, "rewards/rollout_reward_func/mean": 0.6475000381469727, "rewards/rollout_reward_func/std": 0.4391413629055023, "sampling/importance_sampling_ratio/max": 2.502153158187866, "sampling/importance_sampling_ratio/mean": 1.0042061805725098, "sampling/importance_sampling_ratio/min": 0.4875127375125885, "sampling/sampling_logp_difference/max": 0.6685242652893066, "sampling/sampling_logp_difference/mean": 0.01504062581807375, "step": 13, "step_time": 16.125135786999863 }, { "clip_ratio/high_max": 0.011430230224505067, "clip_ratio/high_mean": 0.0057151151122525334, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007947258069179952, "completions/clipped_ratio": 0.0, "completions/max_length": 2802.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 2010.3125, "completions/mean_terminated_length": 2010.3125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.1693209670484066, "epoch": 0.00112, "frac_reward_zero_std": 0.5, "grad_norm": 1.2513231039047241, "kl": 0.002481764371623285, "learning_rate": 2.9714285714285716e-06, "loss": 0.0309, "num_tokens": 1078101.0, "reward": 0.4753125011920929, "reward_std": 0.19296419620513916, "rewards/rollout_reward_func/mean": 0.4753125011920929, "rewards/rollout_reward_func/std": 0.3761346936225891, "sampling/importance_sampling_ratio/max": 1.5099362134933472, "sampling/importance_sampling_ratio/mean": 0.9022589921951294, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6567137241363525, "sampling/sampling_logp_difference/mean": 0.022564683109521866, "step": 14, "step_time": 16.688260055 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0018382353009656072, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003574346425011754, "completions/clipped_ratio": 0.0, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 1887.71875, "completions/mean_terminated_length": 1887.71875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.14878523536026478, "epoch": 0.0012, "frac_reward_zero_std": 0.5, "grad_norm": 1.092544674873352, "kl": 0.0021166762671782635, "learning_rate": 3.2e-06, "loss": -0.014, "num_tokens": 1150803.0, "reward": 0.4506249725818634, "reward_std": 0.2620203495025635, "rewards/rollout_reward_func/mean": 0.4506249725818634, "rewards/rollout_reward_func/std": 0.3878471255302429, "sampling/importance_sampling_ratio/max": 2.092060089111328, "sampling/importance_sampling_ratio/mean": 1.0061091184616089, "sampling/importance_sampling_ratio/min": 0.519212543964386, "sampling/sampling_logp_difference/max": 0.664109468460083, "sampling/sampling_logp_difference/mean": 0.02180980145931244, "step": 15, "step_time": 16.993085070999996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0016891892300918698, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016891892300918698, "completions/clipped_ratio": 0.0, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 2229.28125, "completions/mean_terminated_length": 2229.28125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.15634657256305218, "epoch": 0.00128, "frac_reward_zero_std": 0.875, "grad_norm": 0.09010659158229828, "kl": 0.005270412558274984, "learning_rate": 3.428571428571428e-06, "loss": 0.005, "num_tokens": 1235464.0, "reward": 0.48374998569488525, "reward_std": 0.0625, "rewards/rollout_reward_func/mean": 0.48374998569488525, "rewards/rollout_reward_func/std": 0.36630454659461975, "sampling/importance_sampling_ratio/max": 1.890110969543457, "sampling/importance_sampling_ratio/mean": 0.9257134199142456, "sampling/importance_sampling_ratio/min": 0.4630853831768036, "sampling/sampling_logp_difference/max": 0.69629967212677, "sampling/sampling_logp_difference/mean": 0.020435180515050888, "step": 16, "step_time": 17.667978367999467 }, { "clip_ratio/high_max": 0.016176471021026373, "clip_ratio/high_mean": 0.009732972481288016, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011377709335647523, "completions/clipped_ratio": 0.0, "completions/max_length": 2449.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 1750.53125, "completions/mean_terminated_length": 1750.53125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.13329231040552258, "epoch": 0.00136, "frac_reward_zero_std": 0.375, "grad_norm": 1.2919148206710815, "kl": 0.0019803230388788506, "learning_rate": 3.657142857142857e-06, "loss": 0.0395, "num_tokens": 1303537.0, "reward": 0.5606250166893005, "reward_std": 0.2351399064064026, "rewards/rollout_reward_func/mean": 0.5606250166893005, "rewards/rollout_reward_func/std": 0.41600972414016724, "sampling/importance_sampling_ratio/max": 1.4119406938552856, "sampling/importance_sampling_ratio/mean": 0.9287126064300537, "sampling/importance_sampling_ratio/min": 0.3827318847179413, "sampling/sampling_logp_difference/max": 0.6636209487915039, "sampling/sampling_logp_difference/mean": 0.017019610852003098, "step": 17, "step_time": 15.075951100999873 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004769736900925636, "completions/clipped_ratio": 0.0, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 2316.9375, "completions/mean_terminated_length": 2316.9375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.16856362204998732, "epoch": 0.00144, "frac_reward_zero_std": 0.625, "grad_norm": 2.867643356323242, "kl": 0.005084036383777857, "learning_rate": 3.885714285714286e-06, "loss": -0.0049, "num_tokens": 1391780.0, "reward": 0.37031251192092896, "reward_std": 0.10187499970197678, "rewards/rollout_reward_func/mean": 0.37031251192092896, "rewards/rollout_reward_func/std": 0.2657124996185303, "sampling/importance_sampling_ratio/max": 1.9782981872558594, "sampling/importance_sampling_ratio/mean": 1.0121815204620361, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48717403411865234, "sampling/sampling_logp_difference/mean": 0.022180214524269104, "step": 18, "step_time": 17.064124244999903 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0015243901871144772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004649390233680606, "completions/clipped_ratio": 0.0, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 2216.0, "completions/mean_terminated_length": 2216.0, "completions/min_length": 1563.0, "completions/min_terminated_length": 1563.0, "entropy": 0.1883529694750905, "epoch": 0.00152, "frac_reward_zero_std": 0.875, "grad_norm": 2.2749135494232178, "kl": 0.005091317143524066, "learning_rate": 4.114285714285714e-06, "loss": 0.0455, "num_tokens": 1475873.0, "reward": 0.4059374928474426, "reward_std": 0.00812500063329935, "rewards/rollout_reward_func/mean": 0.4059374928474426, "rewards/rollout_reward_func/std": 0.29813244938850403, "sampling/importance_sampling_ratio/max": 2.81974196434021, "sampling/importance_sampling_ratio/mean": 1.0345871448516846, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8696746826171875, "sampling/sampling_logp_difference/mean": 0.02671925723552704, "step": 19, "step_time": 17.602816416999985 }, { "clip_ratio/high_max": 0.011488970601931214, "clip_ratio/high_mean": 0.005744485300965607, "clip_ratio/low_mean": 0.0048926768358796835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01063716213684529, "completions/clipped_ratio": 0.0, "completions/max_length": 2819.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 1814.28125, "completions/mean_terminated_length": 1814.28125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.1282729902304709, "epoch": 0.0016, "frac_reward_zero_std": 0.5, "grad_norm": 1.691245198249817, "kl": 0.002560590350185521, "learning_rate": 4.342857142857142e-06, "loss": -0.0259, "num_tokens": 1546081.0, "reward": 0.612500011920929, "reward_std": 0.18216876685619354, "rewards/rollout_reward_func/mean": 0.612500011920929, "rewards/rollout_reward_func/std": 0.43820008635520935, "sampling/importance_sampling_ratio/max": 1.9052116870880127, "sampling/importance_sampling_ratio/mean": 1.007737636566162, "sampling/importance_sampling_ratio/min": 0.5377175211906433, "sampling/sampling_logp_difference/max": 0.6581223011016846, "sampling/sampling_logp_difference/mean": 0.01716558076441288, "step": 20, "step_time": 16.950590521000322 }, { "clip_ratio/high_max": 0.010667945956811309, "clip_ratio/high_mean": 0.005333972978405654, "clip_ratio/low_mean": 0.004784891498275101, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010118864476680756, "completions/clipped_ratio": 0.0, "completions/max_length": 2791.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 1928.25, "completions/mean_terminated_length": 1928.25, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.1822828585281968, "epoch": 0.00168, "frac_reward_zero_std": 0.5, "grad_norm": 1.4209487438201904, "kl": 0.0024251511349575594, "learning_rate": 4.571428571428571e-06, "loss": -0.038, "num_tokens": 1620395.0, "reward": 0.6059374809265137, "reward_std": 0.20529377460479736, "rewards/rollout_reward_func/mean": 0.6059374809265137, "rewards/rollout_reward_func/std": 0.4463822841644287, "sampling/importance_sampling_ratio/max": 1.6781816482543945, "sampling/importance_sampling_ratio/mean": 1.0399606227874756, "sampling/importance_sampling_ratio/min": 0.30244705080986023, "sampling/sampling_logp_difference/max": 0.7038769721984863, "sampling/sampling_logp_difference/mean": 0.028661729767918587, "step": 21, "step_time": 16.793115096000065 }, { "clip_ratio/high_max": 0.01126575656235218, "clip_ratio/high_mean": 0.00563287828117609, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007471113582141697, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1895.0, "completions/mean_terminated_length": 1895.0, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.11727871629409492, "epoch": 0.00176, "frac_reward_zero_std": 0.375, "grad_norm": 1.4466348886489868, "kl": 0.003775272169150412, "learning_rate": 4.8e-06, "loss": 0.0009, "num_tokens": 1693497.0, "reward": 0.5303125381469727, "reward_std": 0.2740437984466553, "rewards/rollout_reward_func/mean": 0.5303125381469727, "rewards/rollout_reward_func/std": 0.4203799068927765, "sampling/importance_sampling_ratio/max": 1.6391761302947998, "sampling/importance_sampling_ratio/mean": 0.9023667573928833, "sampling/importance_sampling_ratio/min": 0.30269956588745117, "sampling/sampling_logp_difference/max": 1.103229284286499, "sampling/sampling_logp_difference/mean": 0.028024764731526375, "step": 22, "step_time": 16.31928637299984 }, { "clip_ratio/high_max": 0.006761695956811309, "clip_ratio/high_mean": 0.0033808479784056544, "clip_ratio/low_mean": 0.0034829722717404366, "clip_ratio/low_min": 0.003289473708719015, "clip_ratio/region_mean": 0.006863820250146091, "completions/clipped_ratio": 0.0, "completions/max_length": 2795.0, "completions/max_terminated_length": 2795.0, "completions/mean_length": 1923.9375, "completions/mean_terminated_length": 1923.9375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.14631808176636696, "epoch": 0.00184, "frac_reward_zero_std": 0.625, "grad_norm": 1.9489635229110718, "kl": 0.0022731795979780145, "learning_rate": 5.0285714285714285e-06, "loss": -0.0229, "num_tokens": 1767354.0, "reward": 0.4168750047683716, "reward_std": 0.20417675375938416, "rewards/rollout_reward_func/mean": 0.4168750047683716, "rewards/rollout_reward_func/std": 0.33288994431495667, "sampling/importance_sampling_ratio/max": 2.5520823001861572, "sampling/importance_sampling_ratio/mean": 1.124953269958496, "sampling/importance_sampling_ratio/min": 0.3814745247364044, "sampling/sampling_logp_difference/max": 0.9945569038391113, "sampling/sampling_logp_difference/mean": 0.021298212930560112, "step": 23, "step_time": 16.909164390999877 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0015625000232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004687500069849193, "completions/clipped_ratio": 0.0, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 2009.28125, "completions/mean_terminated_length": 2009.28125, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.18214968033134937, "epoch": 0.00192, "frac_reward_zero_std": 0.5, "grad_norm": 1.610776662826538, "kl": 0.003005845101142768, "learning_rate": 5.257142857142857e-06, "loss": -0.0704, "num_tokens": 1844719.0, "reward": 0.4243749976158142, "reward_std": 0.21341876685619354, "rewards/rollout_reward_func/mean": 0.4243749976158142, "rewards/rollout_reward_func/std": 0.362561970949173, "sampling/importance_sampling_ratio/max": 1.7062076330184937, "sampling/importance_sampling_ratio/mean": 1.0438251495361328, "sampling/importance_sampling_ratio/min": 0.3492918312549591, "sampling/sampling_logp_difference/max": 0.5720778703689575, "sampling/sampling_logp_difference/mean": 0.023630155250430107, "step": 24, "step_time": 17.14499585999988 }, { "clip_ratio/high_max": 0.0031250000465661287, "clip_ratio/high_mean": 0.0015625000232830644, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004687500069849193, "completions/clipped_ratio": 0.0, "completions/max_length": 2440.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 2006.6875, "completions/mean_terminated_length": 2006.6875, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.11316484399139881, "epoch": 0.002, "frac_reward_zero_std": 0.75, "grad_norm": 1.0315803289413452, "kl": 0.0030158030276652426, "learning_rate": 5.485714285714286e-06, "loss": 0.0116, "num_tokens": 1921270.0, "reward": 0.47968751192092896, "reward_std": 0.07062499970197678, "rewards/rollout_reward_func/mean": 0.47968751192092896, "rewards/rollout_reward_func/std": 0.36560583114624023, "sampling/importance_sampling_ratio/max": 1.54865562915802, "sampling/importance_sampling_ratio/mean": 0.9767700433731079, "sampling/importance_sampling_ratio/min": 0.5038349628448486, "sampling/sampling_logp_difference/max": 0.4957547187805176, "sampling/sampling_logp_difference/mean": 0.01319466158747673, "step": 25, "step_time": 15.574967514000036 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0015625000232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005238970625214279, "completions/clipped_ratio": 0.0, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 2016.375, "completions/mean_terminated_length": 2016.375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.1327181551605463, "epoch": 0.00208, "frac_reward_zero_std": 0.625, "grad_norm": 1.1481389999389648, "kl": 0.0036689931839646306, "learning_rate": 5.7142857142857145e-06, "loss": -0.004, "num_tokens": 1998617.0, "reward": 0.4909375011920929, "reward_std": 0.12780338525772095, "rewards/rollout_reward_func/mean": 0.4909375011920929, "rewards/rollout_reward_func/std": 0.3668818771839142, "sampling/importance_sampling_ratio/max": 1.5999431610107422, "sampling/importance_sampling_ratio/mean": 0.9634629487991333, "sampling/importance_sampling_ratio/min": 0.2564904987812042, "sampling/sampling_logp_difference/max": 0.6982665061950684, "sampling/sampling_logp_difference/mean": 0.01972239464521408, "step": 26, "step_time": 17.251899020999645 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 2810.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 1849.84375, "completions/mean_terminated_length": 1849.84375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.09636542806401849, "epoch": 0.00216, "frac_reward_zero_std": 0.5, "grad_norm": 0.7088967561721802, "kl": 0.002837517200532602, "learning_rate": 5.942857142857143e-06, "loss": -0.0039, "num_tokens": 2070057.0, "reward": 0.5687500238418579, "reward_std": 0.2160891890525818, "rewards/rollout_reward_func/mean": 0.5687500238418579, "rewards/rollout_reward_func/std": 0.4140106439590454, "sampling/importance_sampling_ratio/max": 1.6554824113845825, "sampling/importance_sampling_ratio/mean": 1.0057581663131714, "sampling/importance_sampling_ratio/min": 0.13771295547485352, "sampling/sampling_logp_difference/max": 1.693850040435791, "sampling/sampling_logp_difference/mean": 0.015657048672437668, "step": 27, "step_time": 16.657898435000106 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0018382353009656072, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353009656072, "completions/clipped_ratio": 0.0, "completions/max_length": 2800.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 2166.28125, "completions/mean_terminated_length": 2166.28125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.11421543313190341, "epoch": 0.00224, "frac_reward_zero_std": 0.625, "grad_norm": 1.0997257232666016, "kl": 0.001993668673094362, "learning_rate": 6.171428571428571e-06, "loss": 0.0093, "num_tokens": 2152509.0, "reward": 0.4090625047683716, "reward_std": 0.14099711179733276, "rewards/rollout_reward_func/mean": 0.4090625047683716, "rewards/rollout_reward_func/std": 0.3153631389141083, "sampling/importance_sampling_ratio/max": 1.4471888542175293, "sampling/importance_sampling_ratio/mean": 0.9940881133079529, "sampling/importance_sampling_ratio/min": 0.36628207564353943, "sampling/sampling_logp_difference/max": 0.4437229633331299, "sampling/sampling_logp_difference/mean": 0.01664617471396923, "step": 28, "step_time": 16.654660168999953 }, { "clip_ratio/high_max": 0.014613970648497343, "clip_ratio/high_mean": 0.009043096331879497, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010996221215464175, "completions/clipped_ratio": 0.0, "completions/max_length": 2427.0, "completions/max_terminated_length": 2427.0, "completions/mean_length": 1541.1875, "completions/mean_terminated_length": 1541.1875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.11991730704903603, "epoch": 0.00232, "frac_reward_zero_std": 0.375, "grad_norm": 1.753915548324585, "kl": 0.004232257339026546, "learning_rate": 6.4e-06, "loss": -0.0561, "num_tokens": 2213064.0, "reward": 0.7221875190734863, "reward_std": 0.28713130950927734, "rewards/rollout_reward_func/mean": 0.7221875190734863, "rewards/rollout_reward_func/std": 0.47223374247550964, "sampling/importance_sampling_ratio/max": 1.8491572141647339, "sampling/importance_sampling_ratio/mean": 0.9664063453674316, "sampling/importance_sampling_ratio/min": 0.2533723711967468, "sampling/sampling_logp_difference/max": 0.7475757598876953, "sampling/sampling_logp_difference/mean": 0.02090834453701973, "step": 29, "step_time": 14.724566217999836 }, { "clip_ratio/high_max": 0.011101973708719015, "clip_ratio/high_mean": 0.0055509868543595076, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009227457456290722, "completions/clipped_ratio": 0.0, "completions/max_length": 2795.0, "completions/max_terminated_length": 2795.0, "completions/mean_length": 1690.28125, "completions/mean_terminated_length": 1690.28125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.15091887768357992, "epoch": 0.0024, "frac_reward_zero_std": 0.25, "grad_norm": 1.5955613851547241, "kl": 0.003297277187812142, "learning_rate": 6.628571428571428e-06, "loss": -0.0312, "num_tokens": 2278894.0, "reward": 0.5768749713897705, "reward_std": 0.2879711389541626, "rewards/rollout_reward_func/mean": 0.5768749713897705, "rewards/rollout_reward_func/std": 0.4466031789779663, "sampling/importance_sampling_ratio/max": 2.0268845558166504, "sampling/importance_sampling_ratio/mean": 0.9738575220108032, "sampling/importance_sampling_ratio/min": 0.3578207492828369, "sampling/sampling_logp_difference/max": 0.7220923900604248, "sampling/sampling_logp_difference/mean": 0.023466479033231735, "step": 30, "step_time": 15.794334728999956 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.006302521098405123, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006302521098405123, "completions/clipped_ratio": 0.0, "completions/max_length": 2791.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 1802.28125, "completions/mean_terminated_length": 1802.28125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.1271415469236672, "epoch": 0.00248, "frac_reward_zero_std": 0.625, "grad_norm": 0.5290101170539856, "kl": 0.007829649756786239, "learning_rate": 6.857142857142856e-06, "loss": 0.0219, "num_tokens": 2348595.0, "reward": 0.6468750238418579, "reward_std": 0.15625, "rewards/rollout_reward_func/mean": 0.6468750238418579, "rewards/rollout_reward_func/std": 0.4312205910682678, "sampling/importance_sampling_ratio/max": 1.4284578561782837, "sampling/importance_sampling_ratio/mean": 1.008836269378662, "sampling/importance_sampling_ratio/min": 0.5545295476913452, "sampling/sampling_logp_difference/max": 0.9234024286270142, "sampling/sampling_logp_difference/mean": 0.020626772195100784, "step": 31, "step_time": 16.82306190600002 }, { "clip_ratio/high_max": 0.010130719048902392, "clip_ratio/high_mean": 0.005065359524451196, "clip_ratio/low_mean": 0.0036210318794474006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008686391403898597, "completions/clipped_ratio": 0.0, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 1986.28125, "completions/mean_terminated_length": 1986.28125, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.13206067122519016, "epoch": 0.00256, "frac_reward_zero_std": 0.5, "grad_norm": 1.3528786897659302, "kl": 0.008182306781236548, "learning_rate": 7.085714285714285e-06, "loss": -0.0853, "num_tokens": 2424934.0, "reward": 0.4606249928474426, "reward_std": 0.15091878175735474, "rewards/rollout_reward_func/mean": 0.4606249928474426, "rewards/rollout_reward_func/std": 0.3846149146556854, "sampling/importance_sampling_ratio/max": 2.71755313873291, "sampling/importance_sampling_ratio/mean": 1.051027774810791, "sampling/importance_sampling_ratio/min": 0.38737601041793823, "sampling/sampling_logp_difference/max": 0.7733535766601562, "sampling/sampling_logp_difference/mean": 0.020959284156560898, "step": 32, "step_time": 15.57071794400008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "completions/clipped_ratio": 0.0, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 2106.9375, "completions/mean_terminated_length": 2106.9375, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.12445190898142755, "epoch": 0.00264, "frac_reward_zero_std": 0.75, "grad_norm": 0.12888294458389282, "kl": 0.003734915575478226, "learning_rate": 7.314285714285714e-06, "loss": -0.0017, "num_tokens": 2505564.0, "reward": 0.4637500047683716, "reward_std": 0.13466876745224, "rewards/rollout_reward_func/mean": 0.4637500047683716, "rewards/rollout_reward_func/std": 0.37349048256874084, "sampling/importance_sampling_ratio/max": 2.2378056049346924, "sampling/importance_sampling_ratio/mean": 1.0552072525024414, "sampling/importance_sampling_ratio/min": 0.3374383747577667, "sampling/sampling_logp_difference/max": 1.084688663482666, "sampling/sampling_logp_difference/mean": 0.015957504510879517, "step": 33, "step_time": 16.374967034000065 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 1479.28125, "completions/mean_terminated_length": 1479.28125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.07319290563464165, "epoch": 0.00272, "frac_reward_zero_std": 0.375, "grad_norm": 1.5905581712722778, "kl": 0.02232863545214059, "learning_rate": 7.542857142857142e-06, "loss": 0.0285, "num_tokens": 2564043.0, "reward": 0.7212499976158142, "reward_std": 0.31069982051849365, "rewards/rollout_reward_func/mean": 0.7212499976158142, "rewards/rollout_reward_func/std": 0.45388466119766235, "sampling/importance_sampling_ratio/max": 1.5257185697555542, "sampling/importance_sampling_ratio/mean": 0.9286473989486694, "sampling/importance_sampling_ratio/min": 0.35652607679367065, "sampling/sampling_logp_difference/max": 1.0259580612182617, "sampling/sampling_logp_difference/mean": 0.0132124163210392, "step": 34, "step_time": 14.42571913799975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1752.5, "completions/mean_terminated_length": 1752.5, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.09324552165344357, "epoch": 0.0028, "frac_reward_zero_std": 0.5, "grad_norm": 0.717671811580658, "kl": 0.004631695612260955, "learning_rate": 7.771428571428572e-06, "loss": 0.0108, "num_tokens": 2632179.0, "reward": 0.4125000238418579, "reward_std": 0.26933753490448, "rewards/rollout_reward_func/mean": 0.4125000238418579, "rewards/rollout_reward_func/std": 0.36455005407333374, "sampling/importance_sampling_ratio/max": 1.790269136428833, "sampling/importance_sampling_ratio/mean": 1.1154439449310303, "sampling/importance_sampling_ratio/min": 0.7148804068565369, "sampling/sampling_logp_difference/max": 0.596367359161377, "sampling/sampling_logp_difference/mean": 0.01612972654402256, "step": 35, "step_time": 16.94071342400025 }, { "clip_ratio/high_max": 0.003289473708719015, "clip_ratio/high_mean": 0.0016447368543595076, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016447368543595076, "completions/clipped_ratio": 0.0, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 2210.65625, "completions/mean_terminated_length": 2210.65625, "completions/min_length": 1565.0, "completions/min_terminated_length": 1565.0, "entropy": 0.13052229024469852, "epoch": 0.00288, "frac_reward_zero_std": 0.875, "grad_norm": 1.915448784828186, "kl": 0.014049735953449272, "learning_rate": 8e-06, "loss": 0.0114, "num_tokens": 2716101.0, "reward": 0.3787500262260437, "reward_std": 0.0624999962747097, "rewards/rollout_reward_func/mean": 0.3787500262260437, "rewards/rollout_reward_func/std": 0.26268768310546875, "sampling/importance_sampling_ratio/max": 1.6097471714019775, "sampling/importance_sampling_ratio/mean": 0.8499077558517456, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 24.667917251586914, "sampling/sampling_logp_difference/mean": 0.08357222378253937, "step": 36, "step_time": 16.647620255999527 }, { "clip_ratio/high_max": 0.013392857741564512, "clip_ratio/high_mean": 0.006696428870782256, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006696428870782256, "completions/clipped_ratio": 0.0, "completions/max_length": 2787.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 1735.125, "completions/mean_terminated_length": 1735.125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.08023441676050425, "epoch": 0.00296, "frac_reward_zero_std": 0.5, "grad_norm": 1.0067864656448364, "kl": 0.014214006034308113, "learning_rate": 7.999999976246485e-06, "loss": 0.005, "num_tokens": 2783558.0, "reward": 0.5481250286102295, "reward_std": 0.2473391890525818, "rewards/rollout_reward_func/mean": 0.5481250286102295, "rewards/rollout_reward_func/std": 0.41463109850883484, "sampling/importance_sampling_ratio/max": 1.9839459657669067, "sampling/importance_sampling_ratio/mean": 1.0556774139404297, "sampling/importance_sampling_ratio/min": 0.5570288896560669, "sampling/sampling_logp_difference/max": 0.7343063354492188, "sampling/sampling_logp_difference/mean": 0.01291065476834774, "step": 37, "step_time": 15.852365188000249 }, { "clip_ratio/high_max": 0.013429548125714064, "clip_ratio/high_mean": 0.008277274086140096, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008277274086140096, "completions/clipped_ratio": 0.0, "completions/max_length": 2790.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 1825.5625, "completions/mean_terminated_length": 1825.5625, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.07684489572420716, "epoch": 0.00304, "frac_reward_zero_std": 0.375, "grad_norm": 0.9191060662269592, "kl": 0.008681207757035736, "learning_rate": 7.999999904985944e-06, "loss": -0.0211, "num_tokens": 2854054.0, "reward": 0.6575000286102295, "reward_std": 0.32216876745224, "rewards/rollout_reward_func/mean": 0.6575000286102295, "rewards/rollout_reward_func/std": 0.46569401025772095, "sampling/importance_sampling_ratio/max": 1.8603460788726807, "sampling/importance_sampling_ratio/mean": 1.0407439470291138, "sampling/importance_sampling_ratio/min": 0.5873942971229553, "sampling/sampling_logp_difference/max": 0.6204257011413574, "sampling/sampling_logp_difference/mean": 0.011706141754984856, "step": 38, "step_time": 16.086839031999943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2800.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 2006.0625, "completions/mean_terminated_length": 2006.0625, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.09726127330213785, "epoch": 0.00312, "frac_reward_zero_std": 0.75, "grad_norm": 1.442636489868164, "kl": 0.01900345625472255, "learning_rate": 7.999999786218377e-06, "loss": -0.0229, "num_tokens": 2930928.0, "reward": 0.42125001549720764, "reward_std": 0.13466876745224, "rewards/rollout_reward_func/mean": 0.42125001549720764, "rewards/rollout_reward_func/std": 0.3237656354904175, "sampling/importance_sampling_ratio/max": 1.6864854097366333, "sampling/importance_sampling_ratio/mean": 0.9331543445587158, "sampling/importance_sampling_ratio/min": 0.17830577492713928, "sampling/sampling_logp_difference/max": 0.9498655796051025, "sampling/sampling_logp_difference/mean": 0.01994011551141739, "step": 39, "step_time": 17.24544241999979 }, { "clip_ratio/high_max": 0.013523391913622618, "clip_ratio/high_mean": 0.006761695956811309, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008406432811170816, "completions/clipped_ratio": 0.0, "completions/max_length": 2440.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 1899.96875, "completions/mean_terminated_length": 1899.96875, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.09562211390584707, "epoch": 0.0032, "frac_reward_zero_std": 0.75, "grad_norm": 0.7564519047737122, "kl": 0.012280215043574572, "learning_rate": 7.999999619943787e-06, "loss": 0.0388, "num_tokens": 3004071.0, "reward": 0.53125, "reward_std": 0.13466876745224, "rewards/rollout_reward_func/mean": 0.53125, "rewards/rollout_reward_func/std": 0.40266650915145874, "sampling/importance_sampling_ratio/max": 1.7533916234970093, "sampling/importance_sampling_ratio/mean": 1.012838363647461, "sampling/importance_sampling_ratio/min": 0.40302401781082153, "sampling/sampling_logp_difference/max": 0.6195348501205444, "sampling/sampling_logp_difference/mean": 0.016937807202339172, "step": 40, "step_time": 15.278612154999792 }, { "clip_ratio/high_max": 0.013322473270818591, "clip_ratio/high_mean": 0.0066612366354092956, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008305973489768803, "completions/clipped_ratio": 0.0, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 2140.78125, "completions/mean_terminated_length": 2140.78125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.10391843365505338, "epoch": 0.00328, "frac_reward_zero_std": 0.625, "grad_norm": 0.684903621673584, "kl": 0.0072205948672490194, "learning_rate": 7.999999406162173e-06, "loss": 0.0078, "num_tokens": 3085711.0, "reward": 0.41718751192092896, "reward_std": 0.14279377460479736, "rewards/rollout_reward_func/mean": 0.41718751192092896, "rewards/rollout_reward_func/std": 0.33007559180259705, "sampling/importance_sampling_ratio/max": 1.5918241739273071, "sampling/importance_sampling_ratio/mean": 0.900505006313324, "sampling/importance_sampling_ratio/min": 0.29988786578178406, "sampling/sampling_logp_difference/max": 1.036886215209961, "sampling/sampling_logp_difference/mean": 0.019227981567382812, "step": 41, "step_time": 16.941576514999497 }, { "clip_ratio/high_max": 0.0032051282469183207, "clip_ratio/high_mean": 0.0016025641234591603, "clip_ratio/low_mean": 0.0035156250232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005118189146742225, "completions/clipped_ratio": 0.0, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 1944.25, "completions/mean_terminated_length": 1944.25, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.09584560617804527, "epoch": 0.00336, "frac_reward_zero_std": 0.5, "grad_norm": 2.3526816368103027, "kl": 0.027870278747286648, "learning_rate": 7.999999144873542e-06, "loss": 0.1103, "num_tokens": 3160194.0, "reward": 0.4490624964237213, "reward_std": 0.20263297855854034, "rewards/rollout_reward_func/mean": 0.4490624964237213, "rewards/rollout_reward_func/std": 0.36338335275650024, "sampling/importance_sampling_ratio/max": 2.6562483310699463, "sampling/importance_sampling_ratio/mean": 1.036879062652588, "sampling/importance_sampling_ratio/min": 0.24595271050930023, "sampling/sampling_logp_difference/max": 0.9235069751739502, "sampling/sampling_logp_difference/mean": 0.02009188011288643, "step": 42, "step_time": 16.650967574000333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0066964286379516125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0066964286379516125, "completions/clipped_ratio": 0.0, "completions/max_length": 2791.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 1675.03125, "completions/mean_terminated_length": 1675.03125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.12920551793649793, "epoch": 0.00344, "frac_reward_zero_std": 0.375, "grad_norm": 1.3091895580291748, "kl": 0.03491167013999075, "learning_rate": 7.999998836077897e-06, "loss": 0.0888, "num_tokens": 3225903.0, "reward": 0.5174999833106995, "reward_std": 0.3608438968658447, "rewards/rollout_reward_func/mean": 0.5174999833106995, "rewards/rollout_reward_func/std": 0.43675488233566284, "sampling/importance_sampling_ratio/max": 2.373917579650879, "sampling/importance_sampling_ratio/mean": 1.0001801252365112, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3759169578552246, "sampling/sampling_logp_difference/mean": 0.02347693033516407, "step": 43, "step_time": 17.192637601000115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 2095.3125, "completions/mean_terminated_length": 2095.3125, "completions/min_length": 1562.0, "completions/min_terminated_length": 1562.0, "entropy": 0.09558335272595286, "epoch": 0.00352, "frac_reward_zero_std": 0.75, "grad_norm": 0.6341590881347656, "kl": 0.02431841316865757, "learning_rate": 7.99999847977524e-06, "loss": -0.0206, "num_tokens": 3305945.0, "reward": 0.5049999952316284, "reward_std": 0.14433754980564117, "rewards/rollout_reward_func/mean": 0.5049999952316284, "rewards/rollout_reward_func/std": 0.3978976607322693, "sampling/importance_sampling_ratio/max": 2.0796568393707275, "sampling/importance_sampling_ratio/mean": 0.8876084089279175, "sampling/importance_sampling_ratio/min": 0.34211352467536926, "sampling/sampling_logp_difference/max": 1.0857441425323486, "sampling/sampling_logp_difference/mean": 0.01929028518497944, "step": 44, "step_time": 16.607955621999963 }, { "clip_ratio/high_max": 0.009783434681594372, "clip_ratio/high_mean": 0.004891717340797186, "clip_ratio/low_mean": 0.0034007353242486715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008292452665045857, "completions/clipped_ratio": 0.0, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 2426.71875, "completions/mean_terminated_length": 2426.71875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.13120493851602077, "epoch": 0.0036, "frac_reward_zero_std": 0.75, "grad_norm": 1.2010557651519775, "kl": 0.014120981490123086, "learning_rate": 7.999998075965583e-06, "loss": -0.0277, "num_tokens": 3397449.0, "reward": 0.3384374976158142, "reward_std": 0.08029378205537796, "rewards/rollout_reward_func/mean": 0.3384374976158142, "rewards/rollout_reward_func/std": 0.21492847800254822, "sampling/importance_sampling_ratio/max": 2.0331103801727295, "sampling/importance_sampling_ratio/mean": 1.036205768585205, "sampling/importance_sampling_ratio/min": 0.44029006361961365, "sampling/sampling_logp_difference/max": 0.8087775707244873, "sampling/sampling_logp_difference/mean": 0.021842796355485916, "step": 45, "step_time": 17.039232532999677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 2035.78125, "completions/mean_terminated_length": 2035.78125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.10772825870662928, "epoch": 0.00368, "frac_reward_zero_std": 0.75, "grad_norm": 0.6874250173568726, "kl": 0.02060257241828367, "learning_rate": 7.99999762464893e-06, "loss": 0.0037, "num_tokens": 3475466.0, "reward": 0.35374999046325684, "reward_std": 0.13466878235340118, "rewards/rollout_reward_func/mean": 0.35374999046325684, "rewards/rollout_reward_func/std": 0.26349693536758423, "sampling/importance_sampling_ratio/max": 2.4506301879882812, "sampling/importance_sampling_ratio/mean": 1.0585436820983887, "sampling/importance_sampling_ratio/min": 0.20336686074733734, "sampling/sampling_logp_difference/max": 1.0288989543914795, "sampling/sampling_logp_difference/mean": 0.019599031656980515, "step": 46, "step_time": 17.008759735000012 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0055147059028968215, "completions/clipped_ratio": 0.0, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 1804.875, "completions/mean_terminated_length": 1804.875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.11447951383888721, "epoch": 0.00376, "frac_reward_zero_std": 0.5, "grad_norm": 0.9577285051345825, "kl": 0.0146886624279432, "learning_rate": 7.999997125825284e-06, "loss": 0.018, "num_tokens": 3545642.0, "reward": 0.550000011920929, "reward_std": 0.25521132349967957, "rewards/rollout_reward_func/mean": 0.550000011920929, "rewards/rollout_reward_func/std": 0.4146043360233307, "sampling/importance_sampling_ratio/max": 2.3037173748016357, "sampling/importance_sampling_ratio/mean": 1.043008804321289, "sampling/importance_sampling_ratio/min": 0.5624377727508545, "sampling/sampling_logp_difference/max": 0.6249582767486572, "sampling/sampling_logp_difference/mean": 0.017069321125745773, "step": 47, "step_time": 17.18442160299992 }, { "clip_ratio/high_max": 0.011488970601931214, "clip_ratio/high_mean": 0.005744485300965607, "clip_ratio/low_mean": 0.003968254080973566, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009712739381939173, "completions/clipped_ratio": 0.0, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 1970.9375, "completions/mean_terminated_length": 1970.9375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.09903696551918983, "epoch": 0.00384, "frac_reward_zero_std": 0.625, "grad_norm": 1.1492125988006592, "kl": 0.018403344380203635, "learning_rate": 7.999996579494655e-06, "loss": 0.0456, "num_tokens": 3621220.0, "reward": 0.4325000047683716, "reward_std": 0.19716876745224, "rewards/rollout_reward_func/mean": 0.4325000047683716, "rewards/rollout_reward_func/std": 0.3565334677696228, "sampling/importance_sampling_ratio/max": 2.369974136352539, "sampling/importance_sampling_ratio/mean": 1.0478521585464478, "sampling/importance_sampling_ratio/min": 0.11733747273683548, "sampling/sampling_logp_difference/max": 1.4917361736297607, "sampling/sampling_logp_difference/mean": 0.022218499332666397, "step": 48, "step_time": 16.41912825899999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 2252.875, "completions/mean_terminated_length": 2252.875, "completions/min_length": 1565.0, "completions/min_terminated_length": 1565.0, "entropy": 0.09296703850850463, "epoch": 0.00392, "frac_reward_zero_std": 0.75, "grad_norm": 0.5641649961471558, "kl": 0.03362530213780701, "learning_rate": 7.999995985657054e-06, "loss": -0.0171, "num_tokens": 3706955.0, "reward": 0.42624998092651367, "reward_std": 0.13466876745224, "rewards/rollout_reward_func/mean": 0.42624998092651367, "rewards/rollout_reward_func/std": 0.3314265012741089, "sampling/importance_sampling_ratio/max": 1.5233134031295776, "sampling/importance_sampling_ratio/mean": 0.9442777633666992, "sampling/importance_sampling_ratio/min": 0.2804383933544159, "sampling/sampling_logp_difference/max": 1.1384481191635132, "sampling/sampling_logp_difference/mean": 0.01721033826470375, "step": 49, "step_time": 16.923561193999603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 2463.8125, "completions/mean_terminated_length": 2463.8125, "completions/min_length": 2040.0, "completions/min_terminated_length": 2040.0, "entropy": 0.09066143818199635, "epoch": 0.004, "frac_reward_zero_std": 1.0, "grad_norm": 0.10026834160089493, "kl": 0.030927304484066553, "learning_rate": 7.99999534431249e-06, "loss": 0.0004, "num_tokens": 3799818.0, "reward": 0.30000001192092896, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.30000001192092896, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.050495147705078, "sampling/importance_sampling_ratio/mean": 1.0762633085250854, "sampling/importance_sampling_ratio/min": 0.29474106431007385, "sampling/sampling_logp_difference/max": 1.1381915807724, "sampling/sampling_logp_difference/mean": 0.01666702888906002, "step": 50, "step_time": 17.35131004799996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353009656072, "completions/clipped_ratio": 0.0, "completions/max_length": 2786.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 1773.84375, "completions/mean_terminated_length": 1773.84375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.06475613545626402, "epoch": 0.00408, "frac_reward_zero_std": 0.625, "grad_norm": 3.30958890914917, "kl": 0.015178573405137286, "learning_rate": 7.99999465546097e-06, "loss": 0.0343, "num_tokens": 3868368.0, "reward": 0.49562498927116394, "reward_std": 0.1848391890525818, "rewards/rollout_reward_func/mean": 0.49562498927116394, "rewards/rollout_reward_func/std": 0.4042271077632904, "sampling/importance_sampling_ratio/max": 2.036992311477661, "sampling/importance_sampling_ratio/mean": 1.0444798469543457, "sampling/importance_sampling_ratio/min": 0.6844988465309143, "sampling/sampling_logp_difference/max": 0.4961543083190918, "sampling/sampling_logp_difference/mean": 0.009073879569768906, "step": 51, "step_time": 17.00100525200014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 1786.375, "completions/mean_terminated_length": 1786.375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.08096808800473809, "epoch": 0.00416, "frac_reward_zero_std": 0.375, "grad_norm": 1.6158044338226318, "kl": 0.020677090127719566, "learning_rate": 7.99999391910251e-06, "loss": -0.0195, "num_tokens": 3937756.0, "reward": 0.4912499785423279, "reward_std": 0.33183753490448, "rewards/rollout_reward_func/mean": 0.4912499785423279, "rewards/rollout_reward_func/std": 0.42849886417388916, "sampling/importance_sampling_ratio/max": 1.5450013875961304, "sampling/importance_sampling_ratio/mean": 0.9464795589447021, "sampling/importance_sampling_ratio/min": 0.39816370606422424, "sampling/sampling_logp_difference/max": 1.0278459787368774, "sampling/sampling_logp_difference/mean": 0.015709228813648224, "step": 52, "step_time": 16.014014809000173 }, { "clip_ratio/high_max": 0.003289473708719015, "clip_ratio/high_mean": 0.0016447368543595076, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016447368543595076, "completions/clipped_ratio": 0.0, "completions/max_length": 2799.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 1632.4375, "completions/mean_terminated_length": 1632.4375, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.06091495987493545, "epoch": 0.00424, "frac_reward_zero_std": 0.5, "grad_norm": 0.7226957678794861, "kl": 0.017386693391017616, "learning_rate": 7.999993135237117e-06, "loss": 0.0201, "num_tokens": 4001420.0, "reward": 0.6793749928474426, "reward_std": 0.2570079565048218, "rewards/rollout_reward_func/mean": 0.6793749928474426, "rewards/rollout_reward_func/std": 0.46435481309890747, "sampling/importance_sampling_ratio/max": 1.579625129699707, "sampling/importance_sampling_ratio/mean": 0.9135901927947998, "sampling/importance_sampling_ratio/min": 0.35018137097358704, "sampling/sampling_logp_difference/max": 1.1071686744689941, "sampling/sampling_logp_difference/mean": 0.015183830633759499, "step": 53, "step_time": 15.960780017000161 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 1589.6875, "completions/mean_terminated_length": 1589.6875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.054683255730196834, "epoch": 0.00432, "frac_reward_zero_std": 0.375, "grad_norm": 1.0967762470245361, "kl": 0.021693169721402228, "learning_rate": 7.999992303864804e-06, "loss": -0.0452, "num_tokens": 4063548.0, "reward": 0.7262499928474426, "reward_std": 0.3071783781051636, "rewards/rollout_reward_func/mean": 0.7262499928474426, "rewards/rollout_reward_func/std": 0.45173320174217224, "sampling/importance_sampling_ratio/max": 1.3698084354400635, "sampling/importance_sampling_ratio/mean": 0.9228720664978027, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3379874229431152, "sampling/sampling_logp_difference/mean": 0.013593094423413277, "step": 54, "step_time": 16.424249653999823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0057151151122525334, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0057151151122525334, "completions/clipped_ratio": 0.0, "completions/max_length": 2435.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 1791.34375, "completions/mean_terminated_length": 1791.34375, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.06585042458027601, "epoch": 0.0044, "frac_reward_zero_std": 0.625, "grad_norm": 1.214758038520813, "kl": 0.024209468625485897, "learning_rate": 7.999991424985586e-06, "loss": 0.0112, "num_tokens": 4132756.0, "reward": 0.4637500047683716, "reward_std": 0.19184717535972595, "rewards/rollout_reward_func/mean": 0.4637500047683716, "rewards/rollout_reward_func/std": 0.35306718945503235, "sampling/importance_sampling_ratio/max": 2.3594348430633545, "sampling/importance_sampling_ratio/mean": 1.098282814025879, "sampling/importance_sampling_ratio/min": 0.3494420647621155, "sampling/sampling_logp_difference/max": 1.4460781812667847, "sampling/sampling_logp_difference/mean": 0.01887095905840397, "step": 55, "step_time": 15.803478434999533 }, { "clip_ratio/high_max": 0.003289473708719015, "clip_ratio/high_mean": 0.0016447368543595076, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016447368543595076, "completions/clipped_ratio": 0.0, "completions/max_length": 2794.0, "completions/max_terminated_length": 2794.0, "completions/mean_length": 2308.09375, "completions/mean_terminated_length": 2308.09375, "completions/min_length": 1568.0, "completions/min_terminated_length": 1568.0, "entropy": 0.09696381096728146, "epoch": 0.00448, "frac_reward_zero_std": 0.875, "grad_norm": 0.32451269030570984, "kl": 0.019486179284285754, "learning_rate": 7.999990498599477e-06, "loss": -0.0013, "num_tokens": 4220279.0, "reward": 0.3631250262260437, "reward_std": 0.05983918905258179, "rewards/rollout_reward_func/mean": 0.3631250262260437, "rewards/rollout_reward_func/std": 0.2257665991783142, "sampling/importance_sampling_ratio/max": 1.3443750143051147, "sampling/importance_sampling_ratio/mean": 0.9363906979560852, "sampling/importance_sampling_ratio/min": 0.3993731737136841, "sampling/sampling_logp_difference/max": 0.668013334274292, "sampling/sampling_logp_difference/mean": 0.01326768472790718, "step": 56, "step_time": 16.922060316999477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 1963.34375, "completions/mean_terminated_length": 1963.34375, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.04914194135926664, "epoch": 0.00456, "frac_reward_zero_std": 0.875, "grad_norm": 0.5049991607666016, "kl": 0.02837482520226331, "learning_rate": 7.99998952470649e-06, "loss": -0.008, "num_tokens": 4295471.0, "reward": 0.48374998569488525, "reward_std": 0.0624999962747097, "rewards/rollout_reward_func/mean": 0.48374998569488525, "rewards/rollout_reward_func/std": 0.3627649247646332, "sampling/importance_sampling_ratio/max": 1.912903904914856, "sampling/importance_sampling_ratio/mean": 0.9784045219421387, "sampling/importance_sampling_ratio/min": 0.4301539659500122, "sampling/sampling_logp_difference/max": 0.8482755422592163, "sampling/sampling_logp_difference/mean": 0.010939370840787888, "step": 57, "step_time": 15.453092248000303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034007353242486715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034007353242486715, "completions/clipped_ratio": 0.0, "completions/max_length": 2787.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 1872.28125, "completions/mean_terminated_length": 1872.28125, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.06335801596287638, "epoch": 0.00464, "frac_reward_zero_std": 0.625, "grad_norm": 0.6874297261238098, "kl": 0.03325861978373723, "learning_rate": 7.999988503306642e-06, "loss": -0.0215, "num_tokens": 4367703.0, "reward": 0.4325000047683716, "reward_std": 0.19716876745224, "rewards/rollout_reward_func/mean": 0.4325000047683716, "rewards/rollout_reward_func/std": 0.3528958261013031, "sampling/importance_sampling_ratio/max": 1.3908125162124634, "sampling/importance_sampling_ratio/mean": 0.8833715319633484, "sampling/importance_sampling_ratio/min": 1.9220989599944005e-07, "sampling/sampling_logp_difference/max": 13.085790634155273, "sampling/sampling_logp_difference/mean": 0.04347304627299309, "step": 58, "step_time": 16.225704187000247 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "completions/clipped_ratio": 0.0, "completions/max_length": 2810.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 2212.5, "completions/mean_terminated_length": 2212.5, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.0681739835999906, "epoch": 0.00472, "frac_reward_zero_std": 0.75, "grad_norm": 0.8530667424201965, "kl": 0.03184444556973176, "learning_rate": 7.999987434399948e-06, "loss": 0.0114, "num_tokens": 4452096.0, "reward": 0.4006250202655792, "reward_std": 0.13200798630714417, "rewards/rollout_reward_func/mean": 0.4006250202655792, "rewards/rollout_reward_func/std": 0.29461774230003357, "sampling/importance_sampling_ratio/max": 1.6479169130325317, "sampling/importance_sampling_ratio/mean": 1.004500150680542, "sampling/importance_sampling_ratio/min": 0.30276352167129517, "sampling/sampling_logp_difference/max": 1.1370731592178345, "sampling/sampling_logp_difference/mean": 0.014959340915083885, "step": 59, "step_time": 17.880038248999654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 2438.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 2130.46875, "completions/mean_terminated_length": 2130.46875, "completions/min_length": 1567.0, "completions/min_terminated_length": 1567.0, "entropy": 0.055047230795025826, "epoch": 0.0048, "frac_reward_zero_std": 0.75, "grad_norm": 0.3532449007034302, "kl": 0.01998938515316695, "learning_rate": 7.999986317986426e-06, "loss": -0.0728, "num_tokens": 4533067.0, "reward": 0.3746874928474426, "reward_std": 0.07062499970197678, "rewards/rollout_reward_func/mean": 0.3746874928474426, "rewards/rollout_reward_func/std": 0.26494044065475464, "sampling/importance_sampling_ratio/max": 1.6606436967849731, "sampling/importance_sampling_ratio/mean": 0.94105064868927, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8746337890625, "sampling/sampling_logp_difference/mean": 0.013816887512803078, "step": 60, "step_time": 15.591869645000315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "completions/clipped_ratio": 0.0, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 1693.875, "completions/mean_terminated_length": 1693.875, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.05530149070546031, "epoch": 0.00488, "frac_reward_zero_std": 0.375, "grad_norm": 1.1544066667556763, "kl": 0.016453518153866753, "learning_rate": 7.999985154066091e-06, "loss": 0.0093, "num_tokens": 4598963.0, "reward": 0.5693750381469727, "reward_std": 0.32917672395706177, "rewards/rollout_reward_func/mean": 0.5693750381469727, "rewards/rollout_reward_func/std": 0.4425143301486969, "sampling/importance_sampling_ratio/max": 2.6382031440734863, "sampling/importance_sampling_ratio/mean": 1.0790200233459473, "sampling/importance_sampling_ratio/min": 0.7093254327774048, "sampling/sampling_logp_difference/max": 0.6202226877212524, "sampling/sampling_logp_difference/mean": 0.011032961308956146, "step": 61, "step_time": 16.082178944999896 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 2304.9375, "completions/mean_terminated_length": 2304.9375, "completions/min_length": 1559.0, "completions/min_terminated_length": 1559.0, "entropy": 0.06317996443249285, "epoch": 0.00496, "frac_reward_zero_std": 0.75, "grad_norm": 0.9210407137870789, "kl": 0.019177823327481747, "learning_rate": 7.999983942638965e-06, "loss": -0.0262, "num_tokens": 4686091.0, "reward": 0.3434374928474426, "reward_std": 0.08029377460479736, "rewards/rollout_reward_func/mean": 0.3434374928474426, "rewards/rollout_reward_func/std": 0.22245851159095764, "sampling/importance_sampling_ratio/max": 1.911454677581787, "sampling/importance_sampling_ratio/mean": 0.9809565544128418, "sampling/importance_sampling_ratio/min": 0.25244244933128357, "sampling/sampling_logp_difference/max": 0.9257916212081909, "sampling/sampling_logp_difference/mean": 0.013646715320646763, "step": 62, "step_time": 16.97750502200006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1974.96875, "completions/mean_terminated_length": 1974.96875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.04369071568362415, "epoch": 0.00504, "frac_reward_zero_std": 1.0, "grad_norm": 0.14146260917186737, "kl": 0.05203759076539427, "learning_rate": 7.999982683705066e-06, "loss": 0.0006, "num_tokens": 4761655.0, "reward": 0.2800000011920929, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.2800000011920929, "rewards/rollout_reward_func/std": 0.05376172438263893, "sampling/importance_sampling_ratio/max": 2.2244811058044434, "sampling/importance_sampling_ratio/mean": 1.0279231071472168, "sampling/importance_sampling_ratio/min": 0.3840067684650421, "sampling/sampling_logp_difference/max": 0.9096496105194092, "sampling/sampling_logp_difference/mean": 0.010698029771447182, "step": 63, "step_time": 16.219797216000416 }, { "clip_ratio/high_max": 0.015190972248092294, "clip_ratio/high_mean": 0.007595486124046147, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007595486124046147, "completions/clipped_ratio": 0.0, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 1982.6875, "completions/mean_terminated_length": 1982.6875, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.07828675024211407, "epoch": 0.00512, "frac_reward_zero_std": 0.5, "grad_norm": 0.7813690304756165, "kl": 0.021903582994127646, "learning_rate": 7.999981377264413e-06, "loss": 0.0011, "num_tokens": 4838046.0, "reward": 0.5843750238418579, "reward_std": 0.22841876745224, "rewards/rollout_reward_func/mean": 0.5843750238418579, "rewards/rollout_reward_func/std": 0.4214792251586914, "sampling/importance_sampling_ratio/max": 1.827079176902771, "sampling/importance_sampling_ratio/mean": 1.0283104181289673, "sampling/importance_sampling_ratio/min": 0.25544387102127075, "sampling/sampling_logp_difference/max": 1.0380005836486816, "sampling/sampling_logp_difference/mean": 0.015725988894701004, "step": 64, "step_time": 16.373142061000408 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0018382353009656072, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 2009.3125, "completions/mean_terminated_length": 2009.3125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.05688254698179662, "epoch": 0.0052, "frac_reward_zero_std": 0.75, "grad_norm": 0.2757700979709625, "kl": 0.036635934142395854, "learning_rate": 7.999980023317026e-06, "loss": 0.0247, "num_tokens": 4914779.0, "reward": 0.45250001549720764, "reward_std": 0.125, "rewards/rollout_reward_func/mean": 0.45250001549720764, "rewards/rollout_reward_func/std": 0.3471450209617615, "sampling/importance_sampling_ratio/max": 1.4275991916656494, "sampling/importance_sampling_ratio/mean": 0.908840537071228, "sampling/importance_sampling_ratio/min": 0.23684662580490112, "sampling/sampling_logp_difference/max": 1.4337669610977173, "sampling/sampling_logp_difference/mean": 0.017221834510564804, "step": 65, "step_time": 16.79740752900011 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0018382353009656072, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353009656072, "completions/clipped_ratio": 0.0, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 2059.75, "completions/mean_terminated_length": 2059.75, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.07610421534627676, "epoch": 0.00528, "frac_reward_zero_std": 0.75, "grad_norm": 1.4526886940002441, "kl": 0.027375709789339453, "learning_rate": 7.999978621862929e-06, "loss": 0.0203, "num_tokens": 4993633.0, "reward": 0.4793750047683716, "reward_std": 0.13200797140598297, "rewards/rollout_reward_func/mean": 0.4793750047683716, "rewards/rollout_reward_func/std": 0.3699514865875244, "sampling/importance_sampling_ratio/max": 1.815674066543579, "sampling/importance_sampling_ratio/mean": 1.0289226770401, "sampling/importance_sampling_ratio/min": 0.5281765460968018, "sampling/sampling_logp_difference/max": 0.6608150005340576, "sampling/sampling_logp_difference/mean": 0.01253808755427599, "step": 66, "step_time": 16.794041812999467 }, { "clip_ratio/high_max": 0.011259191203862429, "clip_ratio/high_mean": 0.005629595601931214, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007582720601931214, "completions/clipped_ratio": 0.0, "completions/max_length": 2812.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 1777.84375, "completions/mean_terminated_length": 1777.84375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.06151273613795638, "epoch": 0.00536, "frac_reward_zero_std": 0.5, "grad_norm": 1.1407759189605713, "kl": 0.02162244959617965, "learning_rate": 7.999977172902144e-06, "loss": -0.0017, "num_tokens": 5062556.0, "reward": 0.48000001907348633, "reward_std": 0.25434714555740356, "rewards/rollout_reward_func/mean": 0.48000001907348633, "rewards/rollout_reward_func/std": 0.3861806094646454, "sampling/importance_sampling_ratio/max": 1.6365498304367065, "sampling/importance_sampling_ratio/mean": 1.0415589809417725, "sampling/importance_sampling_ratio/min": 0.4183502197265625, "sampling/sampling_logp_difference/max": 0.8754826188087463, "sampling/sampling_logp_difference/mean": 0.014012180268764496, "step": 67, "step_time": 17.151742474000002 }, { "clip_ratio/high_max": 0.0058139534667134285, "clip_ratio/high_mean": 0.0029069767333567142, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006031976779922843, "completions/clipped_ratio": 0.0, "completions/max_length": 2778.0, "completions/max_terminated_length": 2778.0, "completions/mean_length": 2272.375, "completions/mean_terminated_length": 2272.375, "completions/min_length": 2023.0, "completions/min_terminated_length": 2023.0, "entropy": 0.08835586486384273, "epoch": 0.00544, "frac_reward_zero_std": 0.75, "grad_norm": 1.3424510955810547, "kl": 0.030128376907669008, "learning_rate": 7.999975676434692e-06, "loss": -0.0562, "num_tokens": 5148670.0, "reward": 0.2918750047683716, "reward_std": 0.0162500012665987, "rewards/rollout_reward_func/mean": 0.2918750047683716, "rewards/rollout_reward_func/std": 0.031971510499715805, "sampling/importance_sampling_ratio/max": 2.0581729412078857, "sampling/importance_sampling_ratio/mean": 1.0349149703979492, "sampling/importance_sampling_ratio/min": 0.4890825152397156, "sampling/sampling_logp_difference/max": 0.7274646759033203, "sampling/sampling_logp_difference/mean": 0.019628014415502548, "step": 68, "step_time": 16.927781462999747 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 1876.4375, "completions/mean_terminated_length": 1876.4375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.0765807363204658, "epoch": 0.00552, "frac_reward_zero_std": 0.625, "grad_norm": 0.9624868631362915, "kl": 0.05591569049283862, "learning_rate": 7.999974132460596e-06, "loss": 0.0124, "num_tokens": 5221035.0, "reward": 0.4793750047683716, "reward_std": 0.17558756470680237, "rewards/rollout_reward_func/mean": 0.4793750047683716, "rewards/rollout_reward_func/std": 0.3699514865875244, "sampling/importance_sampling_ratio/max": 1.3370881080627441, "sampling/importance_sampling_ratio/mean": 0.9259651899337769, "sampling/importance_sampling_ratio/min": 0.3385222852230072, "sampling/sampling_logp_difference/max": 1.2049891948699951, "sampling/sampling_logp_difference/mean": 0.014953669160604477, "step": 69, "step_time": 15.703423997999835 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 2437.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 1896.46875, "completions/mean_terminated_length": 1896.46875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.05552901164628565, "epoch": 0.0056, "frac_reward_zero_std": 0.75, "grad_norm": 0.2667936384677887, "kl": 0.014588030888262438, "learning_rate": 7.999972540979884e-06, "loss": 0.0088, "num_tokens": 5294017.0, "reward": 0.5, "reward_std": 0.125, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.39909571409225464, "sampling/importance_sampling_ratio/max": 1.607146143913269, "sampling/importance_sampling_ratio/mean": 0.9701419472694397, "sampling/importance_sampling_ratio/min": 0.401731014251709, "sampling/sampling_logp_difference/max": 1.0000518560409546, "sampling/sampling_logp_difference/mean": 0.010325020179152489, "step": 70, "step_time": 15.35116849200017 }, { "clip_ratio/high_max": 0.0069659443106502295, "clip_ratio/high_mean": 0.0034829721553251147, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034829721553251147, "completions/clipped_ratio": 0.0, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 2170.65625, "completions/mean_terminated_length": 2170.65625, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.09610213804990053, "epoch": 0.00568, "frac_reward_zero_std": 0.75, "grad_norm": 0.5360183119773865, "kl": 0.029458090604748577, "learning_rate": 7.99997090199258e-06, "loss": 0.0331, "num_tokens": 5376683.0, "reward": 0.515625, "reward_std": 0.12233918905258179, "rewards/rollout_reward_func/mean": 0.515625, "rewards/rollout_reward_func/std": 0.3893662095069885, "sampling/importance_sampling_ratio/max": 2.599726438522339, "sampling/importance_sampling_ratio/mean": 0.9910818934440613, "sampling/importance_sampling_ratio/min": 0.38567137718200684, "sampling/sampling_logp_difference/max": 0.9569785594940186, "sampling/sampling_logp_difference/mean": 0.02182850055396557, "step": 71, "step_time": 18.631503620000103 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003574346425011754, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 1713.6875, "completions/mean_terminated_length": 1713.6875, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.05361162032932043, "epoch": 0.00576, "frac_reward_zero_std": 0.5, "grad_norm": 0.6218546628952026, "kl": 0.02329419369925745, "learning_rate": 7.999969215498707e-06, "loss": -0.0147, "num_tokens": 5443189.0, "reward": 0.6181250214576721, "reward_std": 0.2755875587463379, "rewards/rollout_reward_func/mean": 0.6181250214576721, "rewards/rollout_reward_func/std": 0.45625001192092896, "sampling/importance_sampling_ratio/max": 1.4299272298812866, "sampling/importance_sampling_ratio/mean": 0.9309602975845337, "sampling/importance_sampling_ratio/min": 0.39184144139289856, "sampling/sampling_logp_difference/max": 0.9363220930099487, "sampling/sampling_logp_difference/mean": 0.011649301275610924, "step": 72, "step_time": 15.753308050000896 }, { "clip_ratio/high_max": 0.011660009622573853, "clip_ratio/high_mean": 0.005830004811286926, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007566115935333073, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 1721.4375, "completions/mean_terminated_length": 1721.4375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.06148386397399008, "epoch": 0.00584, "frac_reward_zero_std": 0.375, "grad_norm": 1.5323787927627563, "kl": 0.02491366575122811, "learning_rate": 7.999967481498294e-06, "loss": 0.0102, "num_tokens": 5509968.0, "reward": 0.5487499833106995, "reward_std": 0.33183753490448, "rewards/rollout_reward_func/mean": 0.5487499833106995, "rewards/rollout_reward_func/std": 0.4589731991291046, "sampling/importance_sampling_ratio/max": 1.9993343353271484, "sampling/importance_sampling_ratio/mean": 0.9622111916542053, "sampling/importance_sampling_ratio/min": 0.34618350863456726, "sampling/sampling_logp_difference/max": 1.054746150970459, "sampling/sampling_logp_difference/mean": 0.013135725632309914, "step": 73, "step_time": 16.009405589999915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2792.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 1907.75, "completions/mean_terminated_length": 1907.75, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.08447153866291046, "epoch": 0.00592, "frac_reward_zero_std": 0.625, "grad_norm": 1.2818049192428589, "kl": 0.03082829067716375, "learning_rate": 7.999965699991369e-06, "loss": 0.0268, "num_tokens": 5583748.0, "reward": 0.44875001907348633, "reward_std": 0.20683756470680237, "rewards/rollout_reward_func/mean": 0.44875001907348633, "rewards/rollout_reward_func/std": 0.3857104480266571, "sampling/importance_sampling_ratio/max": 1.9656041860580444, "sampling/importance_sampling_ratio/mean": 1.0510060787200928, "sampling/importance_sampling_ratio/min": 0.4638214409351349, "sampling/sampling_logp_difference/max": 0.651539146900177, "sampling/sampling_logp_difference/mean": 0.016923408955335617, "step": 74, "step_time": 16.37548145899973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016447368543595076, "completions/clipped_ratio": 0.0, "completions/max_length": 2794.0, "completions/max_terminated_length": 2794.0, "completions/mean_length": 1590.5, "completions/mean_terminated_length": 1590.5, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.051554064732044935, "epoch": 0.006, "frac_reward_zero_std": 0.75, "grad_norm": 0.4656570851802826, "kl": 0.039722154615446925, "learning_rate": 7.99996387097796e-06, "loss": -0.0273, "num_tokens": 5646147.0, "reward": 0.8512499928474426, "reward_std": 0.13466876745224, "rewards/rollout_reward_func/mean": 0.8512499928474426, "rewards/rollout_reward_func/std": 0.4309011399745941, "sampling/importance_sampling_ratio/max": 2.459237575531006, "sampling/importance_sampling_ratio/mean": 1.0666892528533936, "sampling/importance_sampling_ratio/min": 0.403707355260849, "sampling/sampling_logp_difference/max": 0.9005355834960938, "sampling/sampling_logp_difference/mean": 0.012501123361289501, "step": 75, "step_time": 16.584246522000285 }, { "epoch": 0.006, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2216.95, "eval_completions/max_terminated_length": 2216.95, "eval_completions/mean_length": 1925.65, "eval_completions/mean_terminated_length": 1925.65, "eval_completions/min_length": 1634.35, "eval_completions/min_terminated_length": 1634.35, "eval_entropy": 0.07575540114194154, "eval_frac_reward_zero_std": 1.0, "eval_kl": 0.031034281105894478, "eval_loss": 4.469734994927421e-05, "eval_num_tokens": 5646147.0, "eval_reward": 0.6004999987781048, "eval_reward_std": 0.0, "eval_rewards/rollout_reward_func/mean": 0.6004999987781048, "eval_rewards/rollout_reward_func/std": 0.24536602906882762, "eval_runtime": 13.987, "eval_samples_per_second": 0.715, "eval_sampling/importance_sampling_ratio/max": 1.2220476478338242, "eval_sampling/importance_sampling_ratio/mean": 1.0154326111078262, "eval_sampling/importance_sampling_ratio/min": 0.8088175728917122, "eval_sampling/sampling_logp_difference/max": 0.25984298419207336, "eval_sampling/sampling_logp_difference/mean": 0.014358489285223186, "eval_steps_per_second": 0.357, "step": 75 } ], "logging_steps": 1.0, "max_steps": 25000, "num_input_tokens_seen": 5646147, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }