{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21344717182497333, "eval_steps": 1000, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 1134.15625, "completions/mean_terminated_length": 835.1364135742188, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 2.431631090119481, "epoch": 0.0010672358591248667, "frac_reward_zero_std": 0.3125, "grad_norm": 0.030865265056490898, "kl": 0.0, "learning_rate": 0.0, "loss": 0.093, "num_tokens": 86621.0, "reward": 0.5623373985290527, "reward_std": 0.03425048291683197, "rewards/argmax_reward_func/mean": 0.0, "rewards/argmax_reward_func/std": 0.0, "rewards/criterion_gradient_reward_func/mean": 0.48655617237091064, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.07578125596046448, "rewards/format_reward_func/std": 0.07336507737636566, "sampling/importance_sampling_ratio/max": 2.5961427688598633, "sampling/importance_sampling_ratio/mean": 0.5346519947052002, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4223146438598633, "sampling/sampling_logp_difference/mean": 0.02805997245013714, "step": 1, "step_time": 41.49941225499879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1172.28125, "completions/mean_terminated_length": 625.4705810546875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 2.8236957266926765, "epoch": 0.0021344717182497333, "frac_reward_zero_std": 0.3125, "grad_norm": 0.027303757146000862, "kl": 0.0, "learning_rate": 1e-05, "loss": -0.0296, "num_tokens": 184412.0, "reward": -0.35546875, "reward_std": 0.14252620935440063, "rewards/argmax_reward_func/mean": 0.0625, "rewards/argmax_reward_func/std": 0.24593468010425568, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.08203125, "rewards/format_reward_func/std": 0.07218769192695618, "sampling/importance_sampling_ratio/max": 2.005566120147705, "sampling/importance_sampling_ratio/mean": 0.35993391275405884, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4357408285140991, "sampling/sampling_logp_difference/mean": 0.03387967497110367, "step": 2, "step_time": 33.432537868996405 }, { "clip_ratio/high_max": 0.003270158606028417, "clip_ratio/high_mean": 0.003270158606028417, "clip_ratio/low_mean": 0.0005212853538978379, "clip_ratio/low_min": 0.0005212853538978379, "clip_ratio/region_mean": 0.0037914439708401915, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 1038.875, "completions/mean_terminated_length": 644.3809814453125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5402897894382477, "epoch": 0.0032017075773745998, "frac_reward_zero_std": 0.1875, "grad_norm": 0.028486698865890503, "kl": 0.0013464697112794966, "learning_rate": 2e-05, "loss": -0.0726, "num_tokens": 275662.0, "reward": 0.4338242709636688, "reward_std": 0.19887378811836243, "rewards/argmax_reward_func/mean": 0.09375, "rewards/argmax_reward_func/std": 0.2961445748806, "rewards/criterion_gradient_reward_func/mean": 0.24632428586483002, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.08255008608102798, "sampling/importance_sampling_ratio/max": 2.0627009868621826, "sampling/importance_sampling_ratio/mean": 0.4637377858161926, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.44522714614868164, "sampling/sampling_logp_difference/mean": 0.03064192458987236, "step": 3, "step_time": 36.592352470999685 }, { "clip_ratio/high_max": 0.0032231879195023794, "clip_ratio/high_mean": 0.0032231879195023794, "clip_ratio/low_mean": 0.0008163519705703948, "clip_ratio/low_min": 0.0008163519705703948, "clip_ratio/region_mean": 0.004039539890072774, "completions/clipped_ratio": 0.4375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 1160.46875, "completions/mean_terminated_length": 669.2777709960938, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 2.1913167480379343, "epoch": 0.004268943436499467, "frac_reward_zero_std": 0.1875, "grad_norm": 0.023701097816228867, "kl": 0.0012527473663794808, "learning_rate": 3e-05, "loss": -0.0412, "num_tokens": 366843.0, "reward": -0.3968749940395355, "reward_std": 0.11490485072135925, "rewards/argmax_reward_func/mean": 0.03125, "rewards/argmax_reward_func/std": 0.1767766922712326, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.07187499850988388, "rewards/format_reward_func/std": 0.07822373509407043, "sampling/importance_sampling_ratio/max": 2.0191760063171387, "sampling/importance_sampling_ratio/mean": 0.4658208191394806, "sampling/importance_sampling_ratio/min": 0.0002662605547811836, "sampling/sampling_logp_difference/max": 0.5244350433349609, "sampling/sampling_logp_difference/mean": 0.029541797935962677, "step": 4, "step_time": 37.01921246600341 }, { "clip_ratio/high_max": 0.001982621837669285, "clip_ratio/high_mean": 0.001982621837669285, "clip_ratio/low_mean": 0.0005589509892161004, "clip_ratio/low_min": 0.0005589509892161004, "clip_ratio/region_mean": 0.00254157281233347, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 1104.0625, "completions/mean_terminated_length": 691.2999877929688, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7007607892155647, "epoch": 0.005336179295624333, "frac_reward_zero_std": 0.3125, "grad_norm": 0.013396462425589561, "kl": 0.0013774944964097813, "learning_rate": 4e-05, "loss": 0.0347, "num_tokens": 457087.0, "reward": -0.40234375, "reward_std": 0.0917029082775116, "rewards/argmax_reward_func/mean": 0.03125, "rewards/argmax_reward_func/std": 0.1767766922712326, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.06640625, "rewards/format_reward_func/std": 0.07394673675298691, "sampling/importance_sampling_ratio/max": 1.255894660949707, "sampling/importance_sampling_ratio/mean": 0.3248797655105591, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4532938003540039, "sampling/sampling_logp_difference/mean": 0.03193550929427147, "step": 5, "step_time": 35.72711361600159 }, { "clip_ratio/high_max": 0.0027264087439107243, "clip_ratio/high_mean": 0.0027264087439107243, "clip_ratio/low_mean": 0.0005786664059996838, "clip_ratio/low_min": 0.0005786664059996838, "clip_ratio/region_mean": 0.003305075162643334, "completions/clipped_ratio": 0.53125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 1280.34375, "completions/mean_terminated_length": 700.4666748046875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9208407774567604, "epoch": 0.0064034151547491995, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01853673905134201, "kl": 0.0014137454854790121, "learning_rate": 5e-05, "loss": -0.0195, "num_tokens": 555134.0, "reward": 0.6429054737091064, "reward_std": 0.1690426915884018, "rewards/argmax_reward_func/mean": 0.09375, "rewards/argmax_reward_func/std": 0.2961445748806, "rewards/criterion_gradient_reward_func/mean": 0.4405616819858551, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.10859374701976776, "rewards/format_reward_func/std": 0.07421888411045074, "sampling/importance_sampling_ratio/max": 2.250537157058716, "sampling/importance_sampling_ratio/mean": 0.31877005100250244, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5014762282371521, "sampling/sampling_logp_difference/mean": 0.03382378816604614, "step": 6, "step_time": 40.17239521900319 }, { "clip_ratio/high_max": 0.0017705363607092295, "clip_ratio/high_mean": 0.0017705363607092295, "clip_ratio/low_mean": 0.0010393604479759233, "clip_ratio/low_min": 0.0010393604479759233, "clip_ratio/region_mean": 0.0028098968250560574, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 1081.875, "completions/mean_terminated_length": 759.0909423828125, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 2.254458498209715, "epoch": 0.007470651013874066, "frac_reward_zero_std": 0.3125, "grad_norm": 0.03492113947868347, "kl": 0.0013793954567518085, "learning_rate": 4.999996468241058e-05, "loss": 0.087, "num_tokens": 642038.0, "reward": -0.3460937738418579, "reward_std": 0.14031648635864258, "rewards/argmax_reward_func/mean": 0.0625, "rewards/argmax_reward_func/std": 0.24593468010425568, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.09140625596046448, "rewards/format_reward_func/std": 0.07529763132333755, "sampling/importance_sampling_ratio/max": 2.1214609146118164, "sampling/importance_sampling_ratio/mean": 0.49928179383277893, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5151300430297852, "sampling/sampling_logp_difference/mean": 0.03002871759235859, "step": 7, "step_time": 32.39049951399829 }, { "clip_ratio/high_max": 0.0021052010415587574, "clip_ratio/high_mean": 0.0021052010415587574, "clip_ratio/low_mean": 0.0005641000043397071, "clip_ratio/low_min": 0.0005641000043397071, "clip_ratio/region_mean": 0.002669301047717454, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 998.125, "completions/mean_terminated_length": 687.478271484375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 2.2699602395296097, "epoch": 0.008537886872998933, "frac_reward_zero_std": 0.4375, "grad_norm": 0.021441733464598656, "kl": 0.001626853769266745, "learning_rate": 4.999985872974212e-05, "loss": 0.0131, "num_tokens": 730308.0, "reward": 0.49276474118232727, "reward_std": 0.17014756798744202, "rewards/argmax_reward_func/mean": 0.09375, "rewards/argmax_reward_func/std": 0.2961445748806, "rewards/criterion_gradient_reward_func/mean": 0.30526474118232727, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.08032193779945374, "sampling/importance_sampling_ratio/max": 1.661794662475586, "sampling/importance_sampling_ratio/mean": 0.42609864473342896, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4934859275817871, "sampling/sampling_logp_difference/mean": 0.029916543513536453, "step": 8, "step_time": 39.28847099400082 }, { "clip_ratio/high_max": 0.0022312107394100167, "clip_ratio/high_mean": 0.0022312107394100167, "clip_ratio/low_mean": 0.0006018511448928621, "clip_ratio/low_min": 0.0006018511448928621, "clip_ratio/region_mean": 0.0028330618843028788, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 1077.6875, "completions/mean_terminated_length": 649.1000366210938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.0604200921952724, "epoch": 0.0096051227321238, "frac_reward_zero_std": 0.5, "grad_norm": 0.01968945562839508, "kl": 0.0014859367365716025, "learning_rate": 4.999968214229397e-05, "loss": 0.0419, "num_tokens": 815624.0, "reward": -0.3492187559604645, "reward_std": 0.1270582377910614, "rewards/argmax_reward_func/mean": 0.0625, "rewards/argmax_reward_func/std": 0.24593468010425568, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.08828125894069672, "rewards/format_reward_func/std": 0.08156345039606094, "sampling/importance_sampling_ratio/max": 2.2144131660461426, "sampling/importance_sampling_ratio/mean": 0.42925313115119934, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8429648876190186, "sampling/sampling_logp_difference/mean": 0.02670242078602314, "step": 9, "step_time": 30.39153382600125 }, { "clip_ratio/high_max": 0.0025172128371195868, "clip_ratio/high_mean": 0.0025172128371195868, "clip_ratio/low_mean": 0.0008683440737513592, "clip_ratio/low_min": 0.0008683440737513592, "clip_ratio/region_mean": 0.0033855569272418506, "completions/clipped_ratio": 0.5, "completions/max_length": 1792.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 1350.59375, "completions/mean_terminated_length": 909.1875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.259644441306591, "epoch": 0.010672358591248666, "frac_reward_zero_std": 0.1875, "grad_norm": 0.013437962159514427, "kl": 0.0017150602907349821, "learning_rate": 4.999943492056506e-05, "loss": -0.0133, "num_tokens": 916771.0, "reward": 0.6810784339904785, "reward_std": 0.13479222357273102, "rewards/argmax_reward_func/mean": 0.0625, "rewards/argmax_reward_func/std": 0.24593468010425568, "rewards/criterion_gradient_reward_func/mean": 0.4982658922672272, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.12031250447034836, "rewards/format_reward_func/std": 0.06395483762025833, "sampling/importance_sampling_ratio/max": 1.0113781690597534, "sampling/importance_sampling_ratio/mean": 0.20657379925251007, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49831390380859375, "sampling/sampling_logp_difference/mean": 0.03026142716407776, "step": 10, "step_time": 35.942895204000706 }, { "clip_ratio/high_max": 0.003568034779164009, "clip_ratio/high_mean": 0.003568034779164009, "clip_ratio/low_mean": 0.0008167647538357414, "clip_ratio/low_min": 0.0008167647538357414, "clip_ratio/region_mean": 0.004384799511171877, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 960.40625, "completions/mean_terminated_length": 582.4091186523438, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.4711361043155193, "epoch": 0.011739594450373533, "frac_reward_zero_std": 0.3125, "grad_norm": 0.03307599946856499, "kl": 0.0019544149326975457, "learning_rate": 4.9999117065253894e-05, "loss": 0.0455, "num_tokens": 995608.0, "reward": -0.35468751192092896, "reward_std": 0.1303728073835373, "rewards/argmax_reward_func/mean": 0.0625, "rewards/argmax_reward_func/std": 0.24593468010425568, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.08281250298023224, "rewards/format_reward_func/std": 0.08192185312509537, "sampling/importance_sampling_ratio/max": 2.5275375843048096, "sampling/importance_sampling_ratio/mean": 0.5409923791885376, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4497084617614746, "sampling/sampling_logp_difference/mean": 0.03116687200963497, "step": 11, "step_time": 28.291412371999286 }, { "clip_ratio/high_max": 0.004762829346873332, "clip_ratio/high_mean": 0.004762829346873332, "clip_ratio/low_mean": 0.0013049297722318443, "clip_ratio/low_min": 0.0013049297722318443, "clip_ratio/region_mean": 0.006067759182769805, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 1001.3125, "completions/mean_terminated_length": 691.9130859375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 2.312736466526985, "epoch": 0.012806830309498399, "frac_reward_zero_std": 0.125, "grad_norm": 0.05455867573618889, "kl": 0.0021711735462304205, "learning_rate": 4.999872857725855e-05, "loss": -0.1288, "num_tokens": 1089376.0, "reward": 0.5853663086891174, "reward_std": 0.19997863471508026, "rewards/argmax_reward_func/mean": 0.09375, "rewards/argmax_reward_func/std": 0.2961445748806, "rewards/criterion_gradient_reward_func/mean": 0.3830225169658661, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.10859375447034836, "rewards/format_reward_func/std": 0.0685710683465004, "sampling/importance_sampling_ratio/max": 2.922780752182007, "sampling/importance_sampling_ratio/mean": 0.5339040756225586, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49237632751464844, "sampling/sampling_logp_difference/mean": 0.030930526554584503, "step": 12, "step_time": 34.63075887499872 }, { "clip_ratio/high_max": 0.0024765535126789473, "clip_ratio/high_mean": 0.0024765535126789473, "clip_ratio/low_mean": 0.0006066444493626477, "clip_ratio/low_min": 0.0006066444493626477, "clip_ratio/region_mean": 0.0030831979547656374, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 1003.59375, "completions/mean_terminated_length": 590.6190795898438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 1.879448313266039, "epoch": 0.013874066168623266, "frac_reward_zero_std": 0.3125, "grad_norm": 0.030804065987467766, "kl": 0.0020481923493207432, "learning_rate": 4.999826945767665e-05, "loss": 0.0767, "num_tokens": 1168687.0, "reward": -0.30546873807907104, "reward_std": 0.19113978743553162, "rewards/argmax_reward_func/mean": 0.09375, "rewards/argmax_reward_func/std": 0.2961445748806, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.10078124701976776, "rewards/format_reward_func/std": 0.07472648471593857, "sampling/importance_sampling_ratio/max": 2.568220376968384, "sampling/importance_sampling_ratio/mean": 0.4602521061897278, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4935779571533203, "sampling/sampling_logp_difference/mean": 0.027920059859752655, "step": 13, "step_time": 29.9490850580014 }, { "clip_ratio/high_max": 0.003377331820956897, "clip_ratio/high_mean": 0.003377331820956897, "clip_ratio/low_mean": 0.0005659566468239063, "clip_ratio/low_min": 0.0005659566468239063, "clip_ratio/region_mean": 0.003943288480513729, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 1077.125, "completions/mean_terminated_length": 648.2000122070312, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.742768321186304, "epoch": 0.014941302027748132, "frac_reward_zero_std": 0.1875, "grad_norm": 0.031979113817214966, "kl": 0.002178213473598589, "learning_rate": 4.99977397078054e-05, "loss": -0.0266, "num_tokens": 1253461.0, "reward": 0.45073968172073364, "reward_std": 0.10164659470319748, "rewards/argmax_reward_func/mean": 0.03125, "rewards/argmax_reward_func/std": 0.1767766922712326, "rewards/criterion_gradient_reward_func/mean": 0.32417717576026917, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.09531250596046448, "rewards/format_reward_func/std": 0.07111936062574387, "sampling/importance_sampling_ratio/max": 2.4795937538146973, "sampling/importance_sampling_ratio/mean": 0.4869721531867981, "sampling/importance_sampling_ratio/min": 0.004543759394437075, "sampling/sampling_logp_difference/max": 0.41729068756103516, "sampling/sampling_logp_difference/mean": 0.025957368314266205, "step": 14, "step_time": 29.575109181001608 }, { "clip_ratio/high_max": 0.0038930996670387685, "clip_ratio/high_mean": 0.0038930996670387685, "clip_ratio/low_mean": 0.0009401201969012618, "clip_ratio/low_min": 0.0009401201969012618, "clip_ratio/region_mean": 0.004833219853026094, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 1009.4375, "completions/mean_terminated_length": 539.9000244140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.1193800568580627, "epoch": 0.016008537886873, "frac_reward_zero_std": 0.125, "grad_norm": 0.03657897189259529, "kl": 0.002269088727189228, "learning_rate": 4.999713932914157e-05, "loss": -0.0128, "num_tokens": 1341501.0, "reward": 0.4682930111885071, "reward_std": 0.19224464893341064, "rewards/argmax_reward_func/mean": 0.09375, "rewards/argmax_reward_func/std": 0.2961445748806, "rewards/criterion_gradient_reward_func/mean": 0.2792305052280426, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.09531250596046448, "rewards/format_reward_func/std": 0.08192184567451477, "sampling/importance_sampling_ratio/max": 1.8825421333312988, "sampling/importance_sampling_ratio/mean": 0.44870758056640625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.45983433723449707, "sampling/sampling_logp_difference/mean": 0.02783556468784809, "step": 15, "step_time": 34.561196515996926 }, { "clip_ratio/high_max": 0.004106300068087876, "clip_ratio/high_mean": 0.004106300068087876, "clip_ratio/low_mean": 0.0010246572473988635, "clip_ratio/low_min": 0.0010246572473988635, "clip_ratio/region_mean": 0.005130957295477856, "completions/clipped_ratio": 0.53125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 1233.46875, "completions/mean_terminated_length": 600.4666748046875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 2.302693136036396, "epoch": 0.017075773745997867, "frac_reward_zero_std": 0.1875, "grad_norm": 0.05325016751885414, "kl": 0.0028706716548185796, "learning_rate": 4.9996468323381466e-05, "loss": -0.1163, "num_tokens": 1434686.0, "reward": -0.31953126192092896, "reward_std": 0.14694562554359436, "rewards/argmax_reward_func/mean": 0.0625, "rewards/argmax_reward_func/std": 0.24593468010425568, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.11796875298023224, "rewards/format_reward_func/std": 0.07302075624465942, "sampling/importance_sampling_ratio/max": 2.909137725830078, "sampling/importance_sampling_ratio/mean": 0.6558585166931152, "sampling/importance_sampling_ratio/min": 0.0005810415023006499, "sampling/sampling_logp_difference/max": 0.45676755905151367, "sampling/sampling_logp_difference/mean": 0.03154245764017105, "step": 16, "step_time": 34.40235109999867 }, { "clip_ratio/high_max": 0.00467353050771635, "clip_ratio/high_mean": 0.00467353050771635, "clip_ratio/low_mean": 0.001069291916792281, "clip_ratio/low_min": 0.001069291916792281, "clip_ratio/region_mean": 0.005742822377214907, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 982.125, "completions/mean_terminated_length": 557.90478515625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 1.9572940804064274, "epoch": 0.01814300960512273, "frac_reward_zero_std": 0.125, "grad_norm": 0.04296024888753891, "kl": 0.0030270935167209245, "learning_rate": 4.9995726692420944e-05, "loss": 0.0785, "num_tokens": 1510990.0, "reward": 0.7439144849777222, "reward_std": 0.15246987342834473, "rewards/argmax_reward_func/mean": 0.125, "rewards/argmax_reward_func/std": 0.33601075410842896, "rewards/criterion_gradient_reward_func/mean": 0.514227032661438, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.10468750447034836, "rewards/format_reward_func/std": 0.07788471132516861, "sampling/importance_sampling_ratio/max": 2.113346815109253, "sampling/importance_sampling_ratio/mean": 0.40686556696891785, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7506132125854492, "sampling/sampling_logp_difference/mean": 0.029495857656002045, "step": 17, "step_time": 29.230848136997338 }, { "clip_ratio/high_max": 0.0020975891493435483, "clip_ratio/high_mean": 0.0020975891493435483, "clip_ratio/low_mean": 0.0008793990655249218, "clip_ratio/low_min": 0.0008793990655249218, "clip_ratio/region_mean": 0.002976988227601396, "completions/clipped_ratio": 0.53125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 1192.84375, "completions/mean_terminated_length": 513.800048828125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.3507497645914555, "epoch": 0.0192102454642476, "frac_reward_zero_std": 0.25, "grad_norm": 0.025775041431188583, "kl": 0.0028757976469933055, "learning_rate": 4.999491443835542e-05, "loss": -0.1009, "num_tokens": 1604155.0, "reward": 0.47347962856292725, "reward_std": 0.14584076404571533, "rewards/argmax_reward_func/mean": 0.0625, "rewards/argmax_reward_func/std": 0.24593468010425568, "rewards/criterion_gradient_reward_func/mean": 0.3203546404838562, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.09062500298023224, "rewards/format_reward_func/std": 0.07452809065580368, "sampling/importance_sampling_ratio/max": 2.5378119945526123, "sampling/importance_sampling_ratio/mean": 0.4265184998512268, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6568746566772461, "sampling/sampling_logp_difference/mean": 0.02859003283083439, "step": 18, "step_time": 31.89225270499992 }, { "clip_ratio/high_max": 0.003789503280131612, "clip_ratio/high_mean": 0.003789503280131612, "clip_ratio/low_mean": 0.0007893467009125743, "clip_ratio/low_min": 0.0007893467009125743, "clip_ratio/region_mean": 0.004578849981044186, "completions/clipped_ratio": 0.21875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 815.96875, "completions/mean_terminated_length": 542.6799926757812, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 1.8807277753949165, "epoch": 0.020277481323372464, "frac_reward_zero_std": 0.25, "grad_norm": 0.05873904377222061, "kl": 0.003696119958476629, "learning_rate": 4.999403156347986e-05, "loss": 0.108, "num_tokens": 1684128.0, "reward": -0.3359375, "reward_std": 0.13921165466308594, "rewards/argmax_reward_func/mean": 0.0625, "rewards/argmax_reward_func/std": 0.24593468010425568, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1015625, "rewards/format_reward_func/std": 0.07644771784543991, "sampling/importance_sampling_ratio/max": 2.2479114532470703, "sampling/importance_sampling_ratio/mean": 0.6651320457458496, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.40585219860076904, "sampling/sampling_logp_difference/mean": 0.029087107628583908, "step": 19, "step_time": 36.100535707000745 }, { "clip_ratio/high_max": 0.003283175792603288, "clip_ratio/high_mean": 0.003283175792603288, "clip_ratio/low_mean": 0.0014314119289338123, "clip_ratio/low_min": 0.0014314119289338123, "clip_ratio/region_mean": 0.004714587725175079, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 1126.78125, "completions/mean_terminated_length": 778.3333740234375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 1.86278922483325, "epoch": 0.021344717182497332, "frac_reward_zero_std": 0.1875, "grad_norm": 0.03029097430408001, "kl": 0.0038251571895671077, "learning_rate": 4.999307807028871e-05, "loss": -0.0805, "num_tokens": 1777287.0, "reward": -0.32734373211860657, "reward_std": 0.15357476472854614, "rewards/argmax_reward_func/mean": 0.0625, "rewards/argmax_reward_func/std": 0.24593468010425568, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.11015625298023224, "rewards/format_reward_func/std": 0.06980946660041809, "sampling/importance_sampling_ratio/max": 2.3334615230560303, "sampling/importance_sampling_ratio/mean": 0.4293149709701538, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8119401931762695, "sampling/sampling_logp_difference/mean": 0.028486905619502068, "step": 20, "step_time": 36.58308943400243 }, { "clip_ratio/high_max": 0.0025813026513787918, "clip_ratio/high_mean": 0.0025813026513787918, "clip_ratio/low_mean": 0.0010192703775828704, "clip_ratio/low_min": 0.0010192703775828704, "clip_ratio/region_mean": 0.003600573032599641, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 981.6875, "completions/mean_terminated_length": 557.2380981445312, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 2.4475060552358627, "epoch": 0.022411953041622197, "frac_reward_zero_std": 0.25, "grad_norm": 0.04966782033443451, "kl": 0.00499495500116609, "learning_rate": 4.999205396147601e-05, "loss": -0.0185, "num_tokens": 1856689.0, "reward": 0.47946423292160034, "reward_std": 0.09722718596458435, "rewards/argmax_reward_func/mean": 0.03125, "rewards/argmax_reward_func/std": 0.1767766922712326, "rewards/criterion_gradient_reward_func/mean": 0.33102670311927795, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1171875, "rewards/format_reward_func/std": 0.06733328104019165, "sampling/importance_sampling_ratio/max": 2.8226981163024902, "sampling/importance_sampling_ratio/mean": 0.5518832802772522, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9661874771118164, "sampling/sampling_logp_difference/mean": 0.030854636803269386, "step": 21, "step_time": 35.660547125998164 }, { "clip_ratio/high_max": 0.0025256357548641972, "clip_ratio/high_mean": 0.0025256357548641972, "clip_ratio/low_mean": 0.001954864761501085, "clip_ratio/low_min": 0.001954864761501085, "clip_ratio/region_mean": 0.004480500479985494, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 1045.5, "completions/mean_terminated_length": 654.4761962890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.7636147774755955, "epoch": 0.023479188900747065, "frac_reward_zero_std": 0.1875, "grad_norm": 0.05411454290151596, "kl": 0.004100498714251444, "learning_rate": 4.9990959239935266e-05, "loss": 0.0264, "num_tokens": 1944121.0, "reward": 0.5323084592819214, "reward_std": 0.091702900826931, "rewards/argmax_reward_func/mean": 0.03125, "rewards/argmax_reward_func/std": 0.1767766922712326, "rewards/criterion_gradient_reward_func/mean": 0.37683969736099243, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.12421875447034836, "rewards/format_reward_func/std": 0.06853430718183517, "sampling/importance_sampling_ratio/max": 2.4316279888153076, "sampling/importance_sampling_ratio/mean": 0.551908016204834, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7746484279632568, "sampling/sampling_logp_difference/mean": 0.028002718463540077, "step": 22, "step_time": 34.37947198100028 }, { "clip_ratio/high_max": 0.003651722945505753, "clip_ratio/high_mean": 0.003651722945505753, "clip_ratio/low_mean": 0.0014183384828356793, "clip_ratio/low_min": 0.0014183384828356793, "clip_ratio/region_mean": 0.005070061422884464, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 1045.5, "completions/mean_terminated_length": 597.6000366210938, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8049771822988987, "epoch": 0.02454642475987193, "frac_reward_zero_std": 0.125, "grad_norm": 0.047026388347148895, "kl": 0.006468135034083389, "learning_rate": 4.9989793908759506e-05, "loss": 0.0179, "num_tokens": 2031793.0, "reward": -0.15390625596046448, "reward_std": 0.3745456337928772, "rewards/argmax_reward_func/mean": 0.21875, "rewards/argmax_reward_func/std": 0.420013427734375, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.12734374403953552, "rewards/format_reward_func/std": 0.07549817860126495, "sampling/importance_sampling_ratio/max": 2.804290771484375, "sampling/importance_sampling_ratio/mean": 0.5595538020133972, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6815462112426758, "sampling/sampling_logp_difference/mean": 0.02699916996061802, "step": 23, "step_time": 35.494740354001806 }, { "clip_ratio/high_max": 0.0028878302182420157, "clip_ratio/high_mean": 0.0028878302182420157, "clip_ratio/low_mean": 0.000996383503661491, "clip_ratio/low_min": 0.000996383503661491, "clip_ratio/region_mean": 0.0038842136891616974, "completions/clipped_ratio": 0.21875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 895.71875, "completions/mean_terminated_length": 644.760009765625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 1.8306932039558887, "epoch": 0.025613660618996798, "frac_reward_zero_std": 0.1875, "grad_norm": 0.033485084772109985, "kl": 0.00647735976963304, "learning_rate": 4.998855797124129e-05, "loss": -0.0009, "num_tokens": 2108566.0, "reward": 0.5588388442993164, "reward_std": 0.16351842880249023, "rewards/argmax_reward_func/mean": 0.09375, "rewards/argmax_reward_func/std": 0.2961445748806, "rewards/criterion_gradient_reward_func/mean": 0.30727633833885193, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15781250596046448, "rewards/format_reward_func/std": 0.035603947937488556, "sampling/importance_sampling_ratio/max": 2.063812017440796, "sampling/importance_sampling_ratio/mean": 0.44901227951049805, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4627084732055664, "sampling/sampling_logp_difference/mean": 0.028912337496876717, "step": 24, "step_time": 37.27880799400009 }, { "clip_ratio/high_max": 0.004740225966088474, "clip_ratio/high_mean": 0.004740225966088474, "clip_ratio/low_mean": 0.0009464894046686823, "clip_ratio/low_min": 0.0009464894046686823, "clip_ratio/region_mean": 0.005686715370757156, "completions/clipped_ratio": 0.4375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 1124.875, "completions/mean_terminated_length": 606.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 1.4047931171953678, "epoch": 0.026680896478121666, "frac_reward_zero_std": 0.3125, "grad_norm": 0.028126059100031853, "kl": 0.00599390956631396, "learning_rate": 4.9987251430872624e-05, "loss": 0.1513, "num_tokens": 2202762.0, "reward": 0.6576347351074219, "reward_std": 0.22097085416316986, "rewards/argmax_reward_func/mean": 0.125, "rewards/argmax_reward_func/std": 0.33601075410842896, "rewards/criterion_gradient_reward_func/mean": 0.37950971722602844, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15312500298023224, "rewards/format_reward_func/std": 0.06015772372484207, "sampling/importance_sampling_ratio/max": 1.913062572479248, "sampling/importance_sampling_ratio/mean": 0.4132564067840576, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6431503295898438, "sampling/sampling_logp_difference/mean": 0.025824351236224174, "step": 25, "step_time": 36.73987110499911 }, { "clip_ratio/high_max": 0.003679251669382211, "clip_ratio/high_mean": 0.003679251669382211, "clip_ratio/low_mean": 0.0014510033015540102, "clip_ratio/low_min": 0.0014510033015540102, "clip_ratio/region_mean": 0.005130254969117232, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 957.59375, "completions/mean_terminated_length": 578.3181762695312, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 1.503768552094698, "epoch": 0.02774813233724653, "frac_reward_zero_std": 0.1875, "grad_norm": 0.04647405073046684, "kl": 0.006211655476363376, "learning_rate": 4.998587429134501e-05, "loss": -0.1798, "num_tokens": 2285347.0, "reward": -0.22031250596046448, "reward_std": 0.2121320366859436, "rewards/argmax_reward_func/mean": 0.125, "rewards/argmax_reward_func/std": 0.33601075410842896, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15468750894069672, "rewards/format_reward_func/std": 0.04373559728264809, "sampling/importance_sampling_ratio/max": 1.7345614433288574, "sampling/importance_sampling_ratio/mean": 0.5314441919326782, "sampling/importance_sampling_ratio/min": 0.0004740021249745041, "sampling/sampling_logp_difference/max": 0.46543383598327637, "sampling/sampling_logp_difference/mean": 0.024397343397140503, "step": 26, "step_time": 35.39277333899827 }, { "clip_ratio/high_max": 0.003787987086980138, "clip_ratio/high_mean": 0.003787987086980138, "clip_ratio/low_mean": 0.0012827797581849154, "clip_ratio/low_min": 0.0012827797581849154, "clip_ratio/region_mean": 0.00507076688518282, "completions/clipped_ratio": 0.21875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 903.84375, "completions/mean_terminated_length": 655.1599731445312, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 1.2631471566855907, "epoch": 0.0288153681963714, "frac_reward_zero_std": 0.25, "grad_norm": 0.04185902327299118, "kl": 0.007992795668542385, "learning_rate": 4.9984426556549456e-05, "loss": 0.0672, "num_tokens": 2366862.0, "reward": 0.687231183052063, "reward_std": 0.2607456147670746, "rewards/argmax_reward_func/mean": 0.15625, "rewards/argmax_reward_func/std": 0.3689020276069641, "rewards/criterion_gradient_reward_func/mean": 0.37316861748695374, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15781250596046448, "rewards/format_reward_func/std": 0.053293731063604355, "sampling/importance_sampling_ratio/max": 2.6994168758392334, "sampling/importance_sampling_ratio/mean": 0.4857272207736969, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7538604736328125, "sampling/sampling_logp_difference/mean": 0.024957852438092232, "step": 27, "step_time": 32.011972304001574 }, { "clip_ratio/high_max": 0.00337369517001207, "clip_ratio/high_mean": 0.00337369517001207, "clip_ratio/low_mean": 0.0011972706561209634, "clip_ratio/low_min": 0.0011972706561209634, "clip_ratio/region_mean": 0.0045709658406849485, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 1077.25, "completions/mean_terminated_length": 702.857177734375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 1.649448435753584, "epoch": 0.029882604055496264, "frac_reward_zero_std": 0.25, "grad_norm": 0.02598665840923786, "kl": 0.009506171481916681, "learning_rate": 4.998290823057639e-05, "loss": 0.0498, "num_tokens": 2453340.0, "reward": 0.7511483430862427, "reward_std": 0.3104640543460846, "rewards/argmax_reward_func/mean": 0.25, "rewards/argmax_reward_func/std": 0.4399413466453552, "rewards/criterion_gradient_reward_func/mean": 0.3628670573234558, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.13828125596046448, "rewards/format_reward_func/std": 0.058193355798721313, "sampling/importance_sampling_ratio/max": 1.9904030561447144, "sampling/importance_sampling_ratio/mean": 0.43204760551452637, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0553960800170898, "sampling/sampling_logp_difference/mean": 0.02800804190337658, "step": 28, "step_time": 32.3059688759995 }, { "clip_ratio/high_max": 0.004031803779071197, "clip_ratio/high_mean": 0.004031803779071197, "clip_ratio/low_mean": 0.0007302826106752036, "clip_ratio/low_min": 0.0007302826106752036, "clip_ratio/region_mean": 0.0047620863715565065, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 995.25, "completions/mean_terminated_length": 683.478271484375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 1.5926814749836922, "epoch": 0.030949839914621132, "frac_reward_zero_std": 0.3125, "grad_norm": 0.0290432907640934, "kl": 0.011325412837322801, "learning_rate": 4.998131931771568e-05, "loss": -0.0395, "num_tokens": 2536102.0, "reward": -0.21953123807907104, "reward_std": 0.21323686838150024, "rewards/argmax_reward_func/mean": 0.125, "rewards/argmax_reward_func/std": 0.33601075410842896, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15546876192092896, "rewards/format_reward_func/std": 0.044781897217035294, "sampling/importance_sampling_ratio/max": 2.1380748748779297, "sampling/importance_sampling_ratio/mean": 0.3548307418823242, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8712320327758789, "sampling/sampling_logp_difference/mean": 0.02774733491241932, "step": 29, "step_time": 33.201856801998474 }, { "clip_ratio/high_max": 0.0030585488420911133, "clip_ratio/high_mean": 0.0030585488420911133, "clip_ratio/low_mean": 0.0013361104829527903, "clip_ratio/low_min": 0.0013361104829527903, "clip_ratio/region_mean": 0.004394659292302094, "completions/clipped_ratio": 0.21875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 892.71875, "completions/mean_terminated_length": 640.9199829101562, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 1.5191095881164074, "epoch": 0.032017075773746, "frac_reward_zero_std": 0.375, "grad_norm": 0.04557953402400017, "kl": 0.01082566031254828, "learning_rate": 4.997965982245668e-05, "loss": 0.0148, "num_tokens": 2622951.0, "reward": 0.8228504657745361, "reward_std": 0.2817378342151642, "rewards/argmax_reward_func/mean": 0.25, "rewards/argmax_reward_func/std": 0.4399413466453552, "rewards/criterion_gradient_reward_func/mean": 0.39550670981407166, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.17734375596046448, "rewards/format_reward_func/std": 0.029347317293286324, "sampling/importance_sampling_ratio/max": 2.7765984535217285, "sampling/importance_sampling_ratio/mean": 0.5699549913406372, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7797361612319946, "sampling/sampling_logp_difference/mean": 0.02788810431957245, "step": 30, "step_time": 33.957612550000704 }, { "clip_ratio/high_max": 0.005197916907491162, "clip_ratio/high_mean": 0.005197916907491162, "clip_ratio/low_mean": 0.0020905749042867683, "clip_ratio/low_min": 0.0020905749042867683, "clip_ratio/region_mean": 0.007288491811777931, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 786.15625, "completions/mean_terminated_length": 554.0384521484375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4714628048241138, "epoch": 0.033084311632870865, "frac_reward_zero_std": 0.0625, "grad_norm": 0.053506068885326385, "kl": 0.01468541519716382, "learning_rate": 4.9977929749488126e-05, "loss": -0.0053, "num_tokens": 2695232.0, "reward": 0.9790493249893188, "reward_std": 0.40658634901046753, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.5274868607521057, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.13906249403953552, "rewards/format_reward_func/std": 0.07321463525295258, "sampling/importance_sampling_ratio/max": 2.768357992172241, "sampling/importance_sampling_ratio/mean": 0.5988168120384216, "sampling/importance_sampling_ratio/min": 0.004664627369493246, "sampling/sampling_logp_difference/max": 1.422224998474121, "sampling/sampling_logp_difference/mean": 0.02673635631799698, "step": 31, "step_time": 34.82516627199766 }, { "clip_ratio/high_max": 0.004163035380770452, "clip_ratio/high_mean": 0.004163035380770452, "clip_ratio/low_mean": 0.0010212299966951832, "clip_ratio/low_min": 0.0010212299966951832, "clip_ratio/region_mean": 0.005184265406569466, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 791.0, "completions/mean_terminated_length": 560.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 1.46314487606287, "epoch": 0.03415154749199573, "frac_reward_zero_std": 0.4375, "grad_norm": 0.04645155370235443, "kl": 0.017245801427634433, "learning_rate": 4.9976129103698175e-05, "loss": -0.0398, "num_tokens": 2772582.0, "reward": 0.8458472490310669, "reward_std": 0.2872621417045593, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.36615973711013794, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.16718751192092896, "rewards/format_reward_func/std": 0.05097338557243347, "sampling/importance_sampling_ratio/max": 2.712984800338745, "sampling/importance_sampling_ratio/mean": 0.4901038408279419, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1100280284881592, "sampling/sampling_logp_difference/mean": 0.02801729366183281, "step": 32, "step_time": 33.69202872999995 }, { "clip_ratio/high_max": 0.005036719794588862, "clip_ratio/high_mean": 0.005036719794588862, "clip_ratio/low_mean": 0.002195822944486281, "clip_ratio/low_min": 0.002195822944486281, "clip_ratio/region_mean": 0.007232542775454931, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 950.8125, "completions/mean_terminated_length": 568.45458984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3015332594513893, "epoch": 0.035218783351120594, "frac_reward_zero_std": 0.0625, "grad_norm": 0.04944956675171852, "kl": 0.013906023581512272, "learning_rate": 4.9974257890174405e-05, "loss": -0.0392, "num_tokens": 2859210.0, "reward": 0.6490883827209473, "reward_std": 0.2264951467514038, "rewards/argmax_reward_func/mean": 0.125, "rewards/argmax_reward_func/std": 0.33601075410842896, "rewards/criterion_gradient_reward_func/mean": 0.3686196506023407, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15546876192092896, "rewards/format_reward_func/std": 0.06081913039088249, "sampling/importance_sampling_ratio/max": 2.105214834213257, "sampling/importance_sampling_ratio/mean": 0.6134990453720093, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5164413452148438, "sampling/sampling_logp_difference/mean": 0.026122061535716057, "step": 33, "step_time": 33.97057230699829 }, { "clip_ratio/high_max": 0.0022459906904259697, "clip_ratio/high_mean": 0.0022459906904259697, "clip_ratio/low_mean": 0.0021616665108012967, "clip_ratio/low_min": 0.0021616665108012967, "clip_ratio/region_mean": 0.004407657186675351, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 958.21875, "completions/mean_terminated_length": 579.227294921875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 1.603148490190506, "epoch": 0.03628601921024546, "frac_reward_zero_std": 0.3125, "grad_norm": 0.024482250213623047, "kl": 0.01695849865791388, "learning_rate": 4.997231611420373e-05, "loss": -0.0355, "num_tokens": 2940765.0, "reward": 0.6579264402389526, "reward_std": 0.20660775899887085, "rewards/argmax_reward_func/mean": 0.1875, "rewards/argmax_reward_func/std": 0.3965577781200409, "rewards/criterion_gradient_reward_func/mean": 0.3118326961994171, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.15859374403953552, "rewards/format_reward_func/std": 0.060777679085731506, "sampling/importance_sampling_ratio/max": 2.7685742378234863, "sampling/importance_sampling_ratio/mean": 0.47060394287109375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.652613639831543, "sampling/sampling_logp_difference/mean": 0.02811342477798462, "step": 34, "step_time": 34.58442073900278 }, { "clip_ratio/high_max": 0.00521002832101658, "clip_ratio/high_mean": 0.00521002832101658, "clip_ratio/low_mean": 0.002211495768278837, "clip_ratio/low_min": 0.002211495768278837, "clip_ratio/region_mean": 0.007421524103847332, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 720.96875, "completions/mean_terminated_length": 522.629638671875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 1.2821640893816948, "epoch": 0.03735325506937033, "frac_reward_zero_std": 0.1875, "grad_norm": 0.06695578247308731, "kl": 0.022073757427278906, "learning_rate": 4.9970303781272474e-05, "loss": -0.0752, "num_tokens": 3009594.0, "reward": 1.0669527053833008, "reward_std": 0.5137572288513184, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.4833589792251587, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.17734375596046448, "rewards/format_reward_func/std": 0.04132843390107155, "sampling/importance_sampling_ratio/max": 2.407660722732544, "sampling/importance_sampling_ratio/mean": 0.7148382067680359, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2477335929870605, "sampling/sampling_logp_difference/mean": 0.02798496000468731, "step": 35, "step_time": 32.45398926500093 }, { "clip_ratio/high_max": 0.00430632263305597, "clip_ratio/high_mean": 0.00430632263305597, "clip_ratio/low_mean": 0.003034734349057544, "clip_ratio/low_min": 0.003034734349057544, "clip_ratio/region_mean": 0.007341057040321175, "completions/clipped_ratio": 0.09375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 739.5, "completions/mean_terminated_length": 630.6206665039062, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 1.3567700609564781, "epoch": 0.0384204909284952, "frac_reward_zero_std": 0.125, "grad_norm": 0.06337057054042816, "kl": 0.028839207370765507, "learning_rate": 4.9968220897066284e-05, "loss": -0.0647, "num_tokens": 3092172.0, "reward": 0.33906248211860657, "reward_std": 0.3778601586818695, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": -0.15000000596046448, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.17656250298023224, "rewards/format_reward_func/std": 0.03699253499507904, "sampling/importance_sampling_ratio/max": 2.410306453704834, "sampling/importance_sampling_ratio/mean": 0.6089555025100708, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7858703136444092, "sampling/sampling_logp_difference/mean": 0.029972005635499954, "step": 36, "step_time": 36.003745381000954 }, { "clip_ratio/high_max": 0.003430654454859905, "clip_ratio/high_mean": 0.003430654454859905, "clip_ratio/low_mean": 0.0020424544054549187, "clip_ratio/low_min": 0.0020424544054549187, "clip_ratio/region_mean": 0.0054731088821426965, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1754.0, "completions/mean_length": 704.65625, "completions/mean_terminated_length": 453.73077392578125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 1.3845080137252808, "epoch": 0.03948772678762007, "frac_reward_zero_std": 0.1875, "grad_norm": 0.04558510705828667, "kl": 0.024949661165010184, "learning_rate": 4.996606746747018e-05, "loss": -0.0552, "num_tokens": 3158895.0, "reward": 0.9271030426025391, "reward_std": 0.38559412956237793, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.3872593641281128, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.16484375298023224, "rewards/format_reward_func/std": 0.056746240705251694, "sampling/importance_sampling_ratio/max": 2.5099925994873047, "sampling/importance_sampling_ratio/mean": 0.5697442293167114, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6297111511230469, "sampling/sampling_logp_difference/mean": 0.030561096966266632, "step": 37, "step_time": 28.22772048099887 }, { "clip_ratio/high_max": 0.004485016317630652, "clip_ratio/high_mean": 0.004485016317630652, "clip_ratio/low_mean": 0.0014229953667381778, "clip_ratio/low_min": 0.0014229953667381778, "clip_ratio/region_mean": 0.005908011655265, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 632.5625, "completions/mean_terminated_length": 417.85186767578125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 1.2200517691671848, "epoch": 0.04055496264674493, "frac_reward_zero_std": 0.3125, "grad_norm": 0.04113490507006645, "kl": 0.03139959869440645, "learning_rate": 4.996384349856845e-05, "loss": -0.0619, "num_tokens": 3228439.0, "reward": -0.13593751192092896, "reward_std": 0.2894718050956726, "rewards/argmax_reward_func/mean": 0.1875, "rewards/argmax_reward_func/std": 0.3965577781200409, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.17656250298023224, "rewards/format_reward_func/std": 0.04706614091992378, "sampling/importance_sampling_ratio/max": 2.3509535789489746, "sampling/importance_sampling_ratio/mean": 0.5699790716171265, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9173851013183594, "sampling/sampling_logp_difference/mean": 0.03274017199873924, "step": 38, "step_time": 31.865677249000328 }, { "clip_ratio/high_max": 0.005592567926214542, "clip_ratio/high_mean": 0.005592567926214542, "clip_ratio/low_mean": 0.0017202793096657842, "clip_ratio/low_min": 0.0017202793096657842, "clip_ratio/region_mean": 0.007312847228604369, "completions/clipped_ratio": 0.09375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 672.34375, "completions/mean_terminated_length": 556.5172119140625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 1.01879758015275, "epoch": 0.0416221985058698, "frac_reward_zero_std": 0.1875, "grad_norm": 0.04494228959083557, "kl": 0.025687557295896113, "learning_rate": 4.996154899664473e-05, "loss": 0.05, "num_tokens": 3296920.0, "reward": 0.8580894470214844, "reward_std": 0.3833844065666199, "rewards/argmax_reward_func/mean": 0.25, "rewards/argmax_reward_func/std": 0.4399413466453552, "rewards/criterion_gradient_reward_func/mean": 0.4291831851005554, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.17890626192092896, "rewards/format_reward_func/std": 0.050445348024368286, "sampling/importance_sampling_ratio/max": 2.8635690212249756, "sampling/importance_sampling_ratio/mean": 0.6012802720069885, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7000026702880859, "sampling/sampling_logp_difference/mean": 0.029028404504060745, "step": 39, "step_time": 26.621436131998962 }, { "clip_ratio/high_max": 0.005104959574964596, "clip_ratio/high_mean": 0.005104959574964596, "clip_ratio/low_mean": 0.002460232572047971, "clip_ratio/low_min": 0.002460232572047971, "clip_ratio/region_mean": 0.007565192190668313, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 488.09375, "completions/mean_terminated_length": 401.16668701171875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.9746391959488392, "epoch": 0.042689434364994665, "frac_reward_zero_std": 0.25, "grad_norm": 0.06116454675793648, "kl": 0.0377456930000335, "learning_rate": 4.995918396818191e-05, "loss": -0.0769, "num_tokens": 3370665.0, "reward": 0.8580527305603027, "reward_std": 0.28836697340011597, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.3728964924812317, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.17265623807907104, "rewards/format_reward_func/std": 0.04506240040063858, "sampling/importance_sampling_ratio/max": 2.8156259059906006, "sampling/importance_sampling_ratio/mean": 0.6535416841506958, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5756115913391113, "sampling/sampling_logp_difference/mean": 0.029179425910115242, "step": 40, "step_time": 37.02694117400006 }, { "clip_ratio/high_max": 0.004418104072101414, "clip_ratio/high_mean": 0.004418104072101414, "clip_ratio/low_mean": 0.0013135459703335073, "clip_ratio/low_min": 0.0013135459703335073, "clip_ratio/region_mean": 0.0057316500460729, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 582.125, "completions/mean_terminated_length": 409.2857360839844, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 1.1515404470264912, "epoch": 0.04375667022411953, "frac_reward_zero_std": 0.3125, "grad_norm": 0.0615927092730999, "kl": 0.03613680129637942, "learning_rate": 4.995674841986217e-05, "loss": -0.1043, "num_tokens": 3435043.0, "reward": 0.945233941078186, "reward_std": 0.4132154881954193, "rewards/argmax_reward_func/mean": 0.34375, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.41398394107818604, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.038625579327344894, "sampling/importance_sampling_ratio/max": 2.6321966648101807, "sampling/importance_sampling_ratio/mean": 0.5964424014091492, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8255620002746582, "sampling/sampling_logp_difference/mean": 0.031585998833179474, "step": 41, "step_time": 26.532947400999547 }, { "clip_ratio/high_max": 0.004958239496772876, "clip_ratio/high_mean": 0.004958239496772876, "clip_ratio/low_mean": 0.0010843543786904775, "clip_ratio/low_min": 0.0010843543786904775, "clip_ratio/region_mean": 0.00604259391548112, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 706.625, "completions/mean_terminated_length": 456.15386962890625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 1.2080495860427618, "epoch": 0.044823906083244394, "frac_reward_zero_std": 0.375, "grad_norm": 0.03197896108031273, "kl": 0.031675134086981416, "learning_rate": 4.9954242358566914e-05, "loss": -0.0298, "num_tokens": 3510013.0, "reward": 0.04218750074505806, "reward_std": 0.19666408002376556, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.16718751192092896, "rewards/format_reward_func/std": 0.04854225739836693, "sampling/importance_sampling_ratio/max": 2.497363805770874, "sampling/importance_sampling_ratio/mean": 0.5387752056121826, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6475950479507446, "sampling/sampling_logp_difference/mean": 0.03082127310335636, "step": 42, "step_time": 32.46123552100016 }, { "clip_ratio/high_max": 0.005368428421206772, "clip_ratio/high_mean": 0.005368428421206772, "clip_ratio/low_mean": 0.0019163406286679674, "clip_ratio/low_min": 0.0019163406286679674, "clip_ratio/region_mean": 0.007284768995305058, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 571.0625, "completions/mean_terminated_length": 531.6774291992188, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 1.2573323585093021, "epoch": 0.04589114194236926, "frac_reward_zero_std": 0.3125, "grad_norm": 0.06339698284864426, "kl": 0.03725032100919634, "learning_rate": 4.99516657913768e-05, "loss": -0.1451, "num_tokens": 3582279.0, "reward": 0.9636125564575195, "reward_std": 0.367916464805603, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.4612688422203064, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18984374403953552, "rewards/format_reward_func/std": 0.026835350319743156, "sampling/importance_sampling_ratio/max": 2.7602171897888184, "sampling/importance_sampling_ratio/mean": 0.5480587482452393, "sampling/importance_sampling_ratio/min": 0.0006319271633401513, "sampling/sampling_logp_difference/max": 2.1519336700439453, "sampling/sampling_logp_difference/mean": 0.03403317555785179, "step": 43, "step_time": 38.25361827300094 }, { "clip_ratio/high_max": 0.0032622530416119844, "clip_ratio/high_mean": 0.0032622530416119844, "clip_ratio/low_mean": 0.0013771284720860422, "clip_ratio/low_min": 0.0013771284720860422, "clip_ratio/region_mean": 0.004639381484594196, "completions/clipped_ratio": 0.09375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 551.34375, "completions/mean_terminated_length": 423.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 1.1791777200996876, "epoch": 0.04695837780149413, "frac_reward_zero_std": 0.4375, "grad_norm": 0.03485622629523277, "kl": 0.03682587330695242, "learning_rate": 4.994901872557166e-05, "loss": -0.0132, "num_tokens": 3651494.0, "reward": 0.9616175293922424, "reward_std": 0.31930288672447205, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.36239880323410034, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19296875596046448, "rewards/format_reward_func/std": 0.02310073748230934, "sampling/importance_sampling_ratio/max": 2.369596481323242, "sampling/importance_sampling_ratio/mean": 0.5763484239578247, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7768189907073975, "sampling/sampling_logp_difference/mean": 0.03425965458154678, "step": 44, "step_time": 32.68233574700116 }, { "clip_ratio/high_max": 0.003625573961471673, "clip_ratio/high_mean": 0.003625573961471673, "clip_ratio/low_mean": 0.0024573057853558566, "clip_ratio/low_min": 0.0024573057853558566, "clip_ratio/region_mean": 0.006082879706809763, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 482.65625, "completions/mean_terminated_length": 395.36669921875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.9539292119443417, "epoch": 0.048025613660619, "frac_reward_zero_std": 0.4375, "grad_norm": 0.056074462831020355, "kl": 0.05413978220894933, "learning_rate": 4.994630116863055e-05, "loss": 0.0733, "num_tokens": 3718657.0, "reward": 1.1001349687576294, "reward_std": 0.3204077482223511, "rewards/argmax_reward_func/mean": 0.34375, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.5641974806785583, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19218750298023224, "rewards/format_reward_func/std": 0.018445100635290146, "sampling/importance_sampling_ratio/max": 2.6998724937438965, "sampling/importance_sampling_ratio/mean": 0.45862436294555664, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1817073822021484, "sampling/sampling_logp_difference/mean": 0.03510105237364769, "step": 45, "step_time": 32.6915989120007 }, { "clip_ratio/high_max": 0.00524752854835242, "clip_ratio/high_mean": 0.00524752854835242, "clip_ratio/low_mean": 0.002464318080455996, "clip_ratio/low_min": 0.002464318080455996, "clip_ratio/region_mean": 0.007711846614256501, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 580.4375, "completions/mean_terminated_length": 499.66668701171875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.9469889402389526, "epoch": 0.04909284951974386, "frac_reward_zero_std": 0.25, "grad_norm": 0.05326208844780922, "kl": 0.05695279373321682, "learning_rate": 4.994351312823167e-05, "loss": 0.0779, "num_tokens": 3790277.0, "reward": 0.919585108757019, "reward_std": 0.28394752740859985, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.42036640644073486, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18671874701976776, "rewards/format_reward_func/std": 0.030443213880062103, "sampling/importance_sampling_ratio/max": 2.292471170425415, "sampling/importance_sampling_ratio/mean": 0.5145248174667358, "sampling/importance_sampling_ratio/min": 0.00017690809909254313, "sampling/sampling_logp_difference/max": 1.2020535469055176, "sampling/sampling_logp_difference/mean": 0.0348585806787014, "step": 46, "step_time": 33.086043688001155 }, { "clip_ratio/high_max": 0.0036671321249741595, "clip_ratio/high_mean": 0.0036671321249741595, "clip_ratio/low_mean": 0.001979317341465503, "clip_ratio/low_min": 0.001979317341465503, "clip_ratio/region_mean": 0.005646449470077641, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1288.0, "completions/mean_length": 818.59375, "completions/mean_terminated_length": 593.9615478515625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 1.151203090324998, "epoch": 0.05016008537886873, "frac_reward_zero_std": 0.25, "grad_norm": 0.05463254079222679, "kl": 0.050409337505698204, "learning_rate": 4.994065461225236e-05, "loss": 0.0787, "num_tokens": 3866360.0, "reward": 0.7501685619354248, "reward_std": 0.4485708475112915, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.17985612154006958, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.016111381351947784, "sampling/importance_sampling_ratio/max": 2.5578129291534424, "sampling/importance_sampling_ratio/mean": 0.5782613158226013, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1703989505767822, "sampling/sampling_logp_difference/mean": 0.03642553836107254, "step": 47, "step_time": 31.942583424000986 }, { "clip_ratio/high_max": 0.003155585494823754, "clip_ratio/high_mean": 0.003155585494823754, "clip_ratio/low_mean": 0.0014643460817751475, "clip_ratio/low_min": 0.0014643460817751475, "clip_ratio/region_mean": 0.004619931576598901, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1756.0, "completions/mean_length": 630.25, "completions/mean_terminated_length": 464.2857360839844, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.9769103638827801, "epoch": 0.051227321237993596, "frac_reward_zero_std": 0.5, "grad_norm": 0.04097026214003563, "kl": 0.06624354002997279, "learning_rate": 4.9937725628769094e-05, "loss": -0.0299, "num_tokens": 3942348.0, "reward": 0.9056758880615234, "reward_std": 0.1447359025478363, "rewards/argmax_reward_func/mean": 0.28125, "rewards/argmax_reward_func/std": 0.45680341124534607, "rewards/criterion_gradient_reward_func/mean": 0.4361445903778076, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18828126788139343, "rewards/format_reward_func/std": 0.031098423525691032, "sampling/importance_sampling_ratio/max": 1.97018563747406, "sampling/importance_sampling_ratio/mean": 0.3847596049308777, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2483386993408203, "sampling/sampling_logp_difference/mean": 0.03703593090176582, "step": 48, "step_time": 33.14647337500446 }, { "clip_ratio/high_max": 0.0035233482776675373, "clip_ratio/high_mean": 0.0035233482776675373, "clip_ratio/low_mean": 0.0007076487199810799, "clip_ratio/low_min": 0.0007076487199810799, "clip_ratio/region_mean": 0.004230996953992872, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 784.34375, "completions/mean_terminated_length": 551.8077392578125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.8448831625282764, "epoch": 0.052294557097118465, "frac_reward_zero_std": 0.5, "grad_norm": 0.05313295125961304, "kl": 0.05098309414461255, "learning_rate": 4.9934726186057454e-05, "loss": -0.0453, "num_tokens": 4019591.0, "reward": 0.8352618217468262, "reward_std": 0.27731841802597046, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.33448055386543274, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18828125298023224, "rewards/format_reward_func/std": 0.022881509736180305, "sampling/importance_sampling_ratio/max": 2.577406406402588, "sampling/importance_sampling_ratio/mean": 0.6060134768486023, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5782732963562012, "sampling/sampling_logp_difference/mean": 0.029864059761166573, "step": 49, "step_time": 32.49400077199971 }, { "clip_ratio/high_max": 0.0027793700610345695, "clip_ratio/high_mean": 0.0027793700610345695, "clip_ratio/low_mean": 0.0009122565606958233, "clip_ratio/low_min": 0.0009122565606958233, "clip_ratio/region_mean": 0.0036916266253683716, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 764.9375, "completions/mean_terminated_length": 618.2142944335938, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.820674292743206, "epoch": 0.05336179295624333, "frac_reward_zero_std": 0.375, "grad_norm": 0.04880816861987114, "kl": 0.05701018893159926, "learning_rate": 4.993165629259209e-05, "loss": -0.1225, "num_tokens": 4098615.0, "reward": 0.9071162343025208, "reward_std": 0.3281417191028595, "rewards/argmax_reward_func/mean": 0.28125, "rewards/argmax_reward_func/std": 0.45680341124534607, "rewards/criterion_gradient_reward_func/mean": 0.4391475021839142, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18671874701976776, "rewards/format_reward_func/std": 0.04163220897316933, "sampling/importance_sampling_ratio/max": 2.7776365280151367, "sampling/importance_sampling_ratio/mean": 0.39576107263565063, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4491276741027832, "sampling/sampling_logp_difference/mean": 0.03233148902654648, "step": 50, "step_time": 34.50056227499772 }, { "clip_ratio/high_max": 0.003162681474350393, "clip_ratio/high_mean": 0.003162681474350393, "clip_ratio/low_mean": 0.0021947009954601526, "clip_ratio/low_min": 0.0021947009954601526, "clip_ratio/region_mean": 0.005357382462534588, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 902.46875, "completions/mean_terminated_length": 697.1923217773438, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.9475046005100012, "epoch": 0.054429028815368194, "frac_reward_zero_std": 0.4375, "grad_norm": 0.03652269393205643, "kl": 0.05199671583250165, "learning_rate": 4.992851595704668e-05, "loss": -0.0599, "num_tokens": 4184390.0, "reward": 0.9161238670349121, "reward_std": 0.2762135863304138, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.4114362895488739, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19218750298023224, "rewards/format_reward_func/std": 0.020515041425824165, "sampling/importance_sampling_ratio/max": 2.283782958984375, "sampling/importance_sampling_ratio/mean": 0.5117959976196289, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.099390745162964, "sampling/sampling_logp_difference/mean": 0.03166363388299942, "step": 51, "step_time": 35.51677543299866 }, { "clip_ratio/high_max": 0.003356477478519082, "clip_ratio/high_mean": 0.003356477478519082, "clip_ratio/low_mean": 0.0011345062775944825, "clip_ratio/low_min": 0.0011345062775944825, "clip_ratio/region_mean": 0.004490983756113565, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 690.5, "completions/mean_terminated_length": 436.3077087402344, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 1.0339625589549541, "epoch": 0.05549626467449306, "frac_reward_zero_std": 0.5625, "grad_norm": 0.028444333001971245, "kl": 0.06882872548885643, "learning_rate": 4.992530518829396e-05, "loss": -0.0181, "num_tokens": 4254840.0, "reward": 0.7728103399276733, "reward_std": 0.26737475395202637, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.20249789953231812, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.014807227998971939, "sampling/importance_sampling_ratio/max": 2.354421377182007, "sampling/importance_sampling_ratio/mean": 0.41404685378074646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7060565948486328, "sampling/sampling_logp_difference/mean": 0.037241656333208084, "step": 52, "step_time": 27.100664807000612 }, { "clip_ratio/high_max": 0.0037003685429226607, "clip_ratio/high_mean": 0.0037003685429226607, "clip_ratio/low_mean": 0.0014787418986088596, "clip_ratio/low_min": 0.0014787418986088596, "clip_ratio/region_mean": 0.005179110477911308, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 802.96875, "completions/mean_terminated_length": 574.7307739257812, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.8906721752136946, "epoch": 0.05656350053361793, "frac_reward_zero_std": 0.3125, "grad_norm": 0.0551103837788105, "kl": 0.059236726956442, "learning_rate": 4.992202399540567e-05, "loss": -0.0698, "num_tokens": 4330453.0, "reward": 0.12812498211860657, "reward_std": 0.36239221692085266, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19062499701976776, "rewards/format_reward_func/std": 0.023546455428004265, "sampling/importance_sampling_ratio/max": 2.1674771308898926, "sampling/importance_sampling_ratio/mean": 0.3936537802219391, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2342329025268555, "sampling/sampling_logp_difference/mean": 0.030425386503338814, "step": 53, "step_time": 32.833691573999204 }, { "clip_ratio/high_max": 0.003247561584430514, "clip_ratio/high_mean": 0.003247561584430514, "clip_ratio/low_mean": 0.0013816721293551382, "clip_ratio/low_min": 0.0013816721293551382, "clip_ratio/region_mean": 0.0046292337610793766, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 932.0625, "completions/mean_terminated_length": 541.1818237304688, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.8370315562933683, "epoch": 0.0576307363927428, "frac_reward_zero_std": 0.4375, "grad_norm": 0.0565565787255764, "kl": 0.05779759946744889, "learning_rate": 4.99186723876525e-05, "loss": -0.0464, "num_tokens": 4417359.0, "reward": 1.1112661361694336, "reward_std": 0.3988524079322815, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.44485974311828613, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765625894069672, "rewards/format_reward_func/std": 0.009753772988915443, "sampling/importance_sampling_ratio/max": 2.302241325378418, "sampling/importance_sampling_ratio/mean": 0.7511900663375854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2134153842926025, "sampling/sampling_logp_difference/mean": 0.02752211131155491, "step": 54, "step_time": 37.55064369400043 }, { "clip_ratio/high_max": 0.0035291641106596217, "clip_ratio/high_mean": 0.0035291641106596217, "clip_ratio/low_mean": 0.001980770835871226, "clip_ratio/low_min": 0.001980770835871226, "clip_ratio/region_mean": 0.005509934993824572, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 964.59375, "completions/mean_terminated_length": 640.8261108398438, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 1.0259936638176441, "epoch": 0.05869797225186766, "frac_reward_zero_std": 0.375, "grad_norm": 0.052128683775663376, "kl": 0.058596692979335785, "learning_rate": 4.991525037450412e-05, "loss": 0.1046, "num_tokens": 4498728.0, "reward": 1.0630784034729004, "reward_std": 0.32261747121810913, "rewards/argmax_reward_func/mean": 0.28125, "rewards/argmax_reward_func/std": 0.45680341124534607, "rewards/criterion_gradient_reward_func/mean": 0.5927658081054688, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18906250596046448, "rewards/format_reward_func/std": 0.022840170189738274, "sampling/importance_sampling_ratio/max": 2.5253405570983887, "sampling/importance_sampling_ratio/mean": 0.3179386854171753, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4977741241455078, "sampling/sampling_logp_difference/mean": 0.03142296150326729, "step": 55, "step_time": 32.15883941299944 }, { "clip_ratio/high_max": 0.0018746323548839428, "clip_ratio/high_mean": 0.0018746323548839428, "clip_ratio/low_mean": 0.0016680510707374196, "clip_ratio/low_min": 0.0016680510707374196, "clip_ratio/region_mean": 0.0035426834256213624, "completions/clipped_ratio": 0.21875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 813.0, "completions/mean_terminated_length": 538.8800048828125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.7951693683862686, "epoch": 0.05976520811099253, "frac_reward_zero_std": 0.5625, "grad_norm": 0.014409101568162441, "kl": 0.10683044674806297, "learning_rate": 4.99117579656291e-05, "loss": -0.0087, "num_tokens": 4571042.0, "reward": 0.7666022777557373, "reward_std": 0.18782523274421692, "rewards/argmax_reward_func/mean": 0.25, "rewards/argmax_reward_func/std": 0.4399413466453552, "rewards/criterion_gradient_reward_func/mean": 0.3244147300720215, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19218750298023224, "rewards/format_reward_func/std": 0.021475322544574738, "sampling/importance_sampling_ratio/max": 2.005937099456787, "sampling/importance_sampling_ratio/mean": 0.30666929483413696, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5409717559814453, "sampling/sampling_logp_difference/mean": 0.029425105080008507, "step": 56, "step_time": 30.176332093000383 }, { "clip_ratio/high_max": 0.0029158997058402747, "clip_ratio/high_mean": 0.0029158997058402747, "clip_ratio/low_mean": 0.0022062624702812172, "clip_ratio/low_min": 0.0022062624702812172, "clip_ratio/region_mean": 0.005122162190673407, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 1121.90625, "completions/mean_terminated_length": 770.90478515625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.7943318234756589, "epoch": 0.060832443970117396, "frac_reward_zero_std": 0.3125, "grad_norm": 0.03678717464208603, "kl": 0.050349713303148746, "learning_rate": 4.9908195170894925e-05, "loss": -0.1306, "num_tokens": 4657883.0, "reward": 0.9969813227653503, "reward_std": 0.4032718539237976, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.33526259660720825, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19296875596046448, "rewards/format_reward_func/std": 0.019296471029520035, "sampling/importance_sampling_ratio/max": 2.4873666763305664, "sampling/importance_sampling_ratio/mean": 0.39236170053482056, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.195953845977783, "sampling/sampling_logp_difference/mean": 0.026066195219755173, "step": 57, "step_time": 35.55850386599741 }, { "clip_ratio/high_max": 0.002682634541997686, "clip_ratio/high_mean": 0.002682634541997686, "clip_ratio/low_mean": 0.0005113007955515059, "clip_ratio/low_min": 0.0005113007955515059, "clip_ratio/region_mean": 0.003193935337549192, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 1001.78125, "completions/mean_terminated_length": 587.857177734375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.9577843407168984, "epoch": 0.061899679829242264, "frac_reward_zero_std": 0.5625, "grad_norm": 0.019906507804989815, "kl": 0.05214382451958954, "learning_rate": 4.990456200036793e-05, "loss": -0.0275, "num_tokens": 4740908.0, "reward": 1.1337437629699707, "reward_std": 0.27400386333465576, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.5649937987327576, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19375000894069672, "rewards/format_reward_func/std": 0.023759547621011734, "sampling/importance_sampling_ratio/max": 1.7909953594207764, "sampling/importance_sampling_ratio/mean": 0.29590582847595215, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.5843849182128906, "sampling/sampling_logp_difference/mean": 0.03090079315006733, "step": 58, "step_time": 35.24361273800059 }, { "clip_ratio/high_max": 0.002891971067583654, "clip_ratio/high_mean": 0.002891971067583654, "clip_ratio/low_mean": 0.0018478606598364422, "clip_ratio/low_min": 0.0018478606598364422, "clip_ratio/region_mean": 0.0047398317237821175, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1703.0, "completions/mean_length": 1169.5, "completions/mean_terminated_length": 843.4285888671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6762483306229115, "epoch": 0.06296691568836713, "frac_reward_zero_std": 0.4375, "grad_norm": 0.03641195967793465, "kl": 0.09076345129869878, "learning_rate": 4.990085846431331e-05, "loss": -0.0151, "num_tokens": 4823372.0, "reward": 0.8267509937286377, "reward_std": 0.2828426957130432, "rewards/argmax_reward_func/mean": 0.25, "rewards/argmax_reward_func/std": 0.4399413466453552, "rewards/criterion_gradient_reward_func/mean": 0.38925105333328247, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.038625579327344894, "sampling/importance_sampling_ratio/max": 1.9245468378067017, "sampling/importance_sampling_ratio/mean": 0.31796225905418396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5233197212219238, "sampling/sampling_logp_difference/mean": 0.025729810819029808, "step": 59, "step_time": 35.22728033799922 }, { "clip_ratio/high_max": 0.003561546305718366, "clip_ratio/high_mean": 0.003561546305718366, "clip_ratio/low_mean": 0.0009964932687580585, "clip_ratio/low_min": 0.0009964932687580585, "clip_ratio/region_mean": 0.004558039567200467, "completions/clipped_ratio": 0.4375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 1171.625, "completions/mean_terminated_length": 689.1111450195312, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.8216602727770805, "epoch": 0.064034151547492, "frac_reward_zero_std": 0.375, "grad_norm": 0.03349614515900612, "kl": 0.13530098425690085, "learning_rate": 4.989708457319505e-05, "loss": -0.0302, "num_tokens": 4917578.0, "reward": 1.1624993085861206, "reward_std": 0.3999572694301605, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.49531179666519165, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.8816568851470947, "sampling/importance_sampling_ratio/mean": 0.4976387023925781, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.4929189682006836, "sampling/sampling_logp_difference/mean": 0.02805042453110218, "step": 60, "step_time": 35.59829264600012 }, { "clip_ratio/high_max": 0.0029512645705835894, "clip_ratio/high_mean": 0.0029512645705835894, "clip_ratio/low_mean": 0.000953342838329263, "clip_ratio/low_min": 0.000953342838329263, "clip_ratio/region_mean": 0.0039046074234647676, "completions/clipped_ratio": 0.4375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 1150.375, "completions/mean_terminated_length": 651.3333129882812, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.7765182955190539, "epoch": 0.06510138740661686, "frac_reward_zero_std": 0.375, "grad_norm": 0.018434742465615273, "kl": 0.05586719745770097, "learning_rate": 4.9893240337675954e-05, "loss": 0.0124, "num_tokens": 4998692.0, "reward": 1.187744379043579, "reward_std": 0.41100579500198364, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.5908693671226501, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19062501192092896, "rewards/format_reward_func/std": 0.03689020127058029, "sampling/importance_sampling_ratio/max": 1.6418814659118652, "sampling/importance_sampling_ratio/mean": 0.2985171973705292, "sampling/importance_sampling_ratio/min": 5.729573604185134e-05, "sampling/sampling_logp_difference/max": 2.1572463512420654, "sampling/sampling_logp_difference/mean": 0.026724547147750854, "step": 61, "step_time": 30.414572561999194 }, { "clip_ratio/high_max": 0.0013603273982880637, "clip_ratio/high_mean": 0.0013603273982880637, "clip_ratio/low_mean": 0.001467969865188934, "clip_ratio/low_min": 0.001467969865188934, "clip_ratio/region_mean": 0.0028282972634769976, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 1238.25, "completions/mean_terminated_length": 906.0, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.8181438334286213, "epoch": 0.06616862326574173, "frac_reward_zero_std": 0.625, "grad_norm": 0.0410386323928833, "kl": 0.06344242324121296, "learning_rate": 4.9889325768617536e-05, "loss": -0.0491, "num_tokens": 5087994.0, "reward": 1.0127410888671875, "reward_std": 0.2651650309562683, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.31274107098579407, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6730358600616455, "sampling/importance_sampling_ratio/mean": 0.419042706489563, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9985331296920776, "sampling/sampling_logp_difference/mean": 0.0259285606443882, "step": 62, "step_time": 39.52297496099891 }, { "clip_ratio/high_max": 0.00327868276508525, "clip_ratio/high_mean": 0.00327868276508525, "clip_ratio/low_mean": 0.0016599974587734323, "clip_ratio/low_min": 0.0016599974587734323, "clip_ratio/region_mean": 0.004938680220220704, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 1233.71875, "completions/mean_terminated_length": 741.11767578125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.7255973014980555, "epoch": 0.0672358591248666, "frac_reward_zero_std": 0.375, "grad_norm": 0.04055947810411453, "kl": 0.05197496654000133, "learning_rate": 4.9885340877080036e-05, "loss": 0.0762, "num_tokens": 5184935.0, "reward": 0.9001142978668213, "reward_std": 0.360182523727417, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.3923018276691437, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.016111381351947784, "sampling/importance_sampling_ratio/max": 2.576671600341797, "sampling/importance_sampling_ratio/mean": 0.3597943186759949, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.908696174621582, "sampling/sampling_logp_difference/mean": 0.025256918743252754, "step": 63, "step_time": 33.78314151200084 }, { "clip_ratio/high_max": 0.0030966925514803734, "clip_ratio/high_mean": 0.0030966925514803734, "clip_ratio/low_mean": 0.0006564408486156026, "clip_ratio/low_min": 0.0006564408486156026, "clip_ratio/region_mean": 0.003753133518330287, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 1020.1875, "completions/mean_terminated_length": 669.3636474609375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.6222500512376428, "epoch": 0.06830309498399147, "frac_reward_zero_std": 0.4375, "grad_norm": 0.017787812277674675, "kl": 0.10228064085822552, "learning_rate": 4.988128567432242e-05, "loss": 0.0289, "num_tokens": 5277879.0, "reward": 1.0178642272949219, "reward_std": 0.40106210112571716, "rewards/argmax_reward_func/mean": 0.34375, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.47645801305770874, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.013258252292871475, "sampling/importance_sampling_ratio/max": 1.262373924255371, "sampling/importance_sampling_ratio/mean": 0.2538524568080902, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1541547775268555, "sampling/sampling_logp_difference/mean": 0.022545866668224335, "step": 64, "step_time": 38.05616802100212 }, { "clip_ratio/high_max": 0.0023807797551853582, "clip_ratio/high_mean": 0.0023807797551853582, "clip_ratio/low_mean": 0.0015877370933594648, "clip_ratio/low_min": 0.0015877370933594648, "clip_ratio/region_mean": 0.003968516848544823, "completions/clipped_ratio": 0.5625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 1332.59375, "completions/mean_terminated_length": 741.9285888671875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.7406770009547472, "epoch": 0.06937033084311633, "frac_reward_zero_std": 0.4375, "grad_norm": 0.024238549172878265, "kl": 0.05065548315178603, "learning_rate": 4.987716017180227e-05, "loss": 0.0446, "num_tokens": 5365700.0, "reward": 0.6999263763427734, "reward_std": 0.36128735542297363, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.19445759057998657, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19296875596046448, "rewards/format_reward_func/std": 0.022210843861103058, "sampling/importance_sampling_ratio/max": 1.6378140449523926, "sampling/importance_sampling_ratio/mean": 0.27266237139701843, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.191931962966919, "sampling/sampling_logp_difference/mean": 0.024090956896543503, "step": 65, "step_time": 31.89534520500274 }, { "clip_ratio/high_max": 0.0021839775508851744, "clip_ratio/high_mean": 0.0021839775508851744, "clip_ratio/low_mean": 0.0011986980789515655, "clip_ratio/low_min": 0.0011986980789515655, "clip_ratio/region_mean": 0.0033826756152848247, "completions/clipped_ratio": 0.59375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 1465.25, "completions/mean_terminated_length": 987.6923217773438, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 1.0010418146848679, "epoch": 0.07043756670224119, "frac_reward_zero_std": 0.375, "grad_norm": 0.019639989361166954, "kl": 0.2180859032087028, "learning_rate": 4.9872964381175814e-05, "loss": -0.0389, "num_tokens": 5472064.0, "reward": 0.12812499701976776, "reward_std": 0.3668116331100464, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19062499701976776, "rewards/format_reward_func/std": 0.025988519191741943, "sampling/importance_sampling_ratio/max": 2.708674907684326, "sampling/importance_sampling_ratio/mean": 0.24566850066184998, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.751971960067749, "sampling/sampling_logp_difference/mean": 0.02534285932779312, "step": 66, "step_time": 41.90878479899857 }, { "clip_ratio/high_max": 0.0011542990559973987, "clip_ratio/high_mean": 0.0011542990559973987, "clip_ratio/low_mean": 0.0007184389269241365, "clip_ratio/low_min": 0.0007184389269241365, "clip_ratio/region_mean": 0.001872737977464567, "completions/clipped_ratio": 0.5625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 1432.84375, "completions/mean_terminated_length": 971.0714721679688, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.5567322140559554, "epoch": 0.07150480256136606, "frac_reward_zero_std": 0.5, "grad_norm": 0.010918998159468174, "kl": 0.05619236419443041, "learning_rate": 4.986869831429787e-05, "loss": -0.0174, "num_tokens": 5574555.0, "reward": 0.10234373807907104, "reward_std": 0.3148834705352783, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19609375298023224, "rewards/format_reward_func/std": 0.01569540426135063, "sampling/importance_sampling_ratio/max": 2.776747941970825, "sampling/importance_sampling_ratio/mean": 0.4620300233364105, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.8425416946411133, "sampling/sampling_logp_difference/mean": 0.017367567867040634, "step": 67, "step_time": 34.81516051199924 }, { "clip_ratio/high_max": 0.0027484169113449752, "clip_ratio/high_mean": 0.0027484169113449752, "clip_ratio/low_mean": 0.0011800617685366888, "clip_ratio/low_min": 0.0011800617685366888, "clip_ratio/region_mean": 0.003928478665329749, "completions/clipped_ratio": 0.53125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 1439.71875, "completions/mean_terminated_length": 1040.4666748046875, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "entropy": 0.825631907209754, "epoch": 0.07257203842049093, "frac_reward_zero_std": 0.4375, "grad_norm": 0.02292690984904766, "kl": 0.1205837621819228, "learning_rate": 4.986436198322182e-05, "loss": 0.0011, "num_tokens": 5670858.0, "reward": 1.2334508895874023, "reward_std": 0.360182523727417, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.5381383895874023, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.019507545977830887, "sampling/importance_sampling_ratio/max": 2.9023122787475586, "sampling/importance_sampling_ratio/mean": 0.3522430658340454, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.2377219200134277, "sampling/sampling_logp_difference/mean": 0.023634646087884903, "step": 68, "step_time": 34.7986216770023 }, { "clip_ratio/high_max": 0.0014920752146281302, "clip_ratio/high_mean": 0.0014920752146281302, "clip_ratio/low_mean": 0.00035503802973835263, "clip_ratio/low_min": 0.00035503802973835263, "clip_ratio/region_mean": 0.0018471132443664828, "completions/clipped_ratio": 0.59375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 1427.03125, "completions/mean_terminated_length": 893.6154174804688, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.7870258279144764, "epoch": 0.0736392742796158, "frac_reward_zero_std": 0.625, "grad_norm": 0.019695747643709183, "kl": 0.052277603186666965, "learning_rate": 4.985995540019955e-05, "loss": 0.0599, "num_tokens": 5770795.0, "reward": 1.1247730255126953, "reward_std": 0.267374724149704, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.49039801955223083, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19687500596046448, "rewards/format_reward_func/std": 0.010530294850468636, "sampling/importance_sampling_ratio/max": 2.0232419967651367, "sampling/importance_sampling_ratio/mean": 0.23310765624046326, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.4619638919830322, "sampling/sampling_logp_difference/mean": 0.020283225923776627, "step": 69, "step_time": 37.8903627589998 }, { "clip_ratio/high_max": 0.0018988249939866364, "clip_ratio/high_mean": 0.0018988249939866364, "clip_ratio/low_mean": 0.0004950233469571685, "clip_ratio/low_min": 0.0004950233469571685, "clip_ratio/region_mean": 0.002393848364590667, "completions/clipped_ratio": 0.65625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 1499.53125, "completions/mean_terminated_length": 941.1818237304688, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.8200256619602442, "epoch": 0.07470651013874066, "frac_reward_zero_std": 0.5, "grad_norm": 0.0242016538977623, "kl": 0.0422915889066644, "learning_rate": 4.985547857768147e-05, "loss": 0.0833, "num_tokens": 5873018.0, "reward": 0.8395864963531494, "reward_std": 0.27068930864334106, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.33255523443222046, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19453126192092896, "rewards/format_reward_func/std": 0.015206076204776764, "sampling/importance_sampling_ratio/max": 1.9978951215744019, "sampling/importance_sampling_ratio/mean": 0.2658422291278839, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7039794921875, "sampling/sampling_logp_difference/mean": 0.01892884261906147, "step": 70, "step_time": 39.03186344700134 }, { "clip_ratio/high_max": 0.0028024220082443208, "clip_ratio/high_mean": 0.0028024220082443208, "clip_ratio/low_mean": 0.0006614837966480991, "clip_ratio/low_min": 0.0006614837966480991, "clip_ratio/region_mean": 0.00346390580489242, "completions/clipped_ratio": 0.59375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 1467.15625, "completions/mean_terminated_length": 992.3846435546875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.6691923476755619, "epoch": 0.07577374599786553, "frac_reward_zero_std": 0.625, "grad_norm": 0.008705717511475086, "kl": 0.046557122957892716, "learning_rate": 4.98509315283164e-05, "loss": -0.0186, "num_tokens": 5978295.0, "reward": 0.9327523708343506, "reward_std": 0.22318056225776672, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.32806485891342163, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.0010170936584473, "sampling/importance_sampling_ratio/mean": 0.2242860496044159, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8629851341247559, "sampling/sampling_logp_difference/mean": 0.019126897677779198, "step": 71, "step_time": 38.74901056299586 }, { "clip_ratio/high_max": 0.0007959918511915021, "clip_ratio/high_mean": 0.0007959918511915021, "clip_ratio/low_mean": 0.0003974188875872642, "clip_ratio/low_min": 0.0003974188875872642, "clip_ratio/region_mean": 0.0011934107387787662, "completions/clipped_ratio": 0.4375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 1199.90625, "completions/mean_terminated_length": 739.388916015625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.7579613905400038, "epoch": 0.0768409818569904, "frac_reward_zero_std": 0.6875, "grad_norm": 0.029258187860250473, "kl": 0.04364458750933409, "learning_rate": 4.984631426495163e-05, "loss": -0.1086, "num_tokens": 6067108.0, "reward": 0.9788209795951843, "reward_std": 0.17788153886795044, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.27960219979286194, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19921875, "rewards/format_reward_func/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 2.7131004333496094, "sampling/importance_sampling_ratio/mean": 0.4216598868370056, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2987337112426758, "sampling/sampling_logp_difference/mean": 0.0185531098395586, "step": 72, "step_time": 32.4502327220016 }, { "clip_ratio/high_max": 0.002261465779156424, "clip_ratio/high_mean": 0.002261465779156424, "clip_ratio/low_mean": 0.0012222025034134276, "clip_ratio/low_min": 0.0012222025034134276, "clip_ratio/region_mean": 0.0034836682898458093, "completions/clipped_ratio": 0.59375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 1500.375, "completions/mean_terminated_length": 1074.1539306640625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.712285921908915, "epoch": 0.07790821771611527, "frac_reward_zero_std": 0.3125, "grad_norm": 0.052475638687610626, "kl": 0.03834435879252851, "learning_rate": 4.984162680063281e-05, "loss": -0.077, "num_tokens": 6167718.0, "reward": 1.2137778997421265, "reward_std": 0.31598833203315735, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.5497154593467712, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.013376526534557343, "sampling/importance_sampling_ratio/max": 2.844827175140381, "sampling/importance_sampling_ratio/mean": 0.5802140235900879, "sampling/importance_sampling_ratio/min": 0.00016066963144112378, "sampling/sampling_logp_difference/max": 2.37896466255188, "sampling/sampling_logp_difference/mean": 0.017836106941103935, "step": 73, "step_time": 35.66209018800055 }, { "clip_ratio/high_max": 0.0024109090518322773, "clip_ratio/high_mean": 0.0024109090518322773, "clip_ratio/low_mean": 0.0010076192756969249, "clip_ratio/low_min": 0.0010076192756969249, "clip_ratio/region_mean": 0.0034185283420811174, "completions/clipped_ratio": 0.53125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 1437.625, "completions/mean_terminated_length": 1036.0, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "entropy": 0.7141548208892345, "epoch": 0.07897545357524013, "frac_reward_zero_std": 0.375, "grad_norm": 0.031156150624155998, "kl": 0.06473891285713762, "learning_rate": 4.983686914860391e-05, "loss": -0.0007, "num_tokens": 6278406.0, "reward": 1.011054277420044, "reward_std": 0.31598830223083496, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.41105425357818604, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19374999403953552, "rewards/format_reward_func/std": 0.017960529774427414, "sampling/importance_sampling_ratio/max": 2.0069432258605957, "sampling/importance_sampling_ratio/mean": 0.32379770278930664, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.7495510578155518, "sampling/sampling_logp_difference/mean": 0.018298080191016197, "step": 74, "step_time": 37.712092804998974 }, { "clip_ratio/high_max": 0.0017454729913879419, "clip_ratio/high_mean": 0.0017454729913879419, "clip_ratio/low_mean": 0.0009876012391032418, "clip_ratio/low_min": 0.0009876012391032418, "clip_ratio/region_mean": 0.002733074217758258, "completions/clipped_ratio": 0.625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 1439.875, "completions/mean_terminated_length": 853.0, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "entropy": 0.8465821100398898, "epoch": 0.08004268943436499, "frac_reward_zero_std": 0.375, "grad_norm": 0.008546740747988224, "kl": 0.049867414752952754, "learning_rate": 4.9832041322307263e-05, "loss": 0.013, "num_tokens": 6373646.0, "reward": 0.9568749666213989, "reward_std": 0.32261744141578674, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.36000001430511475, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19062501192092896, "rewards/format_reward_func/std": 0.025988521054387093, "sampling/importance_sampling_ratio/max": 0.9181432127952576, "sampling/importance_sampling_ratio/mean": 0.17193971574306488, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.452864408493042, "sampling/sampling_logp_difference/mean": 0.01926429569721222, "step": 75, "step_time": 36.71659878799983 }, { "clip_ratio/high_max": 0.0028027956541336607, "clip_ratio/high_mean": 0.0028027956541336607, "clip_ratio/low_mean": 0.0013588099591288483, "clip_ratio/low_min": 0.0013588099591288483, "clip_ratio/region_mean": 0.004161605582339689, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 1178.84375, "completions/mean_terminated_length": 810.9500122070312, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.7316114269196987, "epoch": 0.08110992529348986, "frac_reward_zero_std": 0.25, "grad_norm": 0.04949251189827919, "kl": 0.052813281770795584, "learning_rate": 4.982714333538343e-05, "loss": 0.056, "num_tokens": 6465761.0, "reward": 1.0787875652313232, "reward_std": 0.4905552864074707, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.4147251546382904, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.013376526534557343, "sampling/importance_sampling_ratio/max": 2.6131086349487305, "sampling/importance_sampling_ratio/mean": 0.5426270961761475, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2392475605010986, "sampling/sampling_logp_difference/mean": 0.019062940031290054, "step": 76, "step_time": 37.847519816992644 }, { "clip_ratio/high_max": 0.001760930274031125, "clip_ratio/high_mean": 0.001760930274031125, "clip_ratio/low_mean": 0.0010057372692244826, "clip_ratio/low_min": 0.0010057372692244826, "clip_ratio/region_mean": 0.002766667557807523, "completions/clipped_ratio": 0.5625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 1388.75, "completions/mean_terminated_length": 870.2857666015625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.7339489385485649, "epoch": 0.08217716115261472, "frac_reward_zero_std": 0.5625, "grad_norm": 0.016706565394997597, "kl": 0.05364376481156796, "learning_rate": 4.982217520167122e-05, "loss": -0.0925, "num_tokens": 6565773.0, "reward": 1.131700873374939, "reward_std": 0.3181980550289154, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.46920084953308105, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19374999403953552, "rewards/format_reward_func/std": 0.027679037302732468, "sampling/importance_sampling_ratio/max": 1.8964990377426147, "sampling/importance_sampling_ratio/mean": 0.2813001871109009, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.986541748046875, "sampling/sampling_logp_difference/mean": 0.0190016720443964, "step": 77, "step_time": 33.98363399700429 }, { "clip_ratio/high_max": 0.0014935048093320802, "clip_ratio/high_mean": 0.0014935048093320802, "clip_ratio/low_mean": 0.0005937977275607409, "clip_ratio/low_min": 0.0005937977275607409, "clip_ratio/region_mean": 0.002087302536892821, "completions/clipped_ratio": 0.6875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 1488.875, "completions/mean_terminated_length": 822.0, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "entropy": 0.8204954480752349, "epoch": 0.0832443970117396, "frac_reward_zero_std": 0.5, "grad_norm": 0.028857016935944557, "kl": 0.04498161666560918, "learning_rate": 4.981713693520762e-05, "loss": 0.062, "num_tokens": 6663825.0, "reward": 1.128490924835205, "reward_std": 0.35797280073165894, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.3706783354282379, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.016111381351947784, "sampling/importance_sampling_ratio/max": 2.372729539871216, "sampling/importance_sampling_ratio/mean": 0.45614707469940186, "sampling/importance_sampling_ratio/min": 0.0016166368732228875, "sampling/sampling_logp_difference/max": 2.4817464351654053, "sampling/sampling_logp_difference/mean": 0.01907571777701378, "step": 78, "step_time": 34.31288923599277 }, { "clip_ratio/high_max": 0.0016209575478569604, "clip_ratio/high_mean": 0.0016209575478569604, "clip_ratio/low_mean": 0.0005453595367725939, "clip_ratio/low_min": 0.0005453595367725939, "clip_ratio/region_mean": 0.0021663170846295543, "completions/clipped_ratio": 0.59375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 1479.96875, "completions/mean_terminated_length": 1023.923095703125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.6830656230449677, "epoch": 0.08431163287086446, "frac_reward_zero_std": 0.5625, "grad_norm": 0.008605134673416615, "kl": 0.04200441169086844, "learning_rate": 4.9812028550227805e-05, "loss": 0.0428, "num_tokens": 6769316.0, "reward": 1.1282624006271362, "reward_std": 0.23864851891994476, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.472012460231781, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.0375671423971653, "sampling/importance_sampling_ratio/max": 2.7428131103515625, "sampling/importance_sampling_ratio/mean": 0.3039014935493469, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.339261770248413, "sampling/sampling_logp_difference/mean": 0.017995459958910942, "step": 79, "step_time": 35.07186781999917 }, { "clip_ratio/high_max": 0.0016920079360716045, "clip_ratio/high_mean": 0.0016920079360716045, "clip_ratio/low_mean": 0.00027939826213696506, "clip_ratio/low_min": 0.00027939826213696506, "clip_ratio/region_mean": 0.001971406209122506, "completions/clipped_ratio": 0.4375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 1206.0625, "completions/mean_terminated_length": 750.3333129882812, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.792104878462851, "epoch": 0.08537886872998933, "frac_reward_zero_std": 0.4375, "grad_norm": 0.05167252942919731, "kl": 0.046694961143657565, "learning_rate": 4.980685006116504e-05, "loss": -0.1853, "num_tokens": 6861868.0, "reward": 1.1116015911102295, "reward_std": 0.35465818643569946, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.41238296031951904, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19921875, "rewards/format_reward_func/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 2.8179287910461426, "sampling/importance_sampling_ratio/mean": 0.5459282398223877, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.176931381225586, "sampling/sampling_logp_difference/mean": 0.02042079158127308, "step": 80, "step_time": 34.26222573500672 }, { "clip_ratio/high_max": 0.0020792175055248663, "clip_ratio/high_mean": 0.0020792175055248663, "clip_ratio/low_mean": 0.0009640870466682827, "clip_ratio/low_min": 0.0009640870466682827, "clip_ratio/region_mean": 0.0030433045467361808, "completions/clipped_ratio": 0.53125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 1375.0, "completions/mean_terminated_length": 902.4000244140625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.7120202546939254, "epoch": 0.0864461045891142, "frac_reward_zero_std": 0.375, "grad_norm": 0.030733034014701843, "kl": 0.0479708646889776, "learning_rate": 4.980160148265065e-05, "loss": 0.0151, "num_tokens": 6956258.0, "reward": 1.1539629697799683, "reward_std": 0.4054814875125885, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.5531817674636841, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19453126192092896, "rewards/format_reward_func/std": 0.01765984110534191, "sampling/importance_sampling_ratio/max": 2.2495176792144775, "sampling/importance_sampling_ratio/mean": 0.3457251489162445, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.0360052585601807, "sampling/sampling_logp_difference/mean": 0.017288144677877426, "step": 81, "step_time": 33.95203949999632 }, { "clip_ratio/high_max": 0.0021893332159379497, "clip_ratio/high_mean": 0.0021893332159379497, "clip_ratio/low_mean": 0.00087001343126758, "clip_ratio/low_min": 0.00087001343126758, "clip_ratio/region_mean": 0.0030593466508435085, "completions/clipped_ratio": 0.4375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 1194.96875, "completions/mean_terminated_length": 730.6111450195312, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.8299798853695393, "epoch": 0.08751334044823907, "frac_reward_zero_std": 0.5, "grad_norm": 0.0877360925078392, "kl": 0.8868826098041609, "learning_rate": 4.979628282951403e-05, "loss": -0.0791, "num_tokens": 7051803.0, "reward": 1.080803632736206, "reward_std": 0.3137786388397217, "rewards/argmax_reward_func/mean": 0.34375, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.5401785373687744, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19687500596046448, "rewards/format_reward_func/std": 0.012296733446419239, "sampling/importance_sampling_ratio/max": 2.592806577682495, "sampling/importance_sampling_ratio/mean": 0.4991931915283203, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.2466020584106445, "sampling/sampling_logp_difference/mean": 0.02063864655792713, "step": 82, "step_time": 37.23553667100532 }, { "clip_ratio/high_max": 0.0019039881153730676, "clip_ratio/high_mean": 0.0019039881153730676, "clip_ratio/low_mean": 0.0008427306784142274, "clip_ratio/low_min": 0.0008427306784142274, "clip_ratio/region_mean": 0.0027467187865113374, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 1123.5, "completions/mean_terminated_length": 722.4000244140625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.845631561242044, "epoch": 0.08858057630736393, "frac_reward_zero_std": 0.4375, "grad_norm": 0.032937195152044296, "kl": 0.07954186655115336, "learning_rate": 4.9790894116782514e-05, "loss": 0.0268, "num_tokens": 7143199.0, "reward": 1.0204963684082031, "reward_std": 0.3557630777359009, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.4470588266849518, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 1.6345555782318115, "sampling/importance_sampling_ratio/mean": 0.4392532706260681, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2771567106246948, "sampling/sampling_logp_difference/mean": 0.020520983263850212, "step": 83, "step_time": 33.03453335099766 }, { "clip_ratio/high_max": 0.002472568790835794, "clip_ratio/high_mean": 0.002472568790835794, "clip_ratio/low_mean": 0.0009247518064512406, "clip_ratio/low_min": 0.0009247518064512406, "clip_ratio/region_mean": 0.003397320626390865, "completions/clipped_ratio": 0.4375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 1365.03125, "completions/mean_terminated_length": 1032.9444580078125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.7140422239899635, "epoch": 0.08964781216648879, "frac_reward_zero_std": 0.3125, "grad_norm": 0.04691598191857338, "kl": 0.038958370918408036, "learning_rate": 4.978543535968142e-05, "loss": 0.117, "num_tokens": 7232796.0, "reward": 1.1791534423828125, "reward_std": 0.4032718539237976, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.5143096446990967, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19609375298023224, "rewards/format_reward_func/std": 0.01569540426135063, "sampling/importance_sampling_ratio/max": 2.321978807449341, "sampling/importance_sampling_ratio/mean": 0.658952534198761, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3409075736999512, "sampling/sampling_logp_difference/mean": 0.017669104039669037, "step": 84, "step_time": 31.768684331998884 }, { "clip_ratio/high_max": 0.002397002841462381, "clip_ratio/high_mean": 0.002397002841462381, "clip_ratio/low_mean": 0.0007627184058947023, "clip_ratio/low_min": 0.0007627184058947023, "clip_ratio/region_mean": 0.003159721236443147, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 1159.03125, "completions/mean_terminated_length": 779.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.6516066081821918, "epoch": 0.09071504802561366, "frac_reward_zero_std": 0.375, "grad_norm": 0.024563396349549294, "kl": 0.046079780091531575, "learning_rate": 4.977990657363397e-05, "loss": 0.0012, "num_tokens": 7326407.0, "reward": 0.9005703926086426, "reward_std": 0.27952811121940613, "rewards/argmax_reward_func/mean": 0.3125, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.3997892141342163, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18828123807907104, "rewards/format_reward_func/std": 0.03418661653995514, "sampling/importance_sampling_ratio/max": 1.0403612852096558, "sampling/importance_sampling_ratio/mean": 0.3666849136352539, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7466635704040527, "sampling/sampling_logp_difference/mean": 0.01769557036459446, "step": 85, "step_time": 36.00609361400166 }, { "clip_ratio/high_max": 0.002377610871917568, "clip_ratio/high_mean": 0.002377610871917568, "clip_ratio/low_mean": 0.0004942612795275636, "clip_ratio/low_min": 0.0004942612795275636, "clip_ratio/region_mean": 0.0028718721514451317, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 1260.46875, "completions/mean_terminated_length": 791.4705810546875, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "entropy": 0.5514051113277674, "epoch": 0.09178228388473852, "frac_reward_zero_std": 0.5625, "grad_norm": 0.050481222569942474, "kl": 0.03795831714523956, "learning_rate": 4.977430777426122e-05, "loss": 0.0891, "num_tokens": 7411524.0, "reward": 1.01230788230896, "reward_std": 0.310464084148407, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.2818390727043152, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19921875, "rewards/format_reward_func/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 2.6737067699432373, "sampling/importance_sampling_ratio/mean": 0.549344003200531, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9908313751220703, "sampling/sampling_logp_difference/mean": 0.014772492460906506, "step": 86, "step_time": 31.255588858008196 }, { "clip_ratio/high_max": 0.002499055590305943, "clip_ratio/high_mean": 0.002499055590305943, "clip_ratio/low_mean": 0.0012309320518397726, "clip_ratio/low_min": 0.0012309320518397726, "clip_ratio/region_mean": 0.0037299877003533766, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 1077.9375, "completions/mean_terminated_length": 649.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.9862131159752607, "epoch": 0.09284951974386339, "frac_reward_zero_std": 0.3125, "grad_norm": 0.04871785640716553, "kl": 0.046208353829570115, "learning_rate": 4.976863897738207e-05, "loss": -0.1256, "num_tokens": 7505968.0, "reward": 1.2513530254364014, "reward_std": 0.49276503920555115, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.462290495634079, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.014807227998971939, "sampling/importance_sampling_ratio/max": 2.876906394958496, "sampling/importance_sampling_ratio/mean": 0.4937659204006195, "sampling/importance_sampling_ratio/min": 0.0008248560479842126, "sampling/sampling_logp_difference/max": 0.9392604827880859, "sampling/sampling_logp_difference/mean": 0.020357908681035042, "step": 87, "step_time": 33.16679031900094 }, { "clip_ratio/high_max": 0.0010840238792297896, "clip_ratio/high_mean": 0.0010840238792297896, "clip_ratio/low_mean": 0.0006964058229641523, "clip_ratio/low_min": 0.0006964058229641523, "clip_ratio/region_mean": 0.0017804297276597936, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 995.84375, "completions/mean_terminated_length": 518.1500244140625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 1.0829234141856432, "epoch": 0.09391675560298826, "frac_reward_zero_std": 0.5625, "grad_norm": 0.01890585571527481, "kl": 0.03959930606652051, "learning_rate": 4.9762900199013174e-05, "loss": 0.0267, "num_tokens": 7584361.0, "reward": 1.0997931957244873, "reward_std": 0.23091456294059753, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.43807435035705566, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19296875596046448, "rewards/format_reward_func/std": 0.024784859269857407, "sampling/importance_sampling_ratio/max": 2.6207568645477295, "sampling/importance_sampling_ratio/mean": 0.4287070631980896, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6905908584594727, "sampling/sampling_logp_difference/mean": 0.02188248373568058, "step": 88, "step_time": 29.830529162996754 }, { "clip_ratio/high_max": 0.00207410200528102, "clip_ratio/high_mean": 0.00207410200528102, "clip_ratio/low_mean": 0.0011334736045682803, "clip_ratio/low_min": 0.0011334736045682803, "clip_ratio/region_mean": 0.003207575617125258, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 946.125, "completions/mean_terminated_length": 615.1304321289062, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.6779058063402772, "epoch": 0.09498399146211313, "frac_reward_zero_std": 0.25, "grad_norm": 0.047559577971696854, "kl": 0.0415624959859997, "learning_rate": 4.975709145536892e-05, "loss": 0.1641, "num_tokens": 7667705.0, "reward": 1.2079377174377441, "reward_std": 0.4938699007034302, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.6087189316749573, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19296875596046448, "rewards/format_reward_func/std": 0.021283771842718124, "sampling/importance_sampling_ratio/max": 2.8416295051574707, "sampling/importance_sampling_ratio/mean": 0.6820238828659058, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8188488483428955, "sampling/sampling_logp_difference/mean": 0.01712602935731411, "step": 89, "step_time": 35.921584791003625 }, { "clip_ratio/high_max": 0.002041866857325658, "clip_ratio/high_mean": 0.002041866857325658, "clip_ratio/low_mean": 0.0010007300279539777, "clip_ratio/low_min": 0.0010007300279539777, "clip_ratio/region_mean": 0.0030425968616327737, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 976.03125, "completions/mean_terminated_length": 656.7391357421875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.9249678179621696, "epoch": 0.096051227321238, "frac_reward_zero_std": 0.4375, "grad_norm": 0.04099859297275543, "kl": 0.044552834588102996, "learning_rate": 4.975121276286136e-05, "loss": -0.0073, "num_tokens": 7756702.0, "reward": 1.0819536447525024, "reward_std": 0.35465824604034424, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.5092974305152893, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.009753772988915443, "sampling/importance_sampling_ratio/max": 2.7051336765289307, "sampling/importance_sampling_ratio/mean": 0.46350711584091187, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.033491849899292, "sampling/sampling_logp_difference/mean": 0.020451068878173828, "step": 90, "step_time": 35.76604353200128 }, { "clip_ratio/high_max": 0.002577034800196998, "clip_ratio/high_mean": 0.002577034800196998, "clip_ratio/low_mean": 0.0013091751607134938, "clip_ratio/low_min": 0.0013091751607134938, "clip_ratio/region_mean": 0.003886209975462407, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 882.625, "completions/mean_terminated_length": 526.7825927734375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.9903864152729511, "epoch": 0.09711846318036287, "frac_reward_zero_std": 0.5, "grad_norm": 0.06534211337566376, "kl": 0.049260309897363186, "learning_rate": 4.974526413810021e-05, "loss": -0.0592, "num_tokens": 7831960.0, "reward": 1.0865331888198853, "reward_std": 0.3115689158439636, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.4818456172943115, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.975571632385254, "sampling/importance_sampling_ratio/mean": 0.46007436513900757, "sampling/importance_sampling_ratio/min": 0.005405105650424957, "sampling/sampling_logp_difference/max": 0.8034868240356445, "sampling/sampling_logp_difference/mean": 0.02226456254720688, "step": 91, "step_time": 33.268395881998 }, { "clip_ratio/high_max": 0.003195944918843452, "clip_ratio/high_mean": 0.003195944918843452, "clip_ratio/low_mean": 0.0009464338691032026, "clip_ratio/low_min": 0.0009464338691032026, "clip_ratio/region_mean": 0.004142378793403623, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 777.0, "completions/mean_terminated_length": 542.7692260742188, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.8049357458949089, "epoch": 0.09818569903948772, "frac_reward_zero_std": 0.375, "grad_norm": 0.04052986949682236, "kl": 0.04888495116028935, "learning_rate": 4.973924559789276e-05, "loss": 0.0975, "num_tokens": 7905512.0, "reward": 1.2090131044387817, "reward_std": 0.44415146112442017, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.510575532913208, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.7534220218658447, "sampling/importance_sampling_ratio/mean": 0.5772168636322021, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.309014916419983, "sampling/sampling_logp_difference/mean": 0.01834295131266117, "step": 92, "step_time": 31.833109737997802 }, { "clip_ratio/high_max": 0.0029095441204844974, "clip_ratio/high_mean": 0.0029095441204844974, "clip_ratio/low_mean": 0.0009116914552578237, "clip_ratio/low_min": 0.0009116914552578237, "clip_ratio/region_mean": 0.0038212355721043423, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 761.65625, "completions/mean_terminated_length": 614.4642944335938, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 1.0621236115694046, "epoch": 0.09925293489861259, "frac_reward_zero_std": 0.3125, "grad_norm": 0.05450521036982536, "kl": 0.04244729154743254, "learning_rate": 4.973315715924382e-05, "loss": 0.1011, "num_tokens": 7974517.0, "reward": 1.1128746271133423, "reward_std": 0.4441514015197754, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.4144371449947357, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.514376163482666, "sampling/importance_sampling_ratio/mean": 0.6445499062538147, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8416061401367188, "sampling/sampling_logp_difference/mean": 0.02049885131418705, "step": 93, "step_time": 27.61737263899522 }, { "clip_ratio/high_max": 0.002693494694540277, "clip_ratio/high_mean": 0.002693494694540277, "clip_ratio/low_mean": 0.0020320932744652964, "clip_ratio/low_min": 0.0020320932744652964, "clip_ratio/region_mean": 0.0047255879690055735, "completions/clipped_ratio": 0.09375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 705.875, "completions/mean_terminated_length": 593.5172119140625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 1.0226356908679008, "epoch": 0.10032017075773746, "frac_reward_zero_std": 0.5, "grad_norm": 0.05851002782583237, "kl": 0.04262960248161107, "learning_rate": 4.972699883935572e-05, "loss": 0.102, "num_tokens": 8047227.0, "reward": 0.9998905658721924, "reward_std": 0.31267374753952026, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.3350468873977661, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19609375298023224, "rewards/format_reward_func/std": 0.01287247333675623, "sampling/importance_sampling_ratio/max": 2.599499464035034, "sampling/importance_sampling_ratio/mean": 0.7187550067901611, "sampling/importance_sampling_ratio/min": 0.002687035361304879, "sampling/sampling_logp_difference/max": 0.6941485404968262, "sampling/sampling_logp_difference/mean": 0.02295234054327011, "step": 94, "step_time": 30.962510417006342 }, { "clip_ratio/high_max": 0.004371957780676894, "clip_ratio/high_mean": 0.004371957780676894, "clip_ratio/low_mean": 0.0011369032799848355, "clip_ratio/low_min": 0.0011369032799848355, "clip_ratio/region_mean": 0.005508861017005984, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 524.40625, "completions/mean_terminated_length": 439.9000244140625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 1.1804856918752193, "epoch": 0.10138740661686232, "frac_reward_zero_std": 0.3125, "grad_norm": 0.038850247859954834, "kl": 0.05508435203228146, "learning_rate": 4.972077065562821e-05, "loss": -0.0536, "num_tokens": 8117132.0, "reward": 1.0417178869247437, "reward_std": 0.3281417489051819, "rewards/argmax_reward_func/mean": 0.34375, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.5112491250038147, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18671876192092896, "rewards/format_reward_func/std": 0.039136018604040146, "sampling/importance_sampling_ratio/max": 1.683869481086731, "sampling/importance_sampling_ratio/mean": 0.48085349798202515, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.715033769607544, "sampling/sampling_logp_difference/mean": 0.02466602437198162, "step": 95, "step_time": 33.11536462500226 }, { "clip_ratio/high_max": 0.0019475785811664537, "clip_ratio/high_mean": 0.0019475785811664537, "clip_ratio/low_mean": 0.0008571770886192098, "clip_ratio/low_min": 0.0008571770886192098, "clip_ratio/region_mean": 0.002804755698889494, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 590.5, "completions/mean_terminated_length": 510.4000244140625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.9665573053061962, "epoch": 0.10245464247598719, "frac_reward_zero_std": 0.5, "grad_norm": 0.06030852347612381, "kl": 0.05448542395606637, "learning_rate": 4.9714472625658464e-05, "loss": 0.0746, "num_tokens": 8196450.0, "reward": 0.9001810550689697, "reward_std": 0.18230096995830536, "rewards/argmax_reward_func/mean": 0.25, "rewards/argmax_reward_func/std": 0.4399413466453552, "rewards/criterion_gradient_reward_func/mean": 0.45721226930618286, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19296875596046448, "rewards/format_reward_func/std": 0.01707947812974453, "sampling/importance_sampling_ratio/max": 2.6693265438079834, "sampling/importance_sampling_ratio/mean": 0.8080523014068604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7579154968261719, "sampling/sampling_logp_difference/mean": 0.021842550486326218, "step": 96, "step_time": 33.02514458800215 }, { "clip_ratio/high_max": 0.0023519086025771685, "clip_ratio/high_mean": 0.0023519086025771685, "clip_ratio/low_mean": 0.0010026067029684782, "clip_ratio/low_min": 0.0010026067029684782, "clip_ratio/region_mean": 0.003354515298269689, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 682.125, "completions/mean_terminated_length": 476.59259033203125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 1.368406793102622, "epoch": 0.10352187833511206, "frac_reward_zero_std": 0.5, "grad_norm": 0.034665804356336594, "kl": 0.03559327143011615, "learning_rate": 4.970810476724097e-05, "loss": 0.0663, "num_tokens": 8279424.0, "reward": 1.0982459783554077, "reward_std": 0.3568679392337799, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.4005897641181946, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.009753772988915443, "sampling/importance_sampling_ratio/max": 1.7637882232666016, "sampling/importance_sampling_ratio/mean": 0.5529554486274719, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9159088134765625, "sampling/sampling_logp_difference/mean": 0.025673875585198402, "step": 97, "step_time": 38.278300575000685 }, { "clip_ratio/high_max": 0.003627997590228915, "clip_ratio/high_mean": 0.003627997590228915, "clip_ratio/low_mean": 0.001280266049434431, "clip_ratio/low_min": 0.001280266049434431, "clip_ratio/region_mean": 0.004908263639663346, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 593.9375, "completions/mean_terminated_length": 422.7857360839844, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.9734468050301075, "epoch": 0.10458911419423693, "frac_reward_zero_std": 0.375, "grad_norm": 0.05170489847660065, "kl": 0.0476768707158044, "learning_rate": 4.970166709836751e-05, "loss": -0.022, "num_tokens": 8352886.0, "reward": 1.0562182664871216, "reward_std": 0.44415146112442017, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.42028066515922546, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.6622631549835205, "sampling/importance_sampling_ratio/mean": 0.5990111827850342, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6912460327148438, "sampling/sampling_logp_difference/mean": 0.020409682765603065, "step": 98, "step_time": 33.26101536799797 }, { "clip_ratio/high_max": 0.0021224770898697898, "clip_ratio/high_mean": 0.0021224770898697898, "clip_ratio/low_mean": 0.0020291019827709533, "clip_ratio/low_min": 0.0020291019827709533, "clip_ratio/region_mean": 0.004151579043536913, "completions/clipped_ratio": 0.09375, "completions/max_length": 1792.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 493.40625, "completions/mean_terminated_length": 359.0689697265625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 1.1057712994515896, "epoch": 0.1056563500533618, "frac_reward_zero_std": 0.5, "grad_norm": 0.0813102126121521, "kl": 0.04953256109729409, "learning_rate": 4.969515963722714e-05, "loss": 0.0594, "num_tokens": 8417643.0, "reward": 1.1617546081542969, "reward_std": 0.19887375831604004, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.6023796796798706, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18437500298023224, "rewards/format_reward_func/std": 0.04989909008145332, "sampling/importance_sampling_ratio/max": 2.951803684234619, "sampling/importance_sampling_ratio/mean": 0.8987444639205933, "sampling/importance_sampling_ratio/min": 0.00022933883883524686, "sampling/sampling_logp_difference/max": 2.9977235794067383, "sampling/sampling_logp_difference/mean": 0.02729441598057747, "step": 99, "step_time": 26.848839408994536 }, { "clip_ratio/high_max": 0.0029996807061252184, "clip_ratio/high_mean": 0.0029996807061252184, "clip_ratio/low_mean": 0.0006065523011784535, "clip_ratio/low_min": 0.0006065523011784535, "clip_ratio/region_mean": 0.003606233007303672, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 532.75, "completions/mean_terminated_length": 492.1290283203125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 1.0328365676105022, "epoch": 0.10672358591248667, "frac_reward_zero_std": 0.5, "grad_norm": 0.056620825082063675, "kl": 0.05010396265424788, "learning_rate": 4.9688582402206075e-05, "loss": 0.0039, "num_tokens": 8492047.0, "reward": 1.1000282764434814, "reward_std": 0.2728990316390991, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.4679970145225525, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19453126192092896, "rewards/format_reward_func/std": 0.017659839242696762, "sampling/importance_sampling_ratio/max": 1.87975013256073, "sampling/importance_sampling_ratio/mean": 0.6443547606468201, "sampling/importance_sampling_ratio/min": 0.007840153761208057, "sampling/sampling_logp_difference/max": 0.8139629364013672, "sampling/sampling_logp_difference/mean": 0.02344057895243168, "step": 100, "step_time": 35.58332700599931 }, { "clip_ratio/high_max": 0.00347414800489787, "clip_ratio/high_mean": 0.00347414800489787, "clip_ratio/low_mean": 0.0005614701221929863, "clip_ratio/low_min": 0.0005614701221929863, "clip_ratio/region_mean": 0.004035618127090856, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 425.5625, "completions/mean_terminated_length": 334.4666748046875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 1.0921905748546124, "epoch": 0.10779082177161152, "frac_reward_zero_std": 0.4375, "grad_norm": 0.04831473529338837, "kl": 0.06928284675814211, "learning_rate": 4.968193541188768e-05, "loss": 0.0068, "num_tokens": 8555557.0, "reward": 1.1748414039611816, "reward_std": 0.4065863788127899, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.4498414099216461, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19375000894069672, "rewards/format_reward_func/std": 0.02106058970093727, "sampling/importance_sampling_ratio/max": 1.4021543264389038, "sampling/importance_sampling_ratio/mean": 0.5452498197555542, "sampling/importance_sampling_ratio/min": 0.02184658870100975, "sampling/sampling_logp_difference/max": 0.9317439794540405, "sampling/sampling_logp_difference/mean": 0.022775448858737946, "step": 101, "step_time": 29.740817445002904 }, { "clip_ratio/high_max": 0.002386968095379416, "clip_ratio/high_mean": 0.002386968095379416, "clip_ratio/low_mean": 0.0007727852062089369, "clip_ratio/low_min": 0.0007727852062089369, "clip_ratio/region_mean": 0.0031597533015883528, "completions/clipped_ratio": 0.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 350.34375, "completions/mean_terminated_length": 350.34375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 1.0847865156829357, "epoch": 0.10885805763073639, "frac_reward_zero_std": 0.5625, "grad_norm": 0.046428240835666656, "kl": 0.04981359315570444, "learning_rate": 4.9675218685052415e-05, "loss": 0.0202, "num_tokens": 8609044.0, "reward": 0.9879605770111084, "reward_std": 0.27400386333465576, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.23171059787273407, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19374999403953552, "rewards/format_reward_func/std": 0.02199706807732582, "sampling/importance_sampling_ratio/max": 2.4124064445495605, "sampling/importance_sampling_ratio/mean": 0.6805813312530518, "sampling/importance_sampling_ratio/min": 0.08003643900156021, "sampling/sampling_logp_difference/max": 0.4569840431213379, "sampling/sampling_logp_difference/mean": 0.02590150572359562, "step": 102, "step_time": 22.30004965899934 }, { "clip_ratio/high_max": 0.0031683267297921702, "clip_ratio/high_mean": 0.0031683267297921702, "clip_ratio/low_mean": 0.0032091461471281946, "clip_ratio/low_min": 0.0032091461471281946, "clip_ratio/region_mean": 0.006377472927852068, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 400.25, "completions/mean_terminated_length": 307.4666748046875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 1.3097773157060146, "epoch": 0.10992529348986126, "frac_reward_zero_std": 0.375, "grad_norm": 0.0673256665468216, "kl": 0.05567830195650458, "learning_rate": 4.9668432240677766e-05, "loss": 0.052, "num_tokens": 8673726.0, "reward": 1.3100965023040771, "reward_std": 0.408796101808548, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.5241590142250061, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19218750298023224, "rewards/format_reward_func/std": 0.03616578131914139, "sampling/importance_sampling_ratio/max": 2.8476669788360596, "sampling/importance_sampling_ratio/mean": 0.6960346698760986, "sampling/importance_sampling_ratio/min": 0.0013957922346889973, "sampling/sampling_logp_difference/max": 0.48442983627319336, "sampling/sampling_logp_difference/mean": 0.0285791028290987, "step": 103, "step_time": 31.744883055001992 }, { "clip_ratio/high_max": 0.003233394949347712, "clip_ratio/high_mean": 0.003233394949347712, "clip_ratio/low_mean": 0.0009915271657519042, "clip_ratio/low_min": 0.0009915271657519042, "clip_ratio/region_mean": 0.004224922129651532, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 421.34375, "completions/mean_terminated_length": 329.9666748046875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 1.1103278808295727, "epoch": 0.11099252934898612, "frac_reward_zero_std": 0.5, "grad_norm": 0.059645190834999084, "kl": 0.06692357524298131, "learning_rate": 4.96615760979382e-05, "loss": 0.0485, "num_tokens": 8736009.0, "reward": 1.241872787475586, "reward_std": 0.318198025226593, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.4559352397918701, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19218750298023224, "rewards/format_reward_func/std": 0.023277342319488525, "sampling/importance_sampling_ratio/max": 1.9665167331695557, "sampling/importance_sampling_ratio/mean": 0.570235013961792, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48363399505615234, "sampling/sampling_logp_difference/mean": 0.026485642418265343, "step": 104, "step_time": 25.998062878999917 }, { "clip_ratio/high_max": 0.004465410209377296, "clip_ratio/high_mean": 0.004465410209377296, "clip_ratio/low_mean": 0.0008689017267897725, "clip_ratio/low_min": 0.0008689017267897725, "clip_ratio/region_mean": 0.005334311856131535, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 450.65625, "completions/mean_terminated_length": 361.23333740234375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 1.1124612148851156, "epoch": 0.11205976520811099, "frac_reward_zero_std": 0.4375, "grad_norm": 0.07388127595186234, "kl": 0.061068268958479166, "learning_rate": 4.965465027620512e-05, "loss": -0.1074, "num_tokens": 8807042.0, "reward": 1.1730844974517822, "reward_std": 0.35465824604034424, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.41136571764945984, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19921875, "rewards/format_reward_func/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 2.3886969089508057, "sampling/importance_sampling_ratio/mean": 0.625559389591217, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7139627933502197, "sampling/sampling_logp_difference/mean": 0.02300747111439705, "step": 105, "step_time": 33.2616786969993 }, { "clip_ratio/high_max": 0.0037686872456106357, "clip_ratio/high_mean": 0.0037686872456106357, "clip_ratio/low_mean": 0.0013883583742426708, "clip_ratio/low_min": 0.0013883583742426708, "clip_ratio/region_mean": 0.0051570456198533066, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 363.25, "completions/mean_terminated_length": 317.1612854003906, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.9705239869654179, "epoch": 0.11312700106723586, "frac_reward_zero_std": 0.5, "grad_norm": 0.0998031347990036, "kl": 0.05363424040842801, "learning_rate": 4.9647654795046784e-05, "loss": 0.0829, "num_tokens": 8875590.0, "reward": 1.1306555271148682, "reward_std": 0.3082543611526489, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.40174928307533264, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765625894069672, "rewards/format_reward_func/std": 0.009753772988915443, "sampling/importance_sampling_ratio/max": 2.3087735176086426, "sampling/importance_sampling_ratio/mean": 0.8146755695343018, "sampling/importance_sampling_ratio/min": 0.05622253939509392, "sampling/sampling_logp_difference/max": 0.48807692527770996, "sampling/sampling_logp_difference/mean": 0.02410818822681904, "step": 106, "step_time": 32.04513001800842 }, { "clip_ratio/high_max": 0.0028044096106896177, "clip_ratio/high_mean": 0.0028044096106896177, "clip_ratio/low_mean": 0.0008175929542630911, "clip_ratio/low_min": 0.0008175929542630911, "clip_ratio/region_mean": 0.0036220026231603697, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 387.0, "completions/mean_terminated_length": 293.3333435058594, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 1.0023357346653938, "epoch": 0.11419423692636073, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03959713503718376, "kl": 0.06260226119775325, "learning_rate": 4.964058967422827e-05, "loss": -0.0439, "num_tokens": 8936628.0, "reward": 1.1155339479446411, "reward_std": 0.3181980550289154, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.3280339241027832, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19374999403953552, "rewards/format_reward_func/std": 0.027679037302732468, "sampling/importance_sampling_ratio/max": 2.899698495864868, "sampling/importance_sampling_ratio/mean": 0.5870722532272339, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5831270217895508, "sampling/sampling_logp_difference/mean": 0.02363421581685543, "step": 107, "step_time": 28.053487809003855 }, { "clip_ratio/high_max": 0.00317996644298546, "clip_ratio/high_mean": 0.00317996644298546, "clip_ratio/low_mean": 0.0016695811427780427, "clip_ratio/low_min": 0.0016695811427780427, "clip_ratio/region_mean": 0.004849547585763503, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 406.59375, "completions/mean_terminated_length": 361.9032287597656, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 1.0369208380579948, "epoch": 0.1152614727854856, "frac_reward_zero_std": 0.5625, "grad_norm": 0.06672753393650055, "kl": 0.06888616341166198, "learning_rate": 4.9633454933711434e-05, "loss": 0.0202, "num_tokens": 8998953.0, "reward": 0.9710675477981567, "reward_std": 0.2253902703523636, "rewards/argmax_reward_func/mean": 0.34375, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.4366925060749054, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19062501192092896, "rewards/format_reward_func/std": 0.026753051206469536, "sampling/importance_sampling_ratio/max": 2.809570550918579, "sampling/importance_sampling_ratio/mean": 0.6659361124038696, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5750114917755127, "sampling/sampling_logp_difference/mean": 0.025920961052179337, "step": 108, "step_time": 29.3246876910016 }, { "clip_ratio/high_max": 0.001785951288184151, "clip_ratio/high_mean": 0.001785951288184151, "clip_ratio/low_mean": 0.0010496080285520293, "clip_ratio/low_min": 0.0010496080285520293, "clip_ratio/region_mean": 0.002835559302184265, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 380.8125, "completions/mean_terminated_length": 335.2903137207031, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 1.1031884010881186, "epoch": 0.11632870864461047, "frac_reward_zero_std": 0.5625, "grad_norm": 0.08105583488941193, "kl": 0.07797365996520966, "learning_rate": 4.962625059365481e-05, "loss": -0.0582, "num_tokens": 9060035.0, "reward": 1.068750023841858, "reward_std": 0.30935922265052795, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.4000000059604645, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.967715263366699, "sampling/importance_sampling_ratio/mean": 0.750694751739502, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5803612470626831, "sampling/sampling_logp_difference/mean": 0.02450050227344036, "step": 109, "step_time": 26.006354548000672 }, { "clip_ratio/high_max": 0.0019244407303631306, "clip_ratio/high_mean": 0.0019244407303631306, "clip_ratio/low_mean": 0.0009063531906576827, "clip_ratio/low_min": 0.0009063531906576827, "clip_ratio/region_mean": 0.0028307939210208133, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 371.5, "completions/mean_terminated_length": 371.5, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.9633561205118895, "epoch": 0.11739594450373532, "frac_reward_zero_std": 0.6875, "grad_norm": 0.06063111498951912, "kl": 0.06688621733337641, "learning_rate": 4.96189766744136e-05, "loss": 0.0202, "num_tokens": 9124911.0, "reward": 1.3031442165374756, "reward_std": 0.22097086906433105, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.5093941688537598, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5598549842834473, "sampling/importance_sampling_ratio/mean": 0.9789856672286987, "sampling/importance_sampling_ratio/min": 0.03592727705836296, "sampling/sampling_logp_difference/max": 0.8145298957824707, "sampling/sampling_logp_difference/mean": 0.025027023628354073, "step": 110, "step_time": 29.203252994995637 }, { "clip_ratio/high_max": 0.004665557004045695, "clip_ratio/high_mean": 0.004665557004045695, "clip_ratio/low_mean": 0.0009134072897722945, "clip_ratio/low_min": 0.0009134072897722945, "clip_ratio/region_mean": 0.0055789642938179895, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 325.1875, "completions/mean_terminated_length": 277.8709716796875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.9741303212940693, "epoch": 0.11846318036286019, "frac_reward_zero_std": 0.5, "grad_norm": 0.08212444186210632, "kl": 0.06720549741294235, "learning_rate": 4.9611633196539584e-05, "loss": 0.0148, "num_tokens": 9190467.0, "reward": 1.130263090133667, "reward_std": 0.3535533547401428, "rewards/argmax_reward_func/mean": 0.625, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.3052631616592407, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2101218700408936, "sampling/importance_sampling_ratio/mean": 0.6723105907440186, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.585108757019043, "sampling/sampling_logp_difference/mean": 0.023886609822511673, "step": 111, "step_time": 34.072315775007155 }, { "clip_ratio/high_max": 0.004554668819764629, "clip_ratio/high_mean": 0.004554668819764629, "clip_ratio/low_mean": 0.003200437313353177, "clip_ratio/low_min": 0.003200437313353177, "clip_ratio/region_mean": 0.0077551060894620605, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 330.71875, "completions/mean_terminated_length": 283.58062744140625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.9905771557241678, "epoch": 0.11953041622198506, "frac_reward_zero_std": 0.375, "grad_norm": 0.08528739213943481, "kl": 0.08067304582800716, "learning_rate": 4.960422018078108e-05, "loss": -0.0399, "num_tokens": 9249156.0, "reward": 1.20307457447052, "reward_std": 0.4065863788127899, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.6030746102333069, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19374999403953552, "rewards/format_reward_func/std": 0.0353553406894207, "sampling/importance_sampling_ratio/max": 2.6736505031585693, "sampling/importance_sampling_ratio/mean": 0.6745803952217102, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7244689464569092, "sampling/sampling_logp_difference/mean": 0.02373790182173252, "step": 112, "step_time": 25.94527157000266 }, { "clip_ratio/high_max": 0.0017739472386892885, "clip_ratio/high_mean": 0.0017739472386892885, "clip_ratio/low_mean": 0.0004841708141611889, "clip_ratio/low_min": 0.0004841708141611889, "clip_ratio/region_mean": 0.0022581180528504774, "completions/clipped_ratio": 0.0, "completions/max_length": 1496.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 429.28125, "completions/mean_terminated_length": 429.28125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.8011012319475412, "epoch": 0.12059765208110992, "frac_reward_zero_std": 0.75, "grad_norm": 0.03992867469787598, "kl": 0.07776898518204689, "learning_rate": 4.959673764808287e-05, "loss": 0.0486, "num_tokens": 9313401.0, "reward": 1.0963722467422485, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.3963722586631775, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.051842451095581, "sampling/importance_sampling_ratio/mean": 0.6392670273780823, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9141734838485718, "sampling/sampling_logp_difference/mean": 0.02089775912463665, "step": 113, "step_time": 24.387624620998395 }, { "clip_ratio/high_max": 0.003405707626370713, "clip_ratio/high_mean": 0.003405707626370713, "clip_ratio/low_mean": 0.001092008016712498, "clip_ratio/low_min": 0.001092008016712498, "clip_ratio/region_mean": 0.004497715592151508, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 602.46875, "completions/mean_terminated_length": 382.1851806640625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.998635234311223, "epoch": 0.12166488794023479, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03901458904147148, "kl": 0.06171248806640506, "learning_rate": 4.958918561958617e-05, "loss": 0.0177, "num_tokens": 9378244.0, "reward": 1.0243855714797974, "reward_std": 0.267374724149704, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.3884480893611908, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.623746156692505, "sampling/importance_sampling_ratio/mean": 0.6553435325622559, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8145618438720703, "sampling/sampling_logp_difference/mean": 0.021904589608311653, "step": 114, "step_time": 26.7968572030004 }, { "clip_ratio/high_max": 0.005515464439667994, "clip_ratio/high_mean": 0.005515464439667994, "clip_ratio/low_mean": 0.0010109527938766405, "clip_ratio/low_min": 0.0010109527938766405, "clip_ratio/region_mean": 0.006526417233544635, "completions/clipped_ratio": 0.09375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 516.875, "completions/mean_terminated_length": 384.96551513671875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 1.0691484352573752, "epoch": 0.12273212379935966, "frac_reward_zero_std": 0.4375, "grad_norm": 0.05996319279074669, "kl": 0.0620142407133244, "learning_rate": 4.9581564116628523e-05, "loss": 0.028, "num_tokens": 9444710.0, "reward": 1.010143518447876, "reward_std": 0.31930291652679443, "rewards/argmax_reward_func/mean": 0.34375, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.4749872088432312, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19140625, "rewards/format_reward_func/std": 0.03626146540045738, "sampling/importance_sampling_ratio/max": 2.766547918319702, "sampling/importance_sampling_ratio/mean": 0.7734300494194031, "sampling/importance_sampling_ratio/min": 0.0027692459989339113, "sampling/sampling_logp_difference/max": 0.5949783325195312, "sampling/sampling_logp_difference/mean": 0.02184232696890831, "step": 115, "step_time": 30.327655709001192 }, { "clip_ratio/high_max": 0.003541136087733321, "clip_ratio/high_mean": 0.003541136087733321, "clip_ratio/low_mean": 0.0011848221474792808, "clip_ratio/low_min": 0.0011848221474792808, "clip_ratio/region_mean": 0.004725958278868347, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 549.0625, "completions/mean_terminated_length": 371.5000305175781, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.98213080316782, "epoch": 0.12379935965848453, "frac_reward_zero_std": 0.375, "grad_norm": 0.04023901745676994, "kl": 0.06319078349042684, "learning_rate": 4.957387316074377e-05, "loss": 0.0398, "num_tokens": 9516250.0, "reward": 1.1433804035186768, "reward_std": 0.367916464805603, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.5160366892814636, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18984374403953552, "rewards/format_reward_func/std": 0.03802555426955223, "sampling/importance_sampling_ratio/max": 2.1967613697052, "sampling/importance_sampling_ratio/mean": 0.5948576331138611, "sampling/importance_sampling_ratio/min": 0.022073544561862946, "sampling/sampling_logp_difference/max": 0.4954519271850586, "sampling/sampling_logp_difference/mean": 0.019835567101836205, "step": 116, "step_time": 29.416932692996852 }, { "clip_ratio/high_max": 0.0024515973491361365, "clip_ratio/high_mean": 0.0024515973491361365, "clip_ratio/low_mean": 0.0006957514633540995, "clip_ratio/low_min": 0.0006957514633540995, "clip_ratio/region_mean": 0.003147348812490236, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 555.4375, "completions/mean_terminated_length": 378.7857360839844, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 1.1055871862918139, "epoch": 0.1248665955176094, "frac_reward_zero_std": 0.5625, "grad_norm": 0.07684978097677231, "kl": 0.05847687873756513, "learning_rate": 4.956611277366201e-05, "loss": -0.0629, "num_tokens": 9587050.0, "reward": 1.3931300640106201, "reward_std": 0.3082543611526489, "rewards/argmax_reward_func/mean": 0.65625, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.5376612544059753, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19921875, "rewards/format_reward_func/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 2.82033109664917, "sampling/importance_sampling_ratio/mean": 0.8090525269508362, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7711284160614014, "sampling/sampling_logp_difference/mean": 0.021971844136714935, "step": 117, "step_time": 29.972688240000934 }, { "clip_ratio/high_max": 0.002545676274166908, "clip_ratio/high_mean": 0.002545676274166908, "clip_ratio/low_mean": 0.0010460948724357877, "clip_ratio/low_min": 0.0010460948724357877, "clip_ratio/region_mean": 0.003591771146602696, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 525.15625, "completions/mean_terminated_length": 440.70001220703125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.9550892356783152, "epoch": 0.12593383137673425, "frac_reward_zero_std": 0.375, "grad_norm": 0.07698798179626465, "kl": 0.05944695579819381, "learning_rate": 4.955828297730949e-05, "loss": 0.0648, "num_tokens": 9660851.0, "reward": 1.2184199094772339, "reward_std": 0.3679164946079254, "rewards/argmax_reward_func/mean": 0.625, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.40513864159584045, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18828125298023224, "rewards/format_reward_func/std": 0.026169713586568832, "sampling/importance_sampling_ratio/max": 2.9669137001037598, "sampling/importance_sampling_ratio/mean": 0.9082297682762146, "sampling/importance_sampling_ratio/min": 0.022533435374498367, "sampling/sampling_logp_difference/max": 0.7756257057189941, "sampling/sampling_logp_difference/mean": 0.022794779390096664, "step": 118, "step_time": 40.35908120799468 }, { "clip_ratio/high_max": 0.00464548752643168, "clip_ratio/high_mean": 0.00464548752643168, "clip_ratio/low_mean": 0.0012240043470228557, "clip_ratio/low_min": 0.0012240043470228557, "clip_ratio/region_mean": 0.005869491869816557, "completions/clipped_ratio": 0.0, "completions/max_length": 1459.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 398.90625, "completions/mean_terminated_length": 398.90625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.9100234769284725, "epoch": 0.12700106723585913, "frac_reward_zero_std": 0.5, "grad_norm": 0.09277813881635666, "kl": 0.07065701088868082, "learning_rate": 4.955038379380856e-05, "loss": 0.1244, "num_tokens": 9720870.0, "reward": 1.2975656986236572, "reward_std": 0.3148834705352783, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.507722020149231, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19609375298023224, "rewards/format_reward_func/std": 0.01569540426135063, "sampling/importance_sampling_ratio/max": 2.2433760166168213, "sampling/importance_sampling_ratio/mean": 0.7197118997573853, "sampling/importance_sampling_ratio/min": 0.011974100954830647, "sampling/sampling_logp_difference/max": 0.7053737640380859, "sampling/sampling_logp_difference/mean": 0.020651930943131447, "step": 119, "step_time": 23.974257086998477 }, { "clip_ratio/high_max": 0.003425468094064854, "clip_ratio/high_mean": 0.003425468094064854, "clip_ratio/low_mean": 0.0005896068432775792, "clip_ratio/low_min": 0.0005896068432775792, "clip_ratio/region_mean": 0.004015074879134772, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 621.78125, "completions/mean_terminated_length": 351.73077392578125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.9789111744612455, "epoch": 0.128068303094984, "frac_reward_zero_std": 0.5, "grad_norm": 0.04376758635044098, "kl": 0.0521186682744883, "learning_rate": 4.954241524547765e-05, "loss": -0.0216, "num_tokens": 9807467.0, "reward": 0.8254237771034241, "reward_std": 0.310464084148407, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.09495500475168228, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19921875, "rewards/format_reward_func/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 2.9695394039154053, "sampling/importance_sampling_ratio/mean": 0.722176194190979, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8316977024078369, "sampling/sampling_logp_difference/mean": 0.01907103694975376, "step": 120, "step_time": 36.72722796599737 }, { "clip_ratio/high_max": 0.0033577567228348926, "clip_ratio/high_mean": 0.0033577567228348926, "clip_ratio/low_mean": 0.0014870587838231586, "clip_ratio/low_min": 0.0014870587838231586, "clip_ratio/region_mean": 0.004844815506658051, "completions/clipped_ratio": 0.09375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 575.0625, "completions/mean_terminated_length": 449.17242431640625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.8426438122987747, "epoch": 0.12913553895410887, "frac_reward_zero_std": 0.5, "grad_norm": 0.0369657538831234, "kl": 0.05938100803177804, "learning_rate": 4.953437735483115e-05, "loss": 0.006, "num_tokens": 9875887.0, "reward": 0.8827331066131592, "reward_std": 0.2331242561340332, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.28820183873176575, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18828126788139343, "rewards/format_reward_func/std": 0.028386995196342468, "sampling/importance_sampling_ratio/max": 2.478381872177124, "sampling/importance_sampling_ratio/mean": 0.5948752164840698, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6152477264404297, "sampling/sampling_logp_difference/mean": 0.018781252205371857, "step": 121, "step_time": 27.287712673998612 }, { "clip_ratio/high_max": 0.0027174153074156493, "clip_ratio/high_mean": 0.0027174153074156493, "clip_ratio/low_mean": 0.0012910364603158087, "clip_ratio/low_min": 0.0012910364603158087, "clip_ratio/region_mean": 0.004008451767731458, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 375.6875, "completions/mean_terminated_length": 330.0, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.8830891419202089, "epoch": 0.13020277481323372, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09697040170431137, "kl": 0.11202599387615919, "learning_rate": 4.9526270144579354e-05, "loss": -0.1124, "num_tokens": 9944367.0, "reward": 1.114847183227539, "reward_std": 0.13589707016944885, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.4500034749507904, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19609375298023224, "rewards/format_reward_func/std": 0.01287247333675623, "sampling/importance_sampling_ratio/max": 2.879899024963379, "sampling/importance_sampling_ratio/mean": 0.820202112197876, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7829980850219727, "sampling/sampling_logp_difference/mean": 0.022768940776586533, "step": 122, "step_time": 29.95415536699693 }, { "clip_ratio/high_max": 0.0015540734784735832, "clip_ratio/high_mean": 0.0015540734784735832, "clip_ratio/low_mean": 0.0010834421627805568, "clip_ratio/low_min": 0.0010834421627805568, "clip_ratio/region_mean": 0.0026375156885478646, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 536.5, "completions/mean_terminated_length": 304.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.7924980334937572, "epoch": 0.13127001067235858, "frac_reward_zero_std": 0.5, "grad_norm": 0.06026678904891014, "kl": 0.05491509207058698, "learning_rate": 4.951809363762846e-05, "loss": 0.102, "num_tokens": 10016819.0, "reward": 1.222501277923584, "reward_std": 0.2728990316390991, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.5920324325561523, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19296875596046448, "rewards/format_reward_func/std": 0.022210843861103058, "sampling/importance_sampling_ratio/max": 2.5236473083496094, "sampling/importance_sampling_ratio/mean": 0.8351523876190186, "sampling/importance_sampling_ratio/min": 0.029060667380690575, "sampling/sampling_logp_difference/max": 0.7091732025146484, "sampling/sampling_logp_difference/mean": 0.01733223721385002, "step": 123, "step_time": 30.291773711998758 }, { "clip_ratio/high_max": 0.0034466692886780947, "clip_ratio/high_mean": 0.0034466692886780947, "clip_ratio/low_mean": 0.000711643808244844, "clip_ratio/low_min": 0.000711643808244844, "clip_ratio/region_mean": 0.0041583131005609175, "completions/clipped_ratio": 0.09375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 686.5625, "completions/mean_terminated_length": 572.2069091796875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.6795778442174196, "epoch": 0.13233724653148346, "frac_reward_zero_std": 0.375, "grad_norm": 0.08347150683403015, "kl": 0.04900388850364834, "learning_rate": 4.9509847857080405e-05, "loss": -0.1198, "num_tokens": 10089215.0, "reward": 0.8449986577033997, "reward_std": 0.36570677161216736, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.27859237790107727, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19140625, "rewards/format_reward_func/std": 0.037356920540332794, "sampling/importance_sampling_ratio/max": 2.7632079124450684, "sampling/importance_sampling_ratio/mean": 0.9471502304077148, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5956296920776367, "sampling/sampling_logp_difference/mean": 0.015877146273851395, "step": 124, "step_time": 27.693548998997358 }, { "clip_ratio/high_max": 0.0029566571975010447, "clip_ratio/high_mean": 0.0029566571975010447, "clip_ratio/low_mean": 0.0007096172812453005, "clip_ratio/low_min": 0.0007096172812453005, "clip_ratio/region_mean": 0.0036662744787463453, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 651.96875, "completions/mean_terminated_length": 489.107177734375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.8111472092568874, "epoch": 0.13340448239060831, "frac_reward_zero_std": 0.5625, "grad_norm": 0.051503267139196396, "kl": 0.053361350670456886, "learning_rate": 4.950153282623289e-05, "loss": 0.0172, "num_tokens": 10166644.0, "reward": 1.3040904998779297, "reward_std": 0.3170931935310364, "rewards/argmax_reward_func/mean": 0.65625, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.45330923795700073, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19453124701976776, "rewards/format_reward_func/std": 0.02175224758684635, "sampling/importance_sampling_ratio/max": 2.834838628768921, "sampling/importance_sampling_ratio/mean": 0.8119727373123169, "sampling/importance_sampling_ratio/min": 0.008364823646843433, "sampling/sampling_logp_difference/max": 0.5317134857177734, "sampling/sampling_logp_difference/mean": 0.019595004618167877, "step": 125, "step_time": 36.693285681994894 }, { "clip_ratio/high_max": 0.002940320315246936, "clip_ratio/high_mean": 0.002940320315246936, "clip_ratio/low_mean": 0.0019635577700682916, "clip_ratio/low_min": 0.0019635577700682916, "clip_ratio/region_mean": 0.004903878092591185, "completions/clipped_ratio": 0.03125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 533.21875, "completions/mean_terminated_length": 492.6128845214844, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.828266428783536, "epoch": 0.1344717182497332, "frac_reward_zero_std": 0.4375, "grad_norm": 0.0664706900715828, "kl": 0.06305320328101516, "learning_rate": 4.949314856857926e-05, "loss": 0.0956, "num_tokens": 10231365.0, "reward": 1.0279117822647095, "reward_std": 0.36128735542297363, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.2708805203437805, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19453124701976776, "rewards/format_reward_func/std": 0.017659839242696762, "sampling/importance_sampling_ratio/max": 2.6654839515686035, "sampling/importance_sampling_ratio/mean": 0.6876707077026367, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46346187591552734, "sampling/sampling_logp_difference/mean": 0.018559055402874947, "step": 126, "step_time": 35.92191163499774 }, { "clip_ratio/high_max": 0.0010587005235720426, "clip_ratio/high_mean": 0.0010587005235720426, "clip_ratio/low_mean": 0.0005500477636815049, "clip_ratio/low_min": 0.0005500477636815049, "clip_ratio/region_mean": 0.0016087482872535475, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 829.8125, "completions/mean_terminated_length": 392.4545593261719, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.8924076501280069, "epoch": 0.13553895410885805, "frac_reward_zero_std": 0.6875, "grad_norm": 0.026612555608153343, "kl": 0.04729558911640197, "learning_rate": 4.948469510780845e-05, "loss": -0.0465, "num_tokens": 10311107.0, "reward": 1.3069833517074585, "reward_std": 0.1800912618637085, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.5468271374702454, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.013258252292871475, "sampling/importance_sampling_ratio/max": 1.704622745513916, "sampling/importance_sampling_ratio/mean": 0.5863226056098938, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.732081413269043, "sampling/sampling_logp_difference/mean": 0.017684902995824814, "step": 127, "step_time": 33.99609644699376 }, { "clip_ratio/high_max": 0.00350215716753155, "clip_ratio/high_mean": 0.00350215716753155, "clip_ratio/low_mean": 0.002205348575444077, "clip_ratio/low_min": 0.002205348575444077, "clip_ratio/region_mean": 0.005707505742975627, "completions/clipped_ratio": 0.0625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 528.71875, "completions/mean_terminated_length": 444.5000305175781, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.7467383118346334, "epoch": 0.13660618996798293, "frac_reward_zero_std": 0.4375, "grad_norm": 0.041136931627988815, "kl": 0.0763125594239682, "learning_rate": 4.947617246780494e-05, "loss": -0.0176, "num_tokens": 10392726.0, "reward": 1.265879511833191, "reward_std": 0.3999572694301605, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.4736919105052948, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.9850964546203613, "sampling/importance_sampling_ratio/mean": 0.6692011952400208, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1813938617706299, "sampling/sampling_logp_difference/mean": 0.01915297843515873, "step": 128, "step_time": 31.954205230000298 }, { "clip_ratio/high_max": 0.0018369901627011131, "clip_ratio/high_mean": 0.0018369901627011131, "clip_ratio/low_mean": 0.0014763461003894918, "clip_ratio/low_min": 0.0014763461003894918, "clip_ratio/region_mean": 0.003313336295832414, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 748.40625, "completions/mean_terminated_length": 555.1481323242188, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.8064903542399406, "epoch": 0.1376734258271078, "frac_reward_zero_std": 0.3125, "grad_norm": 0.03717450797557831, "kl": 0.05916215182514861, "learning_rate": 4.9467580672648656e-05, "loss": 0.0591, "num_tokens": 10471587.0, "reward": 1.1479864120483398, "reward_std": 0.44415146112442017, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.4495489001274109, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.313582181930542, "sampling/importance_sampling_ratio/mean": 0.6337773203849792, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7951787710189819, "sampling/sampling_logp_difference/mean": 0.01652953214943409, "step": 129, "step_time": 35.183144898001046 }, { "clip_ratio/high_max": 0.002275940940307919, "clip_ratio/high_mean": 0.002275940940307919, "clip_ratio/low_mean": 0.0006899431064084638, "clip_ratio/low_min": 0.0006899431064084638, "clip_ratio/region_mean": 0.0029658840467163827, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 810.9375, "completions/mean_terminated_length": 629.25927734375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.5397803895175457, "epoch": 0.13874066168623267, "frac_reward_zero_std": 0.5625, "grad_norm": 0.04783239960670471, "kl": 0.05473995057400316, "learning_rate": 4.945891974661491e-05, "loss": 0.1138, "num_tokens": 10552715.0, "reward": 1.073197364807129, "reward_std": 0.26626986265182495, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.5021036863327026, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19609375298023224, "rewards/format_reward_func/std": 0.01569540426135063, "sampling/importance_sampling_ratio/max": 2.390848159790039, "sampling/importance_sampling_ratio/mean": 0.7553216814994812, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6297717094421387, "sampling/sampling_logp_difference/mean": 0.013600403442978859, "step": 130, "step_time": 33.794763528996555 }, { "clip_ratio/high_max": 0.0014762446517124772, "clip_ratio/high_mean": 0.0014762446517124772, "clip_ratio/low_mean": 0.0005643110198434442, "clip_ratio/low_min": 0.0005643110198434442, "clip_ratio/region_mean": 0.0020405556715559214, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 673.40625, "completions/mean_terminated_length": 466.25927734375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.8632239159196615, "epoch": 0.13980789754535752, "frac_reward_zero_std": 0.6875, "grad_norm": 0.044139567762613297, "kl": 0.05779507884290069, "learning_rate": 4.9450189714174345e-05, "loss": -0.0449, "num_tokens": 10630452.0, "reward": 1.1574947834014893, "reward_std": 0.22428542375564575, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.3660885691642761, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.013258252292871475, "sampling/importance_sampling_ratio/max": 2.723970890045166, "sampling/importance_sampling_ratio/mean": 0.7562675476074219, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9669257402420044, "sampling/sampling_logp_difference/mean": 0.01831827685236931, "step": 131, "step_time": 36.77160691000245 }, { "clip_ratio/high_max": 0.0038935016564209945, "clip_ratio/high_mean": 0.0038935016564209945, "clip_ratio/low_mean": 0.0032629642337269615, "clip_ratio/low_min": 0.0032629642337269615, "clip_ratio/region_mean": 0.00715646587923402, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 614.625, "completions/mean_terminated_length": 396.59259033203125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.698557292111218, "epoch": 0.14087513340448238, "frac_reward_zero_std": 0.25, "grad_norm": 0.06649530678987503, "kl": 0.06460206967312843, "learning_rate": 4.9441390599992864e-05, "loss": -0.0962, "num_tokens": 10699970.0, "reward": 0.9352280497550964, "reward_std": 0.37675532698631287, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.38288429379463196, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.17734375596046448, "rewards/format_reward_func/std": 0.047671325504779816, "sampling/importance_sampling_ratio/max": 2.9787545204162598, "sampling/importance_sampling_ratio/mean": 0.8989795446395874, "sampling/importance_sampling_ratio/min": 0.059735193848609924, "sampling/sampling_logp_difference/max": 0.6685388088226318, "sampling/sampling_logp_difference/mean": 0.016789959743618965, "step": 132, "step_time": 41.28456522200031 }, { "clip_ratio/high_max": 0.0008375017969228793, "clip_ratio/high_mean": 0.0008375017969228793, "clip_ratio/low_mean": 0.0005156294246262405, "clip_ratio/low_min": 0.0005156294246262405, "clip_ratio/region_mean": 0.0013531312251870986, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 886.75, "completions/mean_terminated_length": 677.84619140625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.6656341068446636, "epoch": 0.14194236926360726, "frac_reward_zero_std": 0.6875, "grad_norm": 0.04334983974695206, "kl": 0.04454635572619736, "learning_rate": 4.943252242893154e-05, "loss": -0.1636, "num_tokens": 10776742.0, "reward": 1.167112946510315, "reward_std": 0.22428542375564575, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.4382067322731018, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.013258252292871475, "sampling/importance_sampling_ratio/max": 1.8704473972320557, "sampling/importance_sampling_ratio/mean": 0.5243113040924072, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48828887939453125, "sampling/sampling_logp_difference/mean": 0.015098264440894127, "step": 133, "step_time": 28.56244768599754 }, { "clip_ratio/high_max": 0.002587135626527015, "clip_ratio/high_mean": 0.002587135626527015, "clip_ratio/low_mean": 0.0006815311062382534, "clip_ratio/low_min": 0.0006815311062382534, "clip_ratio/region_mean": 0.0032686667327652685, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 740.34375, "completions/mean_terminated_length": 545.5925903320312, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 1.0465656332671642, "epoch": 0.14300960512273211, "frac_reward_zero_std": 0.5, "grad_norm": 0.034073300659656525, "kl": 0.05758290330413729, "learning_rate": 4.9423585226046576e-05, "loss": -0.0577, "num_tokens": 10852651.0, "reward": 1.2155845165252686, "reward_std": 0.3137786388397217, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.4249594807624817, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19687500596046448, "rewards/format_reward_func/std": 0.0176776684820652, "sampling/importance_sampling_ratio/max": 1.9634289741516113, "sampling/importance_sampling_ratio/mean": 0.5235851407051086, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6791582107543945, "sampling/sampling_logp_difference/mean": 0.022920683026313782, "step": 134, "step_time": 33.20963674000268 }, { "clip_ratio/high_max": 0.0019263344875071198, "clip_ratio/high_mean": 0.0019263344875071198, "clip_ratio/low_mean": 0.00044245863318792544, "clip_ratio/low_min": 0.00044245863318792544, "clip_ratio/region_mean": 0.002368793127971003, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1637.0, "completions/mean_length": 735.6875, "completions/mean_terminated_length": 491.923095703125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.6959915962070227, "epoch": 0.144076840981857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.021606730297207832, "kl": 0.07013013097457588, "learning_rate": 4.9414579016589197e-05, "loss": 0.0169, "num_tokens": 10934387.0, "reward": 1.2170027494430542, "reward_std": 0.3181980550289154, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.4295027554035187, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19374999403953552, "rewards/format_reward_func/std": 0.0353553406894207, "sampling/importance_sampling_ratio/max": 1.8493765592575073, "sampling/importance_sampling_ratio/mean": 0.5107149481773376, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0909366607666016, "sampling/sampling_logp_difference/mean": 0.016207415610551834, "step": 135, "step_time": 34.69813160099693 }, { "clip_ratio/high_max": 0.0019691415182023775, "clip_ratio/high_mean": 0.0019691415182023775, "clip_ratio/low_mean": 0.0006198141782078892, "clip_ratio/low_min": 0.0006198141782078892, "clip_ratio/region_mean": 0.0025889557218761183, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 775.15625, "completions/mean_terminated_length": 629.8928833007812, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.6313972799107432, "epoch": 0.14514407684098185, "frac_reward_zero_std": 0.5625, "grad_norm": 0.05793055519461632, "kl": 0.060856341500766575, "learning_rate": 4.940550382600562e-05, "loss": -0.181, "num_tokens": 11016278.0, "reward": 1.2864928245544434, "reward_std": 0.31267377734184265, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.4950864315032959, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765625894069672, "rewards/format_reward_func/std": 0.009753772988915443, "sampling/importance_sampling_ratio/max": 2.7924816608428955, "sampling/importance_sampling_ratio/mean": 0.6663376092910767, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6528959274291992, "sampling/sampling_logp_difference/mean": 0.01782122254371643, "step": 136, "step_time": 33.99964887399619 }, { "clip_ratio/high_max": 0.0024557073920732364, "clip_ratio/high_mean": 0.0024557073920732364, "clip_ratio/low_mean": 0.0012262582167750224, "clip_ratio/low_min": 0.0012262582167750224, "clip_ratio/region_mean": 0.0036819656015723012, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 744.59375, "completions/mean_terminated_length": 594.9642944335938, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5440439824014902, "epoch": 0.14621131270010673, "frac_reward_zero_std": 0.4375, "grad_norm": 0.04474367946386337, "kl": 0.06304056313820183, "learning_rate": 4.939635967993696e-05, "loss": 0.0562, "num_tokens": 11085515.0, "reward": 1.1324188709259033, "reward_std": 0.40437665581703186, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.4683564603328705, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.018445102497935295, "sampling/importance_sampling_ratio/max": 2.1578822135925293, "sampling/importance_sampling_ratio/mean": 0.5799423456192017, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5586528778076172, "sampling/sampling_logp_difference/mean": 0.014404837042093277, "step": 137, "step_time": 27.31567918100336 }, { "clip_ratio/high_max": 0.000578813451284077, "clip_ratio/high_mean": 0.000578813451284077, "clip_ratio/low_mean": 0.001179462797153974, "clip_ratio/low_min": 0.001179462797153974, "clip_ratio/region_mean": 0.001758276248438051, "completions/clipped_ratio": 0.4375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 1082.34375, "completions/mean_terminated_length": 530.388916015625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.6007820046506822, "epoch": 0.1472785485592316, "frac_reward_zero_std": 0.6875, "grad_norm": 0.030754202976822853, "kl": 0.04517504223622382, "learning_rate": 4.938714660421913e-05, "loss": -0.1258, "num_tokens": 11164116.0, "reward": 0.8630474209785461, "reward_std": 0.183405801653862, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.23023495078086853, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.018445102497935295, "sampling/importance_sampling_ratio/max": 2.5172057151794434, "sampling/importance_sampling_ratio/mean": 0.6790514588356018, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5966720581054688, "sampling/sampling_logp_difference/mean": 0.013072899542748928, "step": 138, "step_time": 29.660218309998527 }, { "clip_ratio/high_max": 0.0013592173454526346, "clip_ratio/high_mean": 0.0013592173454526346, "clip_ratio/low_mean": 0.0012628794174815994, "clip_ratio/low_min": 0.0012628794174815994, "clip_ratio/region_mean": 0.0026220967520202976, "completions/clipped_ratio": 0.25, "completions/max_length": 1792.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 894.1875, "completions/mean_terminated_length": 594.9166870117188, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.5647016791626811, "epoch": 0.14834578441835647, "frac_reward_zero_std": 0.4375, "grad_norm": 0.021024126559495926, "kl": 0.04520266386680305, "learning_rate": 4.937786462488284e-05, "loss": 0.0864, "num_tokens": 11238918.0, "reward": 1.247377634048462, "reward_std": 0.31267374753952026, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.518471360206604, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765625894069672, "rewards/format_reward_func/std": 0.009753772988915443, "sampling/importance_sampling_ratio/max": 2.3051271438598633, "sampling/importance_sampling_ratio/mean": 0.502336323261261, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9189271926879883, "sampling/sampling_logp_difference/mean": 0.013351859524846077, "step": 139, "step_time": 29.018333888996494 }, { "clip_ratio/high_max": 0.001373249839161872, "clip_ratio/high_mean": 0.001373249839161872, "clip_ratio/low_mean": 0.0004147420695517212, "clip_ratio/low_min": 0.0004147420695517212, "clip_ratio/region_mean": 0.0017879919105325826, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 1192.15625, "completions/mean_terminated_length": 877.952392578125, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "entropy": 0.604381981305778, "epoch": 0.14941302027748132, "frac_reward_zero_std": 0.6875, "grad_norm": 0.026798900216817856, "kl": 0.042868462507613, "learning_rate": 4.9368513768153454e-05, "loss": 0.0615, "num_tokens": 11328703.0, "reward": 1.3468117713928223, "reward_std": 0.22649511694908142, "rewards/argmax_reward_func/mean": 0.65625, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.494468092918396, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19609375298023224, "rewards/format_reward_func/std": 0.01569540426135063, "sampling/importance_sampling_ratio/max": 2.4450507164001465, "sampling/importance_sampling_ratio/mean": 0.6139612793922424, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3755264282226562, "sampling/sampling_logp_difference/mean": 0.014979476109147072, "step": 140, "step_time": 30.295391252000627 }, { "clip_ratio/high_max": 0.0010080283973366022, "clip_ratio/high_mean": 0.0010080283973366022, "clip_ratio/low_mean": 0.0005281970152282156, "clip_ratio/low_min": 0.0005281970152282156, "clip_ratio/region_mean": 0.0015362254125648178, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 1168.6875, "completions/mean_terminated_length": 794.7000122070312, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4784691845998168, "epoch": 0.15048025613660618, "frac_reward_zero_std": 0.625, "grad_norm": 0.018744932487607002, "kl": 0.05686362599954009, "learning_rate": 4.935909406045095e-05, "loss": -0.0729, "num_tokens": 11424317.0, "reward": 1.3515886068344116, "reward_std": 0.268479585647583, "rewards/argmax_reward_func/mean": 0.75, "rewards/argmax_reward_func/std": 0.4399413466453552, "rewards/criterion_gradient_reward_func/mean": 0.4039323031902313, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.013258252292871475, "sampling/importance_sampling_ratio/max": 2.722687005996704, "sampling/importance_sampling_ratio/mean": 0.7193788290023804, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0444092750549316, "sampling/sampling_logp_difference/mean": 0.01098846085369587, "step": 141, "step_time": 34.060510299003 }, { "clip_ratio/high_max": 0.0019873030178132467, "clip_ratio/high_mean": 0.0019873030178132467, "clip_ratio/low_mean": 0.0005527056746359449, "clip_ratio/low_min": 0.0005527056746359449, "clip_ratio/region_mean": 0.0025400086924491916, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 1155.90625, "completions/mean_terminated_length": 594.6470336914062, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.5935456287115812, "epoch": 0.15154749199573106, "frac_reward_zero_std": 0.5625, "grad_norm": 0.020840080454945564, "kl": 0.04174042830709368, "learning_rate": 4.9349605528389845e-05, "loss": -0.0381, "num_tokens": 11515768.0, "reward": 0.9073436260223389, "reward_std": 0.30935922265052795, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.30109357833862305, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5642244815826416, "sampling/importance_sampling_ratio/mean": 0.6683065295219421, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3415629863739014, "sampling/sampling_logp_difference/mean": 0.013238091953098774, "step": 142, "step_time": 39.64482545399733 }, { "clip_ratio/high_max": 0.001203391708259005, "clip_ratio/high_mean": 0.001203391708259005, "clip_ratio/low_mean": 0.0005386820303101558, "clip_ratio/low_min": 0.0005386820303101558, "clip_ratio/region_mean": 0.0017420737276552245, "completions/clipped_ratio": 0.53125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 1227.4375, "completions/mean_terminated_length": 587.6000366210938, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.5291084330528975, "epoch": 0.1526147278548559, "frac_reward_zero_std": 0.6875, "grad_norm": 0.021302150562405586, "kl": 0.052715251222252846, "learning_rate": 4.9340048198779086e-05, "loss": -0.0186, "num_tokens": 11602862.0, "reward": 1.116316795349121, "reward_std": 0.22097085416316986, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.4475668668746948, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7378108501434326, "sampling/importance_sampling_ratio/mean": 0.8082627654075623, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5270624160766602, "sampling/sampling_logp_difference/mean": 0.012704521417617798, "step": 143, "step_time": 31.130956973998764 }, { "clip_ratio/high_max": 0.0012461320584407076, "clip_ratio/high_mean": 0.0012461320584407076, "clip_ratio/low_mean": 0.0008264939497166779, "clip_ratio/low_min": 0.0008264939497166779, "clip_ratio/region_mean": 0.0020726260081573855, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 1038.15625, "completions/mean_terminated_length": 585.8500366210938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.5681149112060666, "epoch": 0.1536819637139808, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03235192969441414, "kl": 0.061176844174042344, "learning_rate": 4.9330422098622036e-05, "loss": 0.113, "num_tokens": 11686519.0, "reward": 0.9811115264892578, "reward_std": 0.30935919284820557, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.24986158311367035, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7073593139648438, "sampling/importance_sampling_ratio/mean": 0.5356861352920532, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5985562801361084, "sampling/sampling_logp_difference/mean": 0.013373913243412971, "step": 144, "step_time": 34.57526049899934 }, { "clip_ratio/high_max": 0.0023173607914941385, "clip_ratio/high_mean": 0.0023173607914941385, "clip_ratio/low_mean": 0.0005296957042446593, "clip_ratio/low_min": 0.0005296957042446593, "clip_ratio/region_mean": 0.0028470565030147554, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 1166.15625, "completions/mean_terminated_length": 613.941162109375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.3736153105273843, "epoch": 0.15474919957310565, "frac_reward_zero_std": 0.5, "grad_norm": 0.035123053938150406, "kl": 0.046433478011749685, "learning_rate": 4.932072725511635e-05, "loss": 0.0906, "num_tokens": 11776416.0, "reward": 1.0273255109786987, "reward_std": 0.3535533845424652, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.3898255228996277, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8004015684127808, "sampling/importance_sampling_ratio/mean": 0.6265352368354797, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.938629150390625, "sampling/sampling_logp_difference/mean": 0.010657379403710365, "step": 145, "step_time": 37.168279140994855 }, { "clip_ratio/high_max": 0.0010037496904260479, "clip_ratio/high_mean": 0.0010037496904260479, "clip_ratio/low_mean": 0.001143164397944929, "clip_ratio/low_min": 0.001143164397944929, "clip_ratio/region_mean": 0.0021469140956469346, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 1156.46875, "completions/mean_terminated_length": 595.7058715820312, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.5426225787959993, "epoch": 0.15581643543223053, "frac_reward_zero_std": 0.5, "grad_norm": 0.018576569855213165, "kl": 0.05040669138543308, "learning_rate": 4.93109636956539e-05, "loss": 0.0235, "num_tokens": 11869997.0, "reward": 1.212945580482483, "reward_std": 0.3535533845424652, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.45044559240341187, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.42446231842041, "sampling/importance_sampling_ratio/mean": 0.7011717557907104, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8561782836914062, "sampling/sampling_logp_difference/mean": 0.012058618478477001, "step": 146, "step_time": 33.13332339200315 }, { "clip_ratio/high_max": 0.0003975275612901896, "clip_ratio/high_mean": 0.0003975275612901896, "clip_ratio/low_mean": 0.0010127288987860084, "clip_ratio/low_min": 0.0010127288987860084, "clip_ratio/region_mean": 0.001410256460076198, "completions/clipped_ratio": 0.25, "completions/max_length": 1792.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 1048.4375, "completions/mean_terminated_length": 800.5833740234375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.48649877961724997, "epoch": 0.1568836712913554, "frac_reward_zero_std": 0.8125, "grad_norm": 0.021685024723410606, "kl": 0.05727243726141751, "learning_rate": 4.9301131447820715e-05, "loss": -0.0084, "num_tokens": 11960303.0, "reward": 0.9041178226470947, "reward_std": 0.09501747041940689, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.3338053822517395, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.018445102497935295, "sampling/importance_sampling_ratio/max": 2.828810214996338, "sampling/importance_sampling_ratio/mean": 0.6272746920585632, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7491645812988281, "sampling/sampling_logp_difference/mean": 0.013969182036817074, "step": 147, "step_time": 35.84456920199591 }, { "clip_ratio/high_max": 0.0010107391281053424, "clip_ratio/high_mean": 0.0010107391281053424, "clip_ratio/low_mean": 0.00023409690766129643, "clip_ratio/low_min": 0.00023409690766129643, "clip_ratio/region_mean": 0.0012448360357666388, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 1309.09375, "completions/mean_terminated_length": 1019.3500366210938, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.39242542162537575, "epoch": 0.15795090715048027, "frac_reward_zero_std": 0.6875, "grad_norm": 0.010449844412505627, "kl": 0.04511894995812327, "learning_rate": 4.9291230539396905e-05, "loss": 0.0037, "num_tokens": 12038190.0, "reward": 1.069015622138977, "reward_std": 0.22428540885448456, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.3401094377040863, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.013258252292871475, "sampling/importance_sampling_ratio/max": 0.9428624510765076, "sampling/importance_sampling_ratio/mean": 0.34565699100494385, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1167781352996826, "sampling/sampling_logp_difference/mean": 0.010177024640142918, "step": 148, "step_time": 27.883940283998527 }, { "clip_ratio/high_max": 0.0009177881365758367, "clip_ratio/high_mean": 0.0009177881365758367, "clip_ratio/low_mean": 0.00036379447192302905, "clip_ratio/low_min": 0.00036379447192302905, "clip_ratio/region_mean": 0.0012815825975849293, "completions/clipped_ratio": 0.59375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 1315.78125, "completions/mean_terminated_length": 619.7692260742188, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.34854994621127844, "epoch": 0.15901814300960512, "frac_reward_zero_std": 0.625, "grad_norm": 0.015574318356812, "kl": 0.04610704246442765, "learning_rate": 4.9281260998356565e-05, "loss": -0.0268, "num_tokens": 12140111.0, "reward": 1.17301607131958, "reward_std": 0.2651650309562683, "rewards/argmax_reward_func/mean": 0.625, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.3480161428451538, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9408550262451172, "sampling/importance_sampling_ratio/mean": 0.543925404548645, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7348033785820007, "sampling/sampling_logp_difference/mean": 0.00943097285926342, "step": 149, "step_time": 39.600183260999984 }, { "clip_ratio/high_max": 0.0008507754209858831, "clip_ratio/high_mean": 0.0008507754209858831, "clip_ratio/low_mean": 0.0008573358536523301, "clip_ratio/low_min": 0.0008573358536523301, "clip_ratio/region_mean": 0.0017081112710002344, "completions/clipped_ratio": 0.59375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 1382.96875, "completions/mean_terminated_length": 785.1538696289062, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.49305779952555895, "epoch": 0.16008537886872998, "frac_reward_zero_std": 0.625, "grad_norm": 0.01877957209944725, "kl": 0.03695497091393918, "learning_rate": 4.927122285286771e-05, "loss": -0.0589, "num_tokens": 12249188.0, "reward": 1.3194915056228638, "reward_std": 0.22318057715892792, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.5273039937019348, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.6772594451904297, "sampling/importance_sampling_ratio/mean": 0.643267035484314, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9692580699920654, "sampling/sampling_logp_difference/mean": 0.011253305710852146, "step": 150, "step_time": 38.24354869200215 }, { "clip_ratio/high_max": 0.000913825431780424, "clip_ratio/high_mean": 0.000913825431780424, "clip_ratio/low_mean": 0.00028578674027812667, "clip_ratio/low_min": 0.00028578674027812667, "clip_ratio/region_mean": 0.0011996121720585506, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 1261.15625, "completions/mean_terminated_length": 792.7647094726562, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.35973603185266256, "epoch": 0.16115261472785486, "frac_reward_zero_std": 0.6875, "grad_norm": 0.02602352574467659, "kl": 0.05424344539642334, "learning_rate": 4.9261116131292186e-05, "loss": -0.0172, "num_tokens": 12348821.0, "reward": 1.234575867652893, "reward_std": 0.17898638546466827, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.4736383855342865, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.1940290927886963, "sampling/importance_sampling_ratio/mean": 0.7000988721847534, "sampling/importance_sampling_ratio/min": 0.08255302906036377, "sampling/sampling_logp_difference/max": 0.9512200355529785, "sampling/sampling_logp_difference/mean": 0.009589425288140774, "step": 151, "step_time": 37.17394343699743 }, { "clip_ratio/high_max": 0.0021364511994761415, "clip_ratio/high_mean": 0.0021364511994761415, "clip_ratio/low_mean": 0.001119285516324453, "clip_ratio/low_min": 0.001119285516324453, "clip_ratio/region_mean": 0.0032557367158005945, "completions/clipped_ratio": 0.53125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1655.0, "completions/mean_length": 1318.4375, "completions/mean_terminated_length": 781.7333984375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.42834923788905144, "epoch": 0.1622198505869797, "frac_reward_zero_std": 0.4375, "grad_norm": 0.02678682468831539, "kl": 0.04619070573244244, "learning_rate": 4.925094086218559e-05, "loss": -0.0323, "num_tokens": 12435919.0, "reward": 0.9711881875991821, "reward_std": 0.39774754643440247, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.3649381697177887, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.063044786453247, "sampling/importance_sampling_ratio/mean": 0.4690224528312683, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8668680191040039, "sampling/sampling_logp_difference/mean": 0.01106030773371458, "step": 152, "step_time": 30.07618685899797 }, { "clip_ratio/high_max": 0.000752798245230224, "clip_ratio/high_mean": 0.000752798245230224, "clip_ratio/low_mean": 0.000652629867545329, "clip_ratio/low_min": 0.000652629867545329, "clip_ratio/region_mean": 0.001405428112775553, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 1226.46875, "completions/mean_terminated_length": 887.1500244140625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.5663786409422755, "epoch": 0.1632870864461046, "frac_reward_zero_std": 0.6875, "grad_norm": 0.02970126084983349, "kl": 0.04342758376151323, "learning_rate": 4.9240697074297206e-05, "loss": -0.0451, "num_tokens": 12525114.0, "reward": 1.209636926651001, "reward_std": 0.22097085416316986, "rewards/argmax_reward_func/mean": 0.65625, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.3533869981765747, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3652102947235107, "sampling/importance_sampling_ratio/mean": 0.6874557733535767, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9028873443603516, "sampling/sampling_logp_difference/mean": 0.01237706933170557, "step": 153, "step_time": 36.27162277499701 }, { "clip_ratio/high_max": 0.0007834280440874863, "clip_ratio/high_mean": 0.0007834280440874863, "clip_ratio/low_mean": 0.00041602327473810874, "clip_ratio/low_min": 0.00041602327473810874, "clip_ratio/region_mean": 0.001199451318825595, "completions/clipped_ratio": 0.59375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 1330.59375, "completions/mean_terminated_length": 656.2307739257812, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.36492390278726816, "epoch": 0.16435432230522945, "frac_reward_zero_std": 0.625, "grad_norm": 0.022206375375390053, "kl": 0.04165895376354456, "learning_rate": 4.923038479656991e-05, "loss": 0.0678, "num_tokens": 12618443.0, "reward": 0.19765624403953552, "reward_std": 0.1800912618637085, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": -0.5, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.009753772057592869, "sampling/importance_sampling_ratio/max": 1.6720852851867676, "sampling/importance_sampling_ratio/mean": 0.4292204976081848, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6383188962936401, "sampling/sampling_logp_difference/mean": 0.008939147926867008, "step": 154, "step_time": 38.04142408500047 }, { "clip_ratio/high_max": 0.0017299931278103031, "clip_ratio/high_mean": 0.0017299931278103031, "clip_ratio/low_mean": 0.00021897202714171726, "clip_ratio/low_min": 0.00021897202714171726, "clip_ratio/region_mean": 0.001948965162227978, "completions/clipped_ratio": 0.625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 1441.875, "completions/mean_terminated_length": 858.3333740234375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3664526203647256, "epoch": 0.16542155816435433, "frac_reward_zero_std": 0.6875, "grad_norm": 0.012804657220840454, "kl": 0.040795577224344015, "learning_rate": 4.922000405814008e-05, "loss": -0.0045, "num_tokens": 12717527.0, "reward": 1.2073709964752197, "reward_std": 0.22097086906433105, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.4136210083961487, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.547576427459717, "sampling/importance_sampling_ratio/mean": 0.6002135276794434, "sampling/importance_sampling_ratio/min": 0.005531584843993187, "sampling/sampling_logp_difference/max": 1.4229247570037842, "sampling/sampling_logp_difference/mean": 0.009666403755545616, "step": 155, "step_time": 37.86146181499862 }, { "clip_ratio/high_max": 0.001287826984480489, "clip_ratio/high_mean": 0.001287826984480489, "clip_ratio/low_mean": 0.0004597727074724389, "clip_ratio/low_min": 0.0004597727074724389, "clip_ratio/region_mean": 0.0017475996846769704, "completions/clipped_ratio": 0.40625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 1238.53125, "completions/mean_terminated_length": 859.8421020507812, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.38742409087717533, "epoch": 0.1664887940234792, "frac_reward_zero_std": 0.4375, "grad_norm": 0.03325730934739113, "kl": 0.060451810248196125, "learning_rate": 4.920955488833753e-05, "loss": -0.0062, "num_tokens": 12809642.0, "reward": 1.3965051174163818, "reward_std": 0.39774757623672485, "rewards/argmax_reward_func/mean": 0.65625, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.5402551293373108, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4836106300354004, "sampling/importance_sampling_ratio/mean": 0.5036962032318115, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.231393814086914, "sampling/sampling_logp_difference/mean": 0.010699176229536533, "step": 156, "step_time": 37.029759547998765 }, { "clip_ratio/high_max": 0.0008201083292078692, "clip_ratio/high_mean": 0.0008201083292078692, "clip_ratio/low_mean": 0.000334059695887845, "clip_ratio/low_min": 0.000334059695887845, "clip_ratio/region_mean": 0.0011541680250957143, "completions/clipped_ratio": 0.6875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 1436.8125, "completions/mean_terminated_length": 655.4000244140625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.4630144378170371, "epoch": 0.16755602988260407, "frac_reward_zero_std": 0.625, "grad_norm": 0.010510778985917568, "kl": 0.04108553775586188, "learning_rate": 4.9199037316685414e-05, "loss": 0.0111, "num_tokens": 12912702.0, "reward": 1.259836196899414, "reward_std": 0.2651650309562683, "rewards/argmax_reward_func/mean": 0.6875, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.372336208820343, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5298041105270386, "sampling/importance_sampling_ratio/mean": 0.336368203163147, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7491698265075684, "sampling/sampling_logp_difference/mean": 0.010931240394711494, "step": 157, "step_time": 42.06135727899891 }, { "clip_ratio/high_max": 0.0002969545421365183, "clip_ratio/high_mean": 0.0002969545421365183, "clip_ratio/low_mean": 0.00010535187175264582, "clip_ratio/low_min": 0.00010535187175264582, "clip_ratio/region_mean": 0.0004023064138891641, "completions/clipped_ratio": 0.40625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 1117.03125, "completions/mean_terminated_length": 655.2105102539062, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.36838842183351517, "epoch": 0.16862326574172892, "frac_reward_zero_std": 0.8125, "grad_norm": 0.020622767508029938, "kl": 0.05509336479008198, "learning_rate": 4.9188451372900154e-05, "loss": 0.0306, "num_tokens": 13011287.0, "reward": 1.389756679534912, "reward_std": 0.13258251547813416, "rewards/argmax_reward_func/mean": 0.78125, "rewards/argmax_reward_func/std": 0.420013427734375, "rewards/criterion_gradient_reward_func/mean": 0.40850669145584106, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5225892066955566, "sampling/importance_sampling_ratio/mean": 0.6869158744812012, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0754833221435547, "sampling/sampling_logp_difference/mean": 0.010018827393651009, "step": 158, "step_time": 36.208851132001655 }, { "clip_ratio/high_max": 0.0010799089941428974, "clip_ratio/high_mean": 0.0010799089941428974, "clip_ratio/low_mean": 0.0006319284202618292, "clip_ratio/low_min": 0.0006319284202618292, "clip_ratio/region_mean": 0.001711837425318663, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 1273.15625, "completions/mean_terminated_length": 815.3529663085938, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4118488421663642, "epoch": 0.16969050160085378, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03552957996726036, "kl": 0.049312147486489266, "learning_rate": 4.917779708689137e-05, "loss": -0.0687, "num_tokens": 13101548.0, "reward": 1.3195083141326904, "reward_std": 0.3115689158439636, "rewards/argmax_reward_func/mean": 0.65625, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.4648208022117615, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.575622797012329, "sampling/importance_sampling_ratio/mean": 0.8208548426628113, "sampling/importance_sampling_ratio/min": 0.050096821039915085, "sampling/sampling_logp_difference/max": 0.9466409683227539, "sampling/sampling_logp_difference/mean": 0.009599758312106133, "step": 159, "step_time": 34.71526961699965 }, { "clip_ratio/high_max": 0.000741234307497507, "clip_ratio/high_mean": 0.000741234307497507, "clip_ratio/low_mean": 0.0005193522829358699, "clip_ratio/low_min": 0.0005193522829358699, "clip_ratio/region_mean": 0.0012605865904333768, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 1261.65625, "completions/mean_terminated_length": 793.7058715820312, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3697207015939057, "epoch": 0.17075773745997866, "frac_reward_zero_std": 0.625, "grad_norm": 0.02760879509150982, "kl": 0.03732721181586385, "learning_rate": 4.9167074488761735e-05, "loss": -0.057, "num_tokens": 13189605.0, "reward": 1.1630346775054932, "reward_std": 0.26958441734313965, "rewards/argmax_reward_func/mean": 0.625, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.3411596715450287, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19687500596046448, "rewards/format_reward_func/std": 0.012296733446419239, "sampling/importance_sampling_ratio/max": 2.504138708114624, "sampling/importance_sampling_ratio/mean": 0.5870160460472107, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0593810081481934, "sampling/sampling_logp_difference/mean": 0.009612875990569592, "step": 160, "step_time": 31.70064257499689 }, { "clip_ratio/high_max": 0.00043186522816540673, "clip_ratio/high_mean": 0.00043186522816540673, "clip_ratio/low_mean": 0.0001885295732790837, "clip_ratio/low_min": 0.0001885295732790837, "clip_ratio/region_mean": 0.0006203948014444904, "completions/clipped_ratio": 0.5, "completions/max_length": 1792.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 1294.875, "completions/mean_terminated_length": 797.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.30690338741987944, "epoch": 0.1718249733191035, "frac_reward_zero_std": 0.8125, "grad_norm": 0.01547767873853445, "kl": 0.041334501816891134, "learning_rate": 4.9156283608806966e-05, "loss": 0.0774, "num_tokens": 13291623.0, "reward": 1.5022562742233276, "reward_std": 0.09059805423021317, "rewards/argmax_reward_func/mean": 0.8125, "rewards/argmax_reward_func/std": 0.3965577781200409, "rewards/criterion_gradient_reward_func/mean": 0.4913187623023987, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.738396167755127, "sampling/importance_sampling_ratio/mean": 0.7333502173423767, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7928717136383057, "sampling/sampling_logp_difference/mean": 0.008399875834584236, "step": 161, "step_time": 38.44203021099747 }, { "clip_ratio/high_max": 0.0033087405427068006, "clip_ratio/high_mean": 0.0033087405427068006, "clip_ratio/low_mean": 0.0010463491780683398, "clip_ratio/low_min": 0.0010463491780683398, "clip_ratio/region_mean": 0.004355089688033331, "completions/clipped_ratio": 0.40625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 1072.1875, "completions/mean_terminated_length": 579.6842041015625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.4156420882791281, "epoch": 0.1728922091782284, "frac_reward_zero_std": 0.25, "grad_norm": 0.031199974939227104, "kl": 0.07990888156928122, "learning_rate": 4.914542447751569e-05, "loss": -0.0571, "num_tokens": 13383715.0, "reward": 1.177269697189331, "reward_std": 0.48945045471191406, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.4499259293079376, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19609375298023224, "rewards/format_reward_func/std": 0.01435350812971592, "sampling/importance_sampling_ratio/max": 1.8092663288116455, "sampling/importance_sampling_ratio/mean": 0.5533678531646729, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8484561443328857, "sampling/sampling_logp_difference/mean": 0.011400667950510979, "step": 162, "step_time": 35.42834360700181 }, { "clip_ratio/high_max": 0.0008725524348847102, "clip_ratio/high_mean": 0.0008725524348847102, "clip_ratio/low_mean": 0.0006518150185002014, "clip_ratio/low_min": 0.0006518150185002014, "clip_ratio/region_mean": 0.0015243674679368269, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 1267.625, "completions/mean_terminated_length": 804.941162109375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.45732096675783396, "epoch": 0.17395944503735325, "frac_reward_zero_std": 0.5, "grad_norm": 0.028626272454857826, "kl": 0.05681217892561108, "learning_rate": 4.9134497125569375e-05, "loss": -0.0423, "num_tokens": 13477175.0, "reward": 1.3030600547790527, "reward_std": 0.2728990316390991, "rewards/argmax_reward_func/mean": 0.625, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.48352891206741333, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19453124701976776, "rewards/format_reward_func/std": 0.02175224758684635, "sampling/importance_sampling_ratio/max": 1.8148844242095947, "sampling/importance_sampling_ratio/mean": 0.5706734657287598, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6777968406677246, "sampling/sampling_logp_difference/mean": 0.011177308857440948, "step": 163, "step_time": 40.45508070399774 }, { "clip_ratio/high_max": 0.0010651793163560797, "clip_ratio/high_mean": 0.0010651793163560797, "clip_ratio/low_mean": 0.0006130660640337737, "clip_ratio/low_min": 0.0006130660640337737, "clip_ratio/region_mean": 0.0016782453658379382, "completions/clipped_ratio": 0.5625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 1318.4375, "completions/mean_terminated_length": 709.5714721679688, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.44309902004897594, "epoch": 0.17502668089647813, "frac_reward_zero_std": 0.5, "grad_norm": 0.03291741758584976, "kl": 0.04494770092424005, "learning_rate": 4.9123501583842234e-05, "loss": 0.0969, "num_tokens": 13578469.0, "reward": 1.288834571838379, "reward_std": 0.3535533845424652, "rewards/argmax_reward_func/mean": 0.625, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.4638344943523407, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9307258129119873, "sampling/importance_sampling_ratio/mean": 0.7394727468490601, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6377806663513184, "sampling/sampling_logp_difference/mean": 0.011100257746875286, "step": 164, "step_time": 38.229370347999065 }, { "clip_ratio/high_max": 0.0011267156223766506, "clip_ratio/high_mean": 0.0011267156223766506, "clip_ratio/low_mean": 0.001000707185085048, "clip_ratio/low_min": 0.001000707185085048, "clip_ratio/region_mean": 0.0021274228038237197, "completions/clipped_ratio": 0.5, "completions/max_length": 1792.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 1267.71875, "completions/mean_terminated_length": 743.4375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.4167851433157921, "epoch": 0.17609391675560299, "frac_reward_zero_std": 0.4375, "grad_norm": 0.03298526257276535, "kl": 0.04897854430601001, "learning_rate": 4.9112437883401156e-05, "loss": -0.0117, "num_tokens": 13681940.0, "reward": 1.1437773704528809, "reward_std": 0.3535533845424652, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.38440242409706116, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19687500596046448, "rewards/format_reward_func/std": 0.012296733446419239, "sampling/importance_sampling_ratio/max": 2.5646183490753174, "sampling/importance_sampling_ratio/mean": 0.8025414347648621, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.0020155906677246, "sampling/sampling_logp_difference/mean": 0.011908257380127907, "step": 165, "step_time": 35.31206962499891 }, { "clip_ratio/high_max": 0.0010326060255465563, "clip_ratio/high_mean": 0.0010326060255465563, "clip_ratio/low_mean": 0.0007045038473734166, "clip_ratio/low_min": 0.0007045038473734166, "clip_ratio/region_mean": 0.0017371098747389624, "completions/clipped_ratio": 0.4375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 1277.5625, "completions/mean_terminated_length": 877.4444580078125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.27194711845368147, "epoch": 0.17716115261472787, "frac_reward_zero_std": 0.5, "grad_norm": 0.030854983255267143, "kl": 0.045381892239674926, "learning_rate": 4.91013060555056e-05, "loss": -0.056, "num_tokens": 13774000.0, "reward": 1.0295453071594238, "reward_std": 0.3535533845424652, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.3295453190803528, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2145402431488037, "sampling/importance_sampling_ratio/mean": 0.734785258769989, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8491315841674805, "sampling/sampling_logp_difference/mean": 0.00793092418462038, "step": 166, "step_time": 45.58902939099971 }, { "clip_ratio/high_max": 0.0020238167126080953, "clip_ratio/high_mean": 0.0020238167126080953, "clip_ratio/low_mean": 0.0015796079496794846, "clip_ratio/low_min": 0.0015796079496794846, "clip_ratio/region_mean": 0.003603424644097686, "completions/clipped_ratio": 0.5, "completions/max_length": 1792.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 1210.875, "completions/mean_terminated_length": 629.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.3087663631886244, "epoch": 0.17822838847385272, "frac_reward_zero_std": 0.375, "grad_norm": 0.025363989174365997, "kl": 0.07386617956217378, "learning_rate": 4.90901061316075e-05, "loss": -0.1689, "num_tokens": 13884282.0, "reward": 1.1301212310791016, "reward_std": 0.4419417083263397, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.4926213026046753, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3359084129333496, "sampling/importance_sampling_ratio/mean": 0.5853468179702759, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7493441104888916, "sampling/sampling_logp_difference/mean": 0.009683697484433651, "step": 167, "step_time": 48.23502202200143 }, { "clip_ratio/high_max": 0.0006347759990603663, "clip_ratio/high_mean": 0.0006347759990603663, "clip_ratio/low_mean": 0.0008136461146932561, "clip_ratio/low_min": 0.0008136461146932561, "clip_ratio/region_mean": 0.0014484221137536224, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 1175.5625, "completions/mean_terminated_length": 631.6470336914062, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3629835289902985, "epoch": 0.17929562433297758, "frac_reward_zero_std": 0.4375, "grad_norm": 0.03960619866847992, "kl": 0.05155582586303353, "learning_rate": 4.9078838143351225e-05, "loss": -0.1121, "num_tokens": 13971270.0, "reward": 1.0819389820098877, "reward_std": 0.3557630777359009, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.32256385684013367, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19687500596046448, "rewards/format_reward_func/std": 0.010530293919146061, "sampling/importance_sampling_ratio/max": 2.8050055503845215, "sampling/importance_sampling_ratio/mean": 0.765491783618927, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6606196165084839, "sampling/sampling_logp_difference/mean": 0.009839700534939766, "step": 168, "step_time": 30.687987770998006 }, { "clip_ratio/high_max": 0.000900883344002068, "clip_ratio/high_mean": 0.000900883344002068, "clip_ratio/low_mean": 0.00034551074168120977, "clip_ratio/low_min": 0.00034551074168120977, "clip_ratio/region_mean": 0.0012463940656743944, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 1217.59375, "completions/mean_terminated_length": 710.7647094726562, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.4375229673460126, "epoch": 0.18036286019210246, "frac_reward_zero_std": 0.625, "grad_norm": 0.03297114744782448, "kl": 0.05550515162758529, "learning_rate": 4.906750212257341e-05, "loss": 0.0188, "num_tokens": 14064793.0, "reward": 1.2329511642456055, "reward_std": 0.2651650309562683, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.47045108675956726, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6697325706481934, "sampling/importance_sampling_ratio/mean": 0.6021915674209595, "sampling/importance_sampling_ratio/min": 0.001799049903638661, "sampling/sampling_logp_difference/max": 0.6562404632568359, "sampling/sampling_logp_difference/mean": 0.011284079402685165, "step": 169, "step_time": 36.3779339700086 }, { "clip_ratio/high_max": 0.0017687929721432738, "clip_ratio/high_mean": 0.0017687929721432738, "clip_ratio/low_mean": 0.00031905151445243973, "clip_ratio/low_min": 0.00031905151445243973, "clip_ratio/region_mean": 0.002087844479319756, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1187.375, "completions/mean_terminated_length": 824.6000366210938, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3542481083422899, "epoch": 0.1814300960512273, "frac_reward_zero_std": 0.5625, "grad_norm": 0.026795953512191772, "kl": 0.05217215686570853, "learning_rate": 4.9056098101302925e-05, "loss": 0.0395, "num_tokens": 14157009.0, "reward": 1.398764967918396, "reward_std": 0.30935919284820557, "rewards/argmax_reward_func/mean": 0.65625, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.542514979839325, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.314265489578247, "sampling/importance_sampling_ratio/mean": 0.7896668910980225, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6418085098266602, "sampling/sampling_logp_difference/mean": 0.010049017146229744, "step": 170, "step_time": 34.380948728001385 }, { "clip_ratio/high_max": 0.0008589461594965542, "clip_ratio/high_mean": 0.0008589461594965542, "clip_ratio/low_mean": 0.0004995781509933295, "clip_ratio/low_min": 0.0004995781509933295, "clip_ratio/region_mean": 0.001358524303213926, "completions/clipped_ratio": 0.53125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 1369.4375, "completions/mean_terminated_length": 890.5333862304688, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.32464212318882346, "epoch": 0.1824973319103522, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0228361114859581, "kl": 0.04841511428821832, "learning_rate": 4.9044626111760786e-05, "loss": 0.027, "num_tokens": 14258205.0, "reward": 1.2082233428955078, "reward_std": 0.268479585647583, "rewards/argmax_reward_func/mean": 0.625, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.3855670392513275, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765625894069672, "rewards/format_reward_func/std": 0.009753772988915443, "sampling/importance_sampling_ratio/max": 2.643857717514038, "sampling/importance_sampling_ratio/mean": 0.43174880743026733, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9019136428833008, "sampling/sampling_logp_difference/mean": 0.00972104910761118, "step": 171, "step_time": 34.740334170001006 }, { "clip_ratio/high_max": 0.0003626111283665523, "clip_ratio/high_mean": 0.0003626111283665523, "clip_ratio/low_mean": 0.00040032136348600034, "clip_ratio/low_min": 0.00040032136348600034, "clip_ratio/region_mean": 0.0007629324918525526, "completions/clipped_ratio": 0.46875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 1240.375, "completions/mean_terminated_length": 753.6470336914062, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3659816877916455, "epoch": 0.18356456776947705, "frac_reward_zero_std": 0.75, "grad_norm": 0.008607005700469017, "kl": 0.054276965791359544, "learning_rate": 4.9033086186360033e-05, "loss": -0.0071, "num_tokens": 14361121.0, "reward": 1.2964102029800415, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func/mean": 0.625, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.47141018509864807, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.724207878112793, "sampling/importance_sampling_ratio/mean": 0.5103445053100586, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8645298480987549, "sampling/sampling_logp_difference/mean": 0.010439449921250343, "step": 172, "step_time": 36.50159338400226 }, { "clip_ratio/high_max": 0.0008045742179092485, "clip_ratio/high_mean": 0.0008045742179092485, "clip_ratio/low_mean": 0.0006355319874273846, "clip_ratio/low_min": 0.0006355319874273846, "clip_ratio/region_mean": 0.0014401062053366331, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 1027.125, "completions/mean_terminated_length": 727.8261108398438, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2989286547526717, "epoch": 0.18463180362860193, "frac_reward_zero_std": 0.625, "grad_norm": 0.021518684923648834, "kl": 0.05867633759044111, "learning_rate": 4.902147835770565e-05, "loss": 0.001, "num_tokens": 14452615.0, "reward": 1.3107714653015137, "reward_std": 0.2651650309562683, "rewards/argmax_reward_func/mean": 0.6875, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.42327141761779785, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3837778568267822, "sampling/importance_sampling_ratio/mean": 0.5596511363983154, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7007021903991699, "sampling/sampling_logp_difference/mean": 0.009176788851618767, "step": 173, "step_time": 35.846035705999384 }, { "clip_ratio/high_max": 0.0006044476504030172, "clip_ratio/high_mean": 0.0006044476504030172, "clip_ratio/low_mean": 0.0003441807930357754, "clip_ratio/low_min": 0.0003441807930357754, "clip_ratio/region_mean": 0.0009486284434387926, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 977.78125, "completions/mean_terminated_length": 551.2857055664062, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3648164002224803, "epoch": 0.18569903948772679, "frac_reward_zero_std": 0.5625, "grad_norm": 0.04037713259458542, "kl": 0.054365706513635814, "learning_rate": 4.900980265859448e-05, "loss": -0.0023, "num_tokens": 14531602.0, "reward": 1.1198675632476807, "reward_std": 0.27400386333465576, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.36361756920814514, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19374999403953552, "rewards/format_reward_func/std": 0.0353553406894207, "sampling/importance_sampling_ratio/max": 2.2931747436523438, "sampling/importance_sampling_ratio/mean": 0.7977558970451355, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7232780456542969, "sampling/sampling_logp_difference/mean": 0.009322757832705975, "step": 174, "step_time": 31.47886482600734 }, { "clip_ratio/high_max": 0.0014628515527874697, "clip_ratio/high_mean": 0.0014628515527874697, "clip_ratio/low_mean": 0.0006275343785091536, "clip_ratio/low_min": 0.0006275343785091536, "clip_ratio/region_mean": 0.0020903859422105597, "completions/clipped_ratio": 0.40625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 1054.84375, "completions/mean_terminated_length": 550.4736938476562, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.30665461625903845, "epoch": 0.18676627534685167, "frac_reward_zero_std": 0.375, "grad_norm": 0.042510196566581726, "kl": 0.06080572272185236, "learning_rate": 4.899805912201513e-05, "loss": 0.1191, "num_tokens": 14615347.0, "reward": 1.1793272495269775, "reward_std": 0.4419417381286621, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.41682717204093933, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.469658136367798, "sampling/importance_sampling_ratio/mean": 0.7924492955207825, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2119817733764648, "sampling/sampling_logp_difference/mean": 0.008956140838563442, "step": 175, "step_time": 32.402886578995094 }, { "clip_ratio/high_max": 0.002008463503443636, "clip_ratio/high_mean": 0.002008463503443636, "clip_ratio/low_mean": 0.00044837491077487357, "clip_ratio/low_min": 0.00044837491077487357, "clip_ratio/region_mean": 0.0024568384142185096, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 1127.15625, "completions/mean_terminated_length": 728.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.2269983757287264, "epoch": 0.18783351120597652, "frac_reward_zero_std": 0.5, "grad_norm": 0.025843970477581024, "kl": 0.059472991386428475, "learning_rate": 4.898624778114787e-05, "loss": -0.0288, "num_tokens": 14706670.0, "reward": 1.1807063817977905, "reward_std": 0.35576310753822327, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.4822688102722168, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.9345035552978516, "sampling/importance_sampling_ratio/mean": 0.7786688804626465, "sampling/importance_sampling_ratio/min": 0.04188890755176544, "sampling/sampling_logp_difference/max": 0.6336469650268555, "sampling/sampling_logp_difference/mean": 0.00682302750647068, "step": 176, "step_time": 35.71681290300148 }, { "clip_ratio/high_max": 0.0015067452914081514, "clip_ratio/high_mean": 0.0015067452914081514, "clip_ratio/low_mean": 0.0006824636257078964, "clip_ratio/low_min": 0.0006824636257078964, "clip_ratio/region_mean": 0.0021892089062021114, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 799.40625, "completions/mean_terminated_length": 570.34619140625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.37487364560365677, "epoch": 0.18890074706510138, "frac_reward_zero_std": 0.5625, "grad_norm": 0.02404840476810932, "kl": 0.07522101863287389, "learning_rate": 4.897436866936454e-05, "loss": 0.0334, "num_tokens": 14782157.0, "reward": 1.0716251134872437, "reward_std": 0.30935919284820557, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.4028750956058502, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6819229125976562, "sampling/importance_sampling_ratio/mean": 0.5833064317703247, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7175498008728027, "sampling/sampling_logp_difference/mean": 0.010035542771220207, "step": 177, "step_time": 32.03928722300225 }, { "clip_ratio/high_max": 0.0003083371884713415, "clip_ratio/high_mean": 0.0003083371884713415, "clip_ratio/low_mean": 0.0005946887067693751, "clip_ratio/low_min": 0.0005946887067693751, "clip_ratio/region_mean": 0.0009030258843267802, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 1050.5, "completions/mean_terminated_length": 713.45458984375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.44630222814157605, "epoch": 0.18996798292422626, "frac_reward_zero_std": 0.75, "grad_norm": 0.01703566312789917, "kl": 0.059698743745684624, "learning_rate": 4.896242182022849e-05, "loss": 0.0187, "num_tokens": 14863517.0, "reward": 1.1125457286834717, "reward_std": 0.1767766773700714, "rewards/argmax_reward_func/mean": 0.5, "rewards/argmax_reward_func/std": 0.5080004930496216, "rewards/criterion_gradient_reward_func/mean": 0.412545770406723, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7916653156280518, "sampling/importance_sampling_ratio/mean": 0.730034589767456, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5156669616699219, "sampling/sampling_logp_difference/mean": 0.01059631910175085, "step": 178, "step_time": 35.22311523699682 }, { "clip_ratio/high_max": 0.0011985730889136903, "clip_ratio/high_mean": 0.0011985730889136903, "clip_ratio/low_mean": 0.0011143812953378074, "clip_ratio/low_min": 0.0011143812953378074, "clip_ratio/region_mean": 0.0023129543842514977, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 864.71875, "completions/mean_terminated_length": 501.86956787109375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.392739191185683, "epoch": 0.1910352187833511, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03343877196311951, "kl": 0.0742323137819767, "learning_rate": 4.8950407267494405e-05, "loss": -0.0349, "num_tokens": 14940924.0, "reward": 1.061286211013794, "reward_std": 0.3082543611526489, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.26831740140914917, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19921875, "rewards/format_reward_func/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 2.966917037963867, "sampling/importance_sampling_ratio/mean": 0.9990112781524658, "sampling/importance_sampling_ratio/min": 0.026830922812223434, "sampling/sampling_logp_difference/max": 0.7371463775634766, "sampling/sampling_logp_difference/mean": 0.011188439093530178, "step": 179, "step_time": 34.00251564300197 }, { "clip_ratio/high_max": 0.0008279019384644926, "clip_ratio/high_mean": 0.0008279019384644926, "clip_ratio/low_mean": 0.0002385004481766373, "clip_ratio/low_min": 0.0002385004481766373, "clip_ratio/region_mean": 0.0010664023866411299, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 1027.5625, "completions/mean_terminated_length": 728.434814453125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.5657435394823551, "epoch": 0.192102454642476, "frac_reward_zero_std": 0.75, "grad_norm": 0.022676639258861542, "kl": 0.05571307067293674, "learning_rate": 4.89383250451083e-05, "loss": -0.0862, "num_tokens": 15018574.0, "reward": 1.0958267450332642, "reward_std": 0.1800912469625473, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.4606705904006958, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.013258252292871475, "sampling/importance_sampling_ratio/max": 1.8920981884002686, "sampling/importance_sampling_ratio/mean": 0.5371487140655518, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7807998657226562, "sampling/sampling_logp_difference/mean": 0.013049173168838024, "step": 180, "step_time": 36.378767773998334 }, { "clip_ratio/high_max": 0.0009579225479683373, "clip_ratio/high_mean": 0.0009579225479683373, "clip_ratio/low_mean": 0.0003260530920670135, "clip_ratio/low_min": 0.0003260530920670135, "clip_ratio/region_mean": 0.0012839756400353508, "completions/clipped_ratio": 0.21875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 809.625, "completions/mean_terminated_length": 534.5599975585938, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.355758517049253, "epoch": 0.19316969050160085, "frac_reward_zero_std": 0.625, "grad_norm": 0.03898197039961815, "kl": 0.063386169844307, "learning_rate": 4.892617518720737e-05, "loss": 0.0974, "num_tokens": 15096956.0, "reward": 1.1404106616973877, "reward_std": 0.2651650309562683, "rewards/argmax_reward_func/mean": 0.6875, "rewards/argmax_reward_func/std": 0.4709290862083435, "rewards/criterion_gradient_reward_func/mean": 0.2529106140136719, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.880505323410034, "sampling/importance_sampling_ratio/mean": 0.7210962176322937, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1117972135543823, "sampling/sampling_logp_difference/mean": 0.009598508477210999, "step": 181, "step_time": 34.069520264003586 }, { "clip_ratio/high_max": 0.0006681311715510674, "clip_ratio/high_mean": 0.0006681311715510674, "clip_ratio/low_mean": 0.0002413371294096578, "clip_ratio/low_min": 0.0002413371294096578, "clip_ratio/region_mean": 0.0009094683009607252, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 899.78125, "completions/mean_terminated_length": 734.5555419921875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3068634932860732, "epoch": 0.19423692636072573, "frac_reward_zero_std": 0.6875, "grad_norm": 0.02067856304347515, "kl": 0.06992260564584285, "learning_rate": 4.891395772811992e-05, "loss": -0.0455, "num_tokens": 15188177.0, "reward": 1.380434274673462, "reward_std": 0.22097085416316986, "rewards/argmax_reward_func/mean": 0.59375, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.5866843461990356, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8972697257995605, "sampling/importance_sampling_ratio/mean": 0.61464524269104, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9594991207122803, "sampling/sampling_logp_difference/mean": 0.009234393015503883, "step": 182, "step_time": 33.953558512997915 }, { "clip_ratio/high_max": 0.0033329958278045524, "clip_ratio/high_mean": 0.0033329958278045524, "clip_ratio/low_mean": 0.00046368176845135167, "clip_ratio/low_min": 0.00046368176845135167, "clip_ratio/region_mean": 0.003796677559876116, "completions/clipped_ratio": 0.21875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 841.3125, "completions/mean_terminated_length": 575.1199951171875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4762461297214031, "epoch": 0.19530416221985059, "frac_reward_zero_std": 0.375, "grad_norm": 0.027415238320827484, "kl": 0.0984925739467144, "learning_rate": 4.890167270236523e-05, "loss": -0.0812, "num_tokens": 15264635.0, "reward": 1.2905315160751343, "reward_std": 0.44415146112442017, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.5295939445495605, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 1.446051836013794, "sampling/importance_sampling_ratio/mean": 0.5019406080245972, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9023923873901367, "sampling/sampling_logp_difference/mean": 0.011371584609150887, "step": 183, "step_time": 29.29527110300114 }, { "clip_ratio/high_max": 0.0012198529366287403, "clip_ratio/high_mean": 0.0012198529366287403, "clip_ratio/low_mean": 0.00040117175376508385, "clip_ratio/low_min": 0.00040117175376508385, "clip_ratio/region_mean": 0.0016210246903938241, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 982.09375, "completions/mean_terminated_length": 613.95458984375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.5050224410369992, "epoch": 0.19637139807897544, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03566712141036987, "kl": 0.06959813740104437, "learning_rate": 4.888932014465352e-05, "loss": 0.0658, "num_tokens": 15351864.0, "reward": 0.7389964461326599, "reward_std": 0.30493977665901184, "rewards/argmax_reward_func/mean": 0.34375, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.1983715295791626, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19687500596046448, "rewards/format_reward_func/std": 0.0176776684820652, "sampling/importance_sampling_ratio/max": 1.8644251823425293, "sampling/importance_sampling_ratio/mean": 0.6549676060676575, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8567876815795898, "sampling/sampling_logp_difference/mean": 0.012304308824241161, "step": 184, "step_time": 35.622206566004024 }, { "clip_ratio/high_max": 0.001133847123128362, "clip_ratio/high_mean": 0.001133847123128362, "clip_ratio/low_mean": 0.0002683174316189252, "clip_ratio/low_min": 0.0002683174316189252, "clip_ratio/region_mean": 0.001402164540195372, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 641.28125, "completions/mean_terminated_length": 476.89288330078125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.44533111341297626, "epoch": 0.19743863393810032, "frac_reward_zero_std": 0.5625, "grad_norm": 0.04660416767001152, "kl": 0.07955898437649012, "learning_rate": 4.887690008988578e-05, "loss": 0.0411, "num_tokens": 15421391.0, "reward": 1.1220818758010864, "reward_std": 0.3071494698524475, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.45489439368247986, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.4521751403808594, "sampling/importance_sampling_ratio/mean": 0.948140025138855, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.719355583190918, "sampling/sampling_logp_difference/mean": 0.01204899325966835, "step": 185, "step_time": 30.23948762100008 }, { "clip_ratio/high_max": 0.0008530319828423671, "clip_ratio/high_mean": 0.0008530319828423671, "clip_ratio/low_mean": 0.0010821321138791973, "clip_ratio/low_min": 0.0010821321138791973, "clip_ratio/region_mean": 0.0019351641021785326, "completions/clipped_ratio": 0.25, "completions/max_length": 1792.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 856.0, "completions/mean_terminated_length": 544.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2797395451925695, "epoch": 0.19850586979722518, "frac_reward_zero_std": 0.5625, "grad_norm": 0.026976950466632843, "kl": 0.06662332289852202, "learning_rate": 4.8864412573153726e-05, "loss": -0.0251, "num_tokens": 15504785.0, "reward": 1.3393031358718872, "reward_std": 0.30935919284820557, "rewards/argmax_reward_func/mean": 0.71875, "rewards/argmax_reward_func/std": 0.45680341124534607, "rewards/criterion_gradient_reward_func/mean": 0.42055314779281616, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6958746910095215, "sampling/importance_sampling_ratio/mean": 0.8409218788146973, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1064176559448242, "sampling/sampling_logp_difference/mean": 0.007877079769968987, "step": 186, "step_time": 38.466283652995116 }, { "clip_ratio/high_max": 0.0007029271291685291, "clip_ratio/high_mean": 0.0007029271291685291, "clip_ratio/low_mean": 0.0002124436286976561, "clip_ratio/low_min": 0.0002124436286976561, "clip_ratio/region_mean": 0.0009153707578661852, "completions/clipped_ratio": 0.15625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1361.0, "completions/mean_length": 652.5, "completions/mean_terminated_length": 441.4814758300781, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.5050439513288438, "epoch": 0.19957310565635006, "frac_reward_zero_std": 0.75, "grad_norm": 0.02560998499393463, "kl": 0.08732620347291231, "learning_rate": 4.885185762973967e-05, "loss": 0.0044, "num_tokens": 15574681.0, "reward": 1.1592646837234497, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.5217646956443787, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.076676607131958, "sampling/importance_sampling_ratio/mean": 0.8040215373039246, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8213949203491211, "sampling/sampling_logp_difference/mean": 0.013475039973855019, "step": 187, "step_time": 29.421941866998168 }, { "clip_ratio/high_max": 0.0019261797788203694, "clip_ratio/high_mean": 0.0019261797788203694, "clip_ratio/low_mean": 0.000643321833194932, "clip_ratio/low_min": 0.000643321833194932, "clip_ratio/region_mean": 0.0025695016083773226, "completions/clipped_ratio": 0.1875, "completions/max_length": 1792.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 802.46875, "completions/mean_terminated_length": 574.1154174804688, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.46088301856070757, "epoch": 0.2006403415154749, "frac_reward_zero_std": 0.5625, "grad_norm": 0.032154880464076996, "kl": 0.06854391633532941, "learning_rate": 4.883923529511646e-05, "loss": 0.0228, "num_tokens": 15658170.0, "reward": 1.1611487865447998, "reward_std": 0.26626989245414734, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.524429976940155, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19921875, "rewards/format_reward_func/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 2.8473198413848877, "sampling/importance_sampling_ratio/mean": 0.6584773063659668, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8383798599243164, "sampling/sampling_logp_difference/mean": 0.011808345094323158, "step": 188, "step_time": 33.89593217900256 }, { "clip_ratio/high_max": 0.0005993407903588377, "clip_ratio/high_mean": 0.0005993407903588377, "clip_ratio/low_mean": 0.000948339193200809, "clip_ratio/low_min": 0.000948339193200809, "clip_ratio/region_mean": 0.0015476799781026784, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 1025.875, "completions/mean_terminated_length": 624.5714111328125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4052355093881488, "epoch": 0.2017075773745998, "frac_reward_zero_std": 0.5625, "grad_norm": 0.04442531242966652, "kl": 0.06884683342650533, "learning_rate": 4.8826545604947306e-05, "loss": -0.0686, "num_tokens": 15739900.0, "reward": 1.0582778453826904, "reward_std": 0.30935919284820557, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.32702791690826416, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6832306385040283, "sampling/importance_sampling_ratio/mean": 0.7865561246871948, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6801505088806152, "sampling/sampling_logp_difference/mean": 0.010447611100971699, "step": 189, "step_time": 35.72450646800098 }, { "clip_ratio/high_max": 0.0005454872371046804, "clip_ratio/high_mean": 0.0005454872371046804, "clip_ratio/low_mean": 0.00025799842114793137, "clip_ratio/low_min": 0.00025799842114793137, "clip_ratio/region_mean": 0.000803485654614633, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 965.09375, "completions/mean_terminated_length": 589.227294921875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4299275577068329, "epoch": 0.20277481323372465, "frac_reward_zero_std": 0.6875, "grad_norm": 0.030092692002654076, "kl": 0.09941192297264934, "learning_rate": 4.8813788595085766e-05, "loss": 0.0047, "num_tokens": 15826315.0, "reward": 1.1159111261367798, "reward_std": 0.1800912618637085, "rewards/argmax_reward_func/mean": 0.4375, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.48075491189956665, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.013258252292871475, "sampling/importance_sampling_ratio/max": 2.1555590629577637, "sampling/importance_sampling_ratio/mean": 0.612689733505249, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5298700332641602, "sampling/sampling_logp_difference/mean": 0.010036558844149113, "step": 190, "step_time": 33.23896736399547 }, { "clip_ratio/high_max": 0.0009756650688359514, "clip_ratio/high_mean": 0.0009756650688359514, "clip_ratio/low_mean": 0.00044401062950782944, "clip_ratio/low_min": 0.00044401062950782944, "clip_ratio/region_mean": 0.0014196756983437808, "completions/clipped_ratio": 0.34375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 881.3125, "completions/mean_terminated_length": 404.2857360839844, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4176445873454213, "epoch": 0.20384204909284953, "frac_reward_zero_std": 0.625, "grad_norm": 0.05207477882504463, "kl": 0.07938546198420227, "learning_rate": 4.880096430157557e-05, "loss": 0.223, "num_tokens": 15914199.0, "reward": 1.176633358001709, "reward_std": 0.2220757156610489, "rewards/argmax_reward_func/mean": 0.65625, "rewards/argmax_reward_func/std": 0.4825586974620819, "rewards/criterion_gradient_reward_func/mean": 0.32116469740867615, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19921875, "rewards/format_reward_func/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 2.6831746101379395, "sampling/importance_sampling_ratio/mean": 0.5863227844238281, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6832051277160645, "sampling/sampling_logp_difference/mean": 0.010655419901013374, "step": 191, "step_time": 39.15446536100171 }, { "clip_ratio/high_max": 0.0015242292247421574, "clip_ratio/high_mean": 0.0015242292247421574, "clip_ratio/low_mean": 0.00037007435457780957, "clip_ratio/low_min": 0.00037007435457780957, "clip_ratio/region_mean": 0.0018943035720440093, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 657.84375, "completions/mean_terminated_length": 495.8214416503906, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.5549521055072546, "epoch": 0.20490928495197439, "frac_reward_zero_std": 0.4375, "grad_norm": 0.03024817444384098, "kl": 0.09470632369630039, "learning_rate": 4.878807276065059e-05, "loss": -0.0003, "num_tokens": 15981602.0, "reward": 1.0923466682434082, "reward_std": 0.3977475166320801, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.48609673976898193, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2490732669830322, "sampling/importance_sampling_ratio/mean": 0.5820620059967041, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6343803405761719, "sampling/sampling_logp_difference/mean": 0.01400070171803236, "step": 192, "step_time": 30.665978998995342 }, { "clip_ratio/high_max": 0.0006778007718821755, "clip_ratio/high_mean": 0.0006778007718821755, "clip_ratio/low_mean": 0.00019362829334568232, "clip_ratio/low_min": 0.00019362829334568232, "clip_ratio/region_mean": 0.0008714290652278578, "completions/clipped_ratio": 0.25, "completions/max_length": 1792.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 878.125, "completions/mean_terminated_length": 573.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.35331084951758385, "epoch": 0.20597652081109924, "frac_reward_zero_std": 0.625, "grad_norm": 0.03205087408423424, "kl": 0.07683330692816526, "learning_rate": 4.8775114008734656e-05, "loss": 0.0069, "num_tokens": 16073132.0, "reward": 1.21002197265625, "reward_std": 0.2651650309562683, "rewards/argmax_reward_func/mean": 0.625, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.38502201437950134, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4529802799224854, "sampling/importance_sampling_ratio/mean": 0.6308029890060425, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5470104217529297, "sampling/sampling_logp_difference/mean": 0.008459147997200489, "step": 193, "step_time": 35.5070745360008 }, { "clip_ratio/high_max": 0.0008514477358403383, "clip_ratio/high_mean": 0.0008514477358403383, "clip_ratio/low_mean": 0.0007423070055665448, "clip_ratio/low_min": 0.0007423070055665448, "clip_ratio/region_mean": 0.0015937547414068831, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 938.5625, "completions/mean_terminated_length": 604.6087036132812, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.34781370544806123, "epoch": 0.20704375667022412, "frac_reward_zero_std": 0.5625, "grad_norm": 0.027193564921617508, "kl": 0.09393758396618068, "learning_rate": 4.876208808244154e-05, "loss": -0.0526, "num_tokens": 16149910.0, "reward": 1.0905637741088867, "reward_std": 0.26626989245414734, "rewards/argmax_reward_func/mean": 0.375, "rewards/argmax_reward_func/std": 0.49186936020851135, "rewards/criterion_gradient_reward_func/mean": 0.5163450837135315, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19921875, "rewards/format_reward_func/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 1.7377398014068604, "sampling/importance_sampling_ratio/mean": 0.6900028586387634, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6546311378479004, "sampling/sampling_logp_difference/mean": 0.007427344564348459, "step": 194, "step_time": 29.198929090000092 }, { "clip_ratio/high_max": 0.001548028321849415, "clip_ratio/high_mean": 0.001548028321849415, "clip_ratio/low_mean": 0.0005983702540106606, "clip_ratio/low_min": 0.0005983702540106606, "clip_ratio/region_mean": 0.0021463985758600757, "completions/clipped_ratio": 0.375, "completions/max_length": 1792.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 1042.75, "completions/mean_terminated_length": 593.2000122070312, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4554205648601055, "epoch": 0.20811099252934898, "frac_reward_zero_std": 0.4375, "grad_norm": 0.024736542254686356, "kl": 0.05607226269785315, "learning_rate": 4.874899501857477e-05, "loss": 0.0299, "num_tokens": 16239150.0, "reward": 1.1955065727233887, "reward_std": 0.39774757623672485, "rewards/argmax_reward_func/mean": 0.53125, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.46425652503967285, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.961890459060669, "sampling/importance_sampling_ratio/mean": 0.46906429529190063, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8028736114501953, "sampling/sampling_logp_difference/mean": 0.011387611739337444, "step": 195, "step_time": 41.45920760199624 }, { "clip_ratio/high_max": 0.0005850023553648498, "clip_ratio/high_mean": 0.0005850023553648498, "clip_ratio/low_mean": 0.00017400289289071225, "clip_ratio/low_min": 0.00017400289289071225, "clip_ratio/region_mean": 0.000759005248255562, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 963.0625, "completions/mean_terminated_length": 638.6956787109375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.41500036511570215, "epoch": 0.20917822838847386, "frac_reward_zero_std": 0.75, "grad_norm": 0.012187820859253407, "kl": 0.053419790929183364, "learning_rate": 4.8735834854127595e-05, "loss": -0.0651, "num_tokens": 16321740.0, "reward": 1.273116111755371, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func/mean": 0.75, "rewards/argmax_reward_func/std": 0.4399413466453552, "rewards/criterion_gradient_reward_func/mean": 0.3231160640716553, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8016568422317505, "sampling/importance_sampling_ratio/mean": 0.46950215101242065, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2462286949157715, "sampling/sampling_logp_difference/mean": 0.01061326079070568, "step": 196, "step_time": 33.14636822599459 }, { "clip_ratio/high_max": 0.0009648510422266554, "clip_ratio/high_mean": 0.0009648510422266554, "clip_ratio/low_mean": 0.000647639375529252, "clip_ratio/low_min": 0.000647639375529252, "clip_ratio/region_mean": 0.0016124904395837802, "completions/clipped_ratio": 0.28125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 953.78125, "completions/mean_terminated_length": 625.7825927734375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.31897920835763216, "epoch": 0.2102454642475987, "frac_reward_zero_std": 0.5625, "grad_norm": 0.029799019917845726, "kl": 0.0579157373867929, "learning_rate": 4.872260762628283e-05, "loss": -0.0755, "num_tokens": 16405695.0, "reward": 0.8613806962966919, "reward_std": 0.2684796154499054, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.1012243926525116, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19765624403953552, "rewards/format_reward_func/std": 0.013258252292871475, "sampling/importance_sampling_ratio/max": 1.9306573867797852, "sampling/importance_sampling_ratio/mean": 0.6673228144645691, "sampling/importance_sampling_ratio/min": 0.08400370925664902, "sampling/sampling_logp_difference/max": 1.0099213123321533, "sampling/sampling_logp_difference/mean": 0.008726523257791996, "step": 197, "step_time": 34.14001962099792 }, { "clip_ratio/high_max": 0.0014597564913856331, "clip_ratio/high_mean": 0.0014597564913856331, "clip_ratio/low_mean": 0.0012561242474475875, "clip_ratio/low_min": 0.0012561242474475875, "clip_ratio/region_mean": 0.0027158807388332207, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 896.125, "completions/mean_terminated_length": 488.90911865234375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4833885212428868, "epoch": 0.2113127001067236, "frac_reward_zero_std": 0.5, "grad_norm": 0.03493157774209976, "kl": 0.06782837677747011, "learning_rate": 4.870931337241278e-05, "loss": -0.086, "num_tokens": 16484787.0, "reward": 1.017473578453064, "reward_std": 0.23533397912979126, "rewards/argmax_reward_func/mean": 0.40625, "rewards/argmax_reward_func/std": 0.49899089336395264, "rewards/criterion_gradient_reward_func/mean": 0.42137983441352844, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.18984374403953552, "rewards/format_reward_func/std": 0.027576414868235588, "sampling/importance_sampling_ratio/max": 2.5149478912353516, "sampling/importance_sampling_ratio/mean": 0.8519099950790405, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1755292415618896, "sampling/sampling_logp_difference/mean": 0.01291737612336874, "step": 198, "step_time": 34.504471944001125 }, { "clip_ratio/high_max": 0.0010310479683539597, "clip_ratio/high_mean": 0.0010310479683539597, "clip_ratio/low_mean": 0.0006210875890246825, "clip_ratio/low_min": 0.0006210875890246825, "clip_ratio/region_mean": 0.0016521355573786423, "completions/clipped_ratio": 0.40625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 1057.71875, "completions/mean_terminated_length": 555.3157958984375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.43510067695751786, "epoch": 0.21237993596584845, "frac_reward_zero_std": 0.5625, "grad_norm": 0.030380066484212875, "kl": 0.0682327023241669, "learning_rate": 4.8695952130079126e-05, "loss": 0.0091, "num_tokens": 16570450.0, "reward": 1.214232087135315, "reward_std": 0.26737475395202637, "rewards/argmax_reward_func/mean": 0.5625, "rewards/argmax_reward_func/std": 0.504016101360321, "rewards/criterion_gradient_reward_func/mean": 0.4532944858074188, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.19843751192092896, "rewards/format_reward_func/std": 0.0088388342410326, "sampling/importance_sampling_ratio/max": 2.433450222015381, "sampling/importance_sampling_ratio/mean": 0.8577107191085815, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6997148990631104, "sampling/sampling_logp_difference/mean": 0.009758539497852325, "step": 199, "step_time": 32.133282171000246 }, { "clip_ratio/high_max": 0.0015859631057537626, "clip_ratio/high_mean": 0.0015859631057537626, "clip_ratio/low_mean": 0.000520224140927894, "clip_ratio/low_min": 0.000520224140927894, "clip_ratio/region_mean": 0.0021061872466816567, "completions/clipped_ratio": 0.3125, "completions/max_length": 1792.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 984.59375, "completions/mean_terminated_length": 617.5909423828125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4761558761820197, "epoch": 0.21344717182497333, "frac_reward_zero_std": 0.4375, "grad_norm": 0.025197388604283333, "kl": 0.038490127422846854, "learning_rate": 4.8682523937032823e-05, "loss": -0.0533, "num_tokens": 16654485.0, "reward": 1.0878987312316895, "reward_std": 0.3977475166320801, "rewards/argmax_reward_func/mean": 0.46875, "rewards/argmax_reward_func/std": 0.507007360458374, "rewards/criterion_gradient_reward_func/mean": 0.4191488027572632, "rewards/criterion_gradient_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.20000000298023224, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8967418670654297, "sampling/importance_sampling_ratio/mean": 0.5669448375701904, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5264902114868164, "sampling/sampling_logp_difference/mean": 0.011462414637207985, "step": 200, "step_time": 34.38070027500544 } ], "logging_steps": 1, "max_steps": 1874, "num_input_tokens_seen": 16654485, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }