{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00808, "eval_steps": 500, "global_step": 404, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2757.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 2474.625, "completions/mean_terminated_length": 2484.48388671875, "completions/min_length": 1847.0, "completions/min_terminated_length": 1847.0, "entropy": 0.6398179829120636, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.9939162731170654, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0017, "num_tokens": 102428.0, "reward": -0.06937500089406967, "reward_std": 0.1426866054534912, "rewards/rollout_reward_func/mean": -0.06937500089406967, "rewards/rollout_reward_func/std": 0.22933192551136017, "sampling/importance_sampling_ratio/max": 1.4089819192886353, "sampling/importance_sampling_ratio/mean": 0.976771354675293, "sampling/importance_sampling_ratio/min": 0.49511995911598206, "sampling/sampling_logp_difference/max": 0.6248791217803955, "sampling/sampling_logp_difference/mean": 0.02513751946389675, "step": 1, "step_time": 40.5301194210042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6398179829120636, "epoch": 4e-05, "grad_norm": 1.9967504739761353, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0017, "step": 2, "step_time": 8.158031945989933 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 2449.78125, "completions/mean_terminated_length": 2449.78125, "completions/min_length": 1730.0, "completions/min_terminated_length": 1730.0, "entropy": 0.5683682635426521, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.7206465005874634, "kl": 0.0009393466789333615, "learning_rate": 5.714285714285715e-07, "loss": -0.0564, "num_tokens": 204432.0, "reward": -0.07793749868869781, "reward_std": 0.19137459993362427, "rewards/rollout_reward_func/mean": -0.07793749868869781, "rewards/rollout_reward_func/std": 0.27293646335601807, "sampling/importance_sampling_ratio/max": 1.327203631401062, "sampling/importance_sampling_ratio/mean": 1.0219132900238037, "sampling/importance_sampling_ratio/min": 0.7237246036529541, "sampling/sampling_logp_difference/max": 0.2326061725616455, "sampling/sampling_logp_difference/mean": 0.02151086926460266, "step": 3, "step_time": 37.95992787499563 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0018939394503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038470644503831863, "entropy": 0.5685252919793129, "epoch": 8e-05, "grad_norm": 1.7116172313690186, "kl": 0.0009586924861650914, "learning_rate": 8.571428571428572e-07, "loss": -0.056, "step": 4, "step_time": 8.032366728002671 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2665.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 2506.28125, "completions/mean_terminated_length": 2506.28125, "completions/min_length": 2145.0, "completions/min_terminated_length": 2145.0, "entropy": 0.5751285180449486, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 2.0214176177978516, "kl": 0.0009553735217195936, "learning_rate": 1.142857142857143e-06, "loss": -0.0098, "num_tokens": 307992.0, "reward": -0.0325000025331974, "reward_std": 0.07031647861003876, "rewards/rollout_reward_func/mean": -0.0325000025331974, "rewards/rollout_reward_func/std": 0.07426369935274124, "sampling/importance_sampling_ratio/max": 1.2946679592132568, "sampling/importance_sampling_ratio/mean": 0.9828097224235535, "sampling/importance_sampling_ratio/min": 0.43161657452583313, "sampling/sampling_logp_difference/max": 0.3309330940246582, "sampling/sampling_logp_difference/mean": 0.022440873086452484, "step": 5, "step_time": 40.78049653198832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5760079547762871, "epoch": 0.00012, "grad_norm": 1.9722453355789185, "kl": 0.0010062552464660257, "learning_rate": 1.4285714285714286e-06, "loss": -0.0114, "step": 6, "step_time": 8.023996326002816 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0018939394503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038470644503831863, "completions/clipped_ratio": 0.0, "completions/max_length": 2772.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 2420.15625, "completions/mean_terminated_length": 2420.15625, "completions/min_length": 1002.0, "completions/min_terminated_length": 1002.0, "entropy": 0.5471096336841583, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 1.4982593059539795, "kl": 0.001003716250124853, "learning_rate": 1.7142857142857145e-06, "loss": 0.0061, "num_tokens": 409519.0, "reward": -0.0521249994635582, "reward_std": 0.2334682047367096, "rewards/rollout_reward_func/mean": -0.0521249994635582, "rewards/rollout_reward_func/std": 0.3047172725200653, "sampling/importance_sampling_ratio/max": 1.9076436758041382, "sampling/importance_sampling_ratio/mean": 1.0103974342346191, "sampling/importance_sampling_ratio/min": 0.5369899868965149, "sampling/sampling_logp_difference/max": 0.30723023414611816, "sampling/sampling_logp_difference/mean": 0.02194805070757866, "step": 7, "step_time": 36.75636156700784 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0039100684225559235, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009769443422555923, "entropy": 0.5471609607338905, "epoch": 0.00016, "grad_norm": 1.3890447616577148, "kl": 0.0008665668829053175, "learning_rate": 2.0000000000000003e-06, "loss": 0.0069, "step": 8, "step_time": 8.295392295018246 }, { "clip_ratio/high_max": 0.01119087846018374, "clip_ratio/high_mean": 0.00559543923009187, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00754856423009187, "completions/clipped_ratio": 0.0, "completions/max_length": 2707.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 2485.09375, "completions/mean_terminated_length": 2485.09375, "completions/min_length": 1610.0, "completions/min_terminated_length": 1610.0, "entropy": 0.5709630325436592, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 1.5354437828063965, "kl": 0.0010416117875138298, "learning_rate": 2.285714285714286e-06, "loss": 0.0544, "num_tokens": 512600.0, "reward": 0.010625001043081284, "reward_std": 0.14592772722244263, "rewards/rollout_reward_func/mean": 0.010625001043081284, "rewards/rollout_reward_func/std": 0.2494889795780182, "sampling/importance_sampling_ratio/max": 1.6029342412948608, "sampling/importance_sampling_ratio/mean": 0.9924947619438171, "sampling/importance_sampling_ratio/min": 0.556939959526062, "sampling/sampling_logp_difference/max": 0.3805968761444092, "sampling/sampling_logp_difference/mean": 0.024389563128352165, "step": 9, "step_time": 39.20236621199729 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.5720346607267857, "epoch": 0.0002, "grad_norm": 1.5500391721725464, "kl": 0.0010869200268643908, "learning_rate": 2.571428571428571e-06, "loss": 0.0545, "step": 10, "step_time": 8.161946275991795 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007218070677481592, "completions/clipped_ratio": 0.0, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 2482.46875, "completions/mean_terminated_length": 2482.46875, "completions/min_length": 2252.0, "completions/min_terminated_length": 2252.0, "entropy": 0.6314467415213585, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 1.8739888668060303, "kl": 0.0012773948546964675, "learning_rate": 2.8571428571428573e-06, "loss": 0.0387, "num_tokens": 615646.0, "reward": -0.031562499701976776, "reward_std": 0.0844234824180603, "rewards/rollout_reward_func/mean": -0.031562499701976776, "rewards/rollout_reward_func/std": 0.09391380101442337, "sampling/importance_sampling_ratio/max": 1.9019598960876465, "sampling/importance_sampling_ratio/mean": 0.9923563003540039, "sampling/importance_sampling_ratio/min": 0.5546591281890869, "sampling/sampling_logp_difference/max": 0.3334968090057373, "sampling/sampling_logp_difference/mean": 0.02209584228694439, "step": 11, "step_time": 40.81080743800703 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.6323637664318085, "epoch": 0.00024, "grad_norm": 1.8624191284179688, "kl": 0.0011840567749459296, "learning_rate": 3.142857142857143e-06, "loss": 0.0397, "step": 12, "step_time": 8.157768286997452 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2760.0, "completions/max_terminated_length": 2760.0, "completions/mean_length": 2491.75, "completions/mean_terminated_length": 2491.75, "completions/min_length": 1867.0, "completions/min_terminated_length": 1867.0, "entropy": 0.5670790821313858, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 2.0665578842163086, "kl": 0.0014767120446776971, "learning_rate": 3.428571428571429e-06, "loss": -0.0258, "num_tokens": 719436.0, "reward": -0.06818749755620956, "reward_std": 0.1440477967262268, "rewards/rollout_reward_func/mean": -0.06818749755620956, "rewards/rollout_reward_func/std": 0.1960759162902832, "sampling/importance_sampling_ratio/max": 1.743096113204956, "sampling/importance_sampling_ratio/mean": 1.0059804916381836, "sampling/importance_sampling_ratio/min": 0.5690397024154663, "sampling/sampling_logp_difference/max": 0.42962151765823364, "sampling/sampling_logp_difference/mean": 0.02228451520204544, "step": 13, "step_time": 40.541536634002114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5652061775326729, "epoch": 0.00028, "grad_norm": 2.1127512454986572, "kl": 0.0011144974414492026, "learning_rate": 3.7142857142857146e-06, "loss": -0.0274, "step": 14, "step_time": 8.263732075007283 }, { "clip_ratio/high_max": 0.007575757801532745, "clip_ratio/high_mean": 0.0037878789007663727, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005741003900766373, "completions/clipped_ratio": 0.0, "completions/max_length": 2691.0, "completions/max_terminated_length": 2691.0, "completions/mean_length": 2484.71875, "completions/mean_terminated_length": 2484.71875, "completions/min_length": 2008.0, "completions/min_terminated_length": 2008.0, "entropy": 0.597566194832325, "epoch": 0.0003, "frac_reward_zero_std": 0.0, "grad_norm": 2.0065197944641113, "kl": 0.0012788765816367231, "learning_rate": 4.000000000000001e-06, "loss": -0.0418, "num_tokens": 822717.0, "reward": -0.10331249237060547, "reward_std": 0.1455155909061432, "rewards/rollout_reward_func/mean": -0.10331249237060547, "rewards/rollout_reward_func/std": 0.20404654741287231, "sampling/importance_sampling_ratio/max": 1.4405966997146606, "sampling/importance_sampling_ratio/mean": 0.9779974818229675, "sampling/importance_sampling_ratio/min": 0.42821815609931946, "sampling/sampling_logp_difference/max": 0.4516181945800781, "sampling/sampling_logp_difference/mean": 0.023157190531492233, "step": 15, "step_time": 37.62107830101013 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.5933906957507133, "epoch": 0.00032, "grad_norm": 1.9404207468032837, "kl": 0.0014216557101462968, "learning_rate": 4.2857142857142855e-06, "loss": -0.0459, "step": 16, "step_time": 9.040029301002505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005093443673104048, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005093443673104048, "completions/clipped_ratio": 0.0, "completions/max_length": 2700.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 2493.53125, "completions/mean_terminated_length": 2493.53125, "completions/min_length": 2279.0, "completions/min_terminated_length": 2279.0, "entropy": 0.5841243341565132, "epoch": 0.00034, "frac_reward_zero_std": 0.0, "grad_norm": 2.7072598934173584, "kl": 0.0023420277575496584, "learning_rate": 4.571428571428572e-06, "loss": 0.0018, "num_tokens": 926571.0, "reward": -0.02068750001490116, "reward_std": 0.08841773867607117, "rewards/rollout_reward_func/mean": -0.02068750001490116, "rewards/rollout_reward_func/std": 0.09211999177932739, "sampling/importance_sampling_ratio/max": 1.8994961977005005, "sampling/importance_sampling_ratio/mean": 1.0768578052520752, "sampling/importance_sampling_ratio/min": 0.44597309827804565, "sampling/sampling_logp_difference/max": 0.5778663158416748, "sampling/sampling_logp_difference/mean": 0.02513628825545311, "step": 17, "step_time": 39.35067342498951 }, { "clip_ratio/high_max": 0.007582720601931214, "clip_ratio/high_mean": 0.003791360300965607, "clip_ratio/low_mean": 0.005744485300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009535845601931214, "entropy": 0.5797206610441208, "epoch": 0.00036, "grad_norm": 2.360910177230835, "kl": 0.0030427857127506286, "learning_rate": 4.857142857142858e-06, "loss": -0.0002, "step": 18, "step_time": 8.148258179004188 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 2509.125, "completions/mean_terminated_length": 2509.125, "completions/min_length": 2229.0, "completions/min_terminated_length": 2229.0, "entropy": 0.5699355229735374, "epoch": 0.00038, "frac_reward_zero_std": 0.0, "grad_norm": 1.7883220911026, "kl": 0.0033391022589057684, "learning_rate": 5.142857142857142e-06, "loss": -0.024, "num_tokens": 1030914.0, "reward": -0.04781249538064003, "reward_std": 0.11873149871826172, "rewards/rollout_reward_func/mean": -0.04781249538064003, "rewards/rollout_reward_func/std": 0.18549835681915283, "sampling/importance_sampling_ratio/max": 1.7033848762512207, "sampling/importance_sampling_ratio/mean": 1.0213618278503418, "sampling/importance_sampling_ratio/min": 0.5055848360061646, "sampling/sampling_logp_difference/max": 0.3674435615539551, "sampling/sampling_logp_difference/mean": 0.022144002839922905, "step": 19, "step_time": 40.42446762702457 }, { "clip_ratio/high_max": 0.006623641354963183, "clip_ratio/high_mean": 0.0033118206774815917, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007218070677481592, "entropy": 0.5630137547850609, "epoch": 0.0004, "grad_norm": 1.659043550491333, "kl": 0.004340015002526343, "learning_rate": 5.428571428571429e-06, "loss": -0.0286, "step": 20, "step_time": 8.287577473987767 }, { "clip_ratio/high_max": 0.016526442486792803, "clip_ratio/high_mean": 0.009621917037293315, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009621917037293315, "completions/clipped_ratio": 0.0, "completions/max_length": 2694.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 2414.625, "completions/mean_terminated_length": 2414.625, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "entropy": 0.5076057054102421, "epoch": 0.00042, "frac_reward_zero_std": 0.0, "grad_norm": 1.7286955118179321, "kl": 0.003812528128037229, "learning_rate": 5.7142857142857145e-06, "loss": -0.0864, "num_tokens": 1131985.0, "reward": -0.07437499612569809, "reward_std": 0.15552687644958496, "rewards/rollout_reward_func/mean": -0.07437499612569809, "rewards/rollout_reward_func/std": 0.24696791172027588, "sampling/importance_sampling_ratio/max": 1.4717481136322021, "sampling/importance_sampling_ratio/mean": 0.9868855476379395, "sampling/importance_sampling_ratio/min": 0.5080141425132751, "sampling/sampling_logp_difference/max": 0.33080339431762695, "sampling/sampling_logp_difference/mean": 0.023507488891482353, "step": 21, "step_time": 38.73707472301612 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.49742893874645233, "epoch": 0.00044, "grad_norm": 1.7899357080459595, "kl": 0.005958152993116528, "learning_rate": 6e-06, "loss": -0.0874, "step": 22, "step_time": 8.990655720990617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 2447.125, "completions/mean_terminated_length": 2447.125, "completions/min_length": 1353.0, "completions/min_terminated_length": 1353.0, "entropy": 0.5160082057118416, "epoch": 0.00046, "frac_reward_zero_std": 0.0, "grad_norm": 1.897569179534912, "kl": 0.009750543569680303, "learning_rate": 6.285714285714286e-06, "loss": -0.0297, "num_tokens": 1233712.0, "reward": -0.06949999928474426, "reward_std": 0.13460204005241394, "rewards/rollout_reward_func/mean": -0.06949999928474426, "rewards/rollout_reward_func/std": 0.19123773276805878, "sampling/importance_sampling_ratio/max": 1.734204649925232, "sampling/importance_sampling_ratio/mean": 1.0283379554748535, "sampling/importance_sampling_ratio/min": 0.6807689666748047, "sampling/sampling_logp_difference/max": 0.32382988929748535, "sampling/sampling_logp_difference/mean": 0.025856411084532738, "step": 23, "step_time": 38.87603813601163 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.013556985300965607, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.015510110184550285, "entropy": 0.5070139579474926, "epoch": 0.00048, "grad_norm": 1.6847742795944214, "kl": 0.013837641919963062, "learning_rate": 6.571428571428572e-06, "loss": -0.0327, "step": 24, "step_time": 8.237429195003642 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 2477.65625, "completions/mean_terminated_length": 2477.65625, "completions/min_length": 2004.0, "completions/min_terminated_length": 2004.0, "entropy": 0.46570198982954025, "epoch": 0.0005, "frac_reward_zero_std": 0.0, "grad_norm": 1.8208125829696655, "kl": 0.017063510487787426, "learning_rate": 6.857142857142858e-06, "loss": -0.0471, "num_tokens": 1336013.0, "reward": -0.041374996304512024, "reward_std": 0.13650161027908325, "rewards/rollout_reward_func/mean": -0.041374996304512024, "rewards/rollout_reward_func/std": 0.20301242172718048, "sampling/importance_sampling_ratio/max": 1.717456579208374, "sampling/importance_sampling_ratio/mean": 0.9826828241348267, "sampling/importance_sampling_ratio/min": 0.5129362344741821, "sampling/sampling_logp_difference/max": 0.6331992149353027, "sampling/sampling_logp_difference/mean": 0.03637850284576416, "step": 25, "step_time": 38.30766316399968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.453332532197237, "epoch": 0.00052, "grad_norm": 1.9915790557861328, "kl": 0.022920054849237204, "learning_rate": 7.1428571428571436e-06, "loss": -0.049, "step": 26, "step_time": 8.041683328003273 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 2544.25, "completions/mean_terminated_length": 2544.25, "completions/min_length": 2150.0, "completions/min_terminated_length": 2150.0, "entropy": 0.4926302433013916, "epoch": 0.00054, "frac_reward_zero_std": 0.0, "grad_norm": 1.7289246320724487, "kl": 0.02990493644028902, "learning_rate": 7.428571428571429e-06, "loss": 0.0072, "num_tokens": 1441197.0, "reward": -0.06387500464916229, "reward_std": 0.10385941714048386, "rewards/rollout_reward_func/mean": -0.06387500464916229, "rewards/rollout_reward_func/std": 0.16128110885620117, "sampling/importance_sampling_ratio/max": 1.4102411270141602, "sampling/importance_sampling_ratio/mean": 0.8670729398727417, "sampling/importance_sampling_ratio/min": 0.3404218554496765, "sampling/sampling_logp_difference/max": 0.7324519157409668, "sampling/sampling_logp_difference/mean": 0.04336021840572357, "step": 27, "step_time": 39.86006593199272 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.009598214295692742, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.01350446417927742, "entropy": 0.4816759377717972, "epoch": 0.00056, "grad_norm": 1.775354266166687, "kl": 0.044567104894667864, "learning_rate": 7.714285714285716e-06, "loss": 0.003, "step": 28, "step_time": 8.571332884996082 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 2481.5625, "completions/mean_terminated_length": 2481.5625, "completions/min_length": 1996.0, "completions/min_terminated_length": 1996.0, "entropy": 0.42902951687574387, "epoch": 0.00058, "frac_reward_zero_std": 0.0, "grad_norm": 1.6338751316070557, "kl": 0.05208651162683964, "learning_rate": 8.000000000000001e-06, "loss": -0.1571, "num_tokens": 1543722.0, "reward": -0.007187500596046448, "reward_std": 0.14660096168518066, "rewards/rollout_reward_func/mean": -0.007187500596046448, "rewards/rollout_reward_func/std": 0.22778363525867462, "sampling/importance_sampling_ratio/max": 2.0380847454071045, "sampling/importance_sampling_ratio/mean": 0.9687550067901611, "sampling/importance_sampling_ratio/min": 0.2778550982475281, "sampling/sampling_logp_difference/max": 1.2108449935913086, "sampling/sampling_logp_difference/mean": 0.05287490040063858, "step": 29, "step_time": 38.6119481420028 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.009895833441987634, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013802083441987634, "entropy": 0.4154880531132221, "epoch": 0.0006, "grad_norm": 1.6772791147232056, "kl": 0.0775398297701031, "learning_rate": 8.285714285714287e-06, "loss": -0.1612, "step": 30, "step_time": 8.044421385988244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353009656072, "completions/clipped_ratio": 0.0, "completions/max_length": 2710.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 2503.375, "completions/mean_terminated_length": 2503.375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.40555064752697945, "epoch": 0.00062, "frac_reward_zero_std": 0.0, "grad_norm": 1.574500560760498, "kl": 0.05575865495484322, "learning_rate": 8.571428571428571e-06, "loss": -0.0645, "num_tokens": 1647517.0, "reward": -0.04531250149011612, "reward_std": 0.14535784721374512, "rewards/rollout_reward_func/mean": -0.04531250149011612, "rewards/rollout_reward_func/std": 0.2093304991722107, "sampling/importance_sampling_ratio/max": 1.7538701295852661, "sampling/importance_sampling_ratio/mean": 0.8808121681213379, "sampling/importance_sampling_ratio/min": 0.2845723628997803, "sampling/sampling_logp_difference/max": 0.8374984264373779, "sampling/sampling_logp_difference/mean": 0.0564584843814373, "step": 31, "step_time": 38.74203074599063 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.003791360300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006916360347531736, "entropy": 0.3967522084712982, "epoch": 0.00064, "grad_norm": 1.582572340965271, "kl": 0.06716000568121672, "learning_rate": 8.857142857142858e-06, "loss": -0.0667, "step": 32, "step_time": 8.189764875001856 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005744485300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011603860300965607, "completions/clipped_ratio": 0.0, "completions/max_length": 2688.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 2415.21875, "completions/mean_terminated_length": 2415.21875, "completions/min_length": 1304.0, "completions/min_terminated_length": 1304.0, "entropy": 0.3761625960469246, "epoch": 0.00066, "frac_reward_zero_std": 0.0, "grad_norm": 1.277189016342163, "kl": 0.06961325742304325, "learning_rate": 9.142857142857144e-06, "loss": -0.0156, "num_tokens": 1748089.0, "reward": -0.0175624992698431, "reward_std": 0.22900649905204773, "rewards/rollout_reward_func/mean": -0.0175624992698431, "rewards/rollout_reward_func/std": 0.31228137016296387, "sampling/importance_sampling_ratio/max": 2.0467588901519775, "sampling/importance_sampling_ratio/mean": 0.9604863524436951, "sampling/importance_sampling_ratio/min": 0.22416925430297852, "sampling/sampling_logp_difference/max": 1.0932765007019043, "sampling/sampling_logp_difference/mean": 0.06231032684445381, "step": 33, "step_time": 37.7063103410037 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.013606223976239562, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.02337184874340892, "entropy": 0.36862019822001457, "epoch": 0.00068, "grad_norm": 1.103268027305603, "kl": 0.08232864388264716, "learning_rate": 9.42857142857143e-06, "loss": -0.0152, "step": 34, "step_time": 8.04393709400756 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005264945677481592, "completions/clipped_ratio": 0.0, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 2458.0, "completions/mean_terminated_length": 2458.0, "completions/min_length": 1005.0, "completions/min_terminated_length": 1005.0, "entropy": 0.3750050254166126, "epoch": 0.0007, "frac_reward_zero_std": 0.0, "grad_norm": 1.6973440647125244, "kl": 0.06921286834403872, "learning_rate": 9.714285714285715e-06, "loss": -0.1278, "num_tokens": 1850470.0, "reward": -0.0572500079870224, "reward_std": 0.3149961531162262, "rewards/rollout_reward_func/mean": -0.0572500079870224, "rewards/rollout_reward_func/std": 0.40404173731803894, "sampling/importance_sampling_ratio/max": 2.6847193241119385, "sampling/importance_sampling_ratio/mean": 1.0530672073364258, "sampling/importance_sampling_ratio/min": 0.1825435906648636, "sampling/sampling_logp_difference/max": 1.4770822525024414, "sampling/sampling_logp_difference/mean": 0.06285598129034042, "step": 35, "step_time": 38.96468219499366 }, { "clip_ratio/high_max": 0.020833333488553762, "clip_ratio/high_mean": 0.010416666744276881, "clip_ratio/low_mean": 0.005205760127864778, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015622426988556981, "entropy": 0.36646367236971855, "epoch": 0.00072, "grad_norm": 1.709372878074646, "kl": 0.08743807720020413, "learning_rate": 1e-05, "loss": -0.129, "step": 36, "step_time": 8.023841024005378 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 2519.6875, "completions/mean_terminated_length": 2519.6875, "completions/min_length": 2134.0, "completions/min_terminated_length": 2134.0, "entropy": 0.34471601620316505, "epoch": 0.00074, "frac_reward_zero_std": 0.0, "grad_norm": 2.450613021850586, "kl": 0.33715078979730606, "learning_rate": 9.999999998148153e-06, "loss": -0.2702, "num_tokens": 1954652.0, "reward": -0.017500001937150955, "reward_std": 0.07098734378814697, "rewards/rollout_reward_func/mean": -0.017500001937150955, "rewards/rollout_reward_func/std": 0.08203067630529404, "sampling/importance_sampling_ratio/max": 2.792572498321533, "sampling/importance_sampling_ratio/mean": 0.9947938919067383, "sampling/importance_sampling_ratio/min": 0.13970009982585907, "sampling/sampling_logp_difference/max": 2.3089332580566406, "sampling/sampling_logp_difference/mean": 0.08333279192447662, "step": 37, "step_time": 39.65462483598094 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3357379361987114, "epoch": 0.00076, "grad_norm": 2.3665034770965576, "kl": 0.45696336030960083, "learning_rate": 9.999999992592613e-06, "loss": -0.276, "step": 38, "step_time": 8.107524030987406 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006138392956927419, "completions/clipped_ratio": 0.0, "completions/max_length": 2630.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 2378.6875, "completions/mean_terminated_length": 2378.6875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "entropy": 0.3408471681177616, "epoch": 0.00078, "frac_reward_zero_std": 0.0, "grad_norm": 3.7294774055480957, "kl": 0.16567623522132635, "learning_rate": 9.999999983333379e-06, "loss": -0.0684, "num_tokens": 2054728.0, "reward": -0.14356249570846558, "reward_std": 0.20076312124729156, "rewards/rollout_reward_func/mean": -0.14356249570846558, "rewards/rollout_reward_func/std": 0.34942853450775146, "sampling/importance_sampling_ratio/max": 2.2133710384368896, "sampling/importance_sampling_ratio/mean": 0.9344292879104614, "sampling/importance_sampling_ratio/min": 0.10769516229629517, "sampling/sampling_logp_difference/max": 1.4032087326049805, "sampling/sampling_logp_difference/mean": 0.08816322684288025, "step": 39, "step_time": 40.239868925993505 }, { "clip_ratio/high_max": 0.027901785913854837, "clip_ratio/high_mean": 0.018136160681024194, "clip_ratio/low_mean": 0.014892988605424762, "clip_ratio/low_min": 0.008370535913854837, "clip_ratio/region_mean": 0.0330291495192796, "entropy": 0.3337853290140629, "epoch": 0.0008, "grad_norm": 3.1899514198303223, "kl": 0.2171315811574459, "learning_rate": 9.999999970370451e-06, "loss": -0.0714, "step": 40, "step_time": 8.054502869999851 }, { "clip_ratio/high_max": 0.020089285913854837, "clip_ratio/high_mean": 0.011997767956927419, "clip_ratio/low_mean": 0.0018939394503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013891707407310605, "completions/clipped_ratio": 0.0, "completions/max_length": 2690.0, "completions/max_terminated_length": 2690.0, "completions/mean_length": 2504.5, "completions/mean_terminated_length": 2504.5, "completions/min_length": 1868.0, "completions/min_terminated_length": 1868.0, "entropy": 0.3315769322216511, "epoch": 0.00082, "frac_reward_zero_std": 0.0, "grad_norm": 2.1958541870117188, "kl": 0.556195599026978, "learning_rate": 9.99999995370383e-06, "loss": 0.1613, "num_tokens": 2158491.0, "reward": -0.03606250509619713, "reward_std": 0.12721404433250427, "rewards/rollout_reward_func/mean": -0.03606250509619713, "rewards/rollout_reward_func/std": 0.2096494734287262, "sampling/importance_sampling_ratio/max": 2.9438412189483643, "sampling/importance_sampling_ratio/mean": 1.0292738676071167, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.6189584732055664, "sampling/sampling_logp_difference/mean": 0.09665805101394653, "step": 41, "step_time": 40.6466921770043 }, { "clip_ratio/high_max": 0.03548450651578605, "clip_ratio/high_mean": 0.017742253257893026, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017742253257893026, "entropy": 0.33141712099313736, "epoch": 0.00084, "grad_norm": 2.158820390701294, "kl": 0.4570291112177074, "learning_rate": 9.999999933333514e-06, "loss": 0.1573, "step": 42, "step_time": 8.173876360000577 }, { "clip_ratio/high_max": 0.019301470601931214, "clip_ratio/high_mean": 0.009650735300965607, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011882878257893026, "completions/clipped_ratio": 0.0, "completions/max_length": 2684.0, "completions/max_terminated_length": 2684.0, "completions/mean_length": 2314.28125, "completions/mean_terminated_length": 2314.28125, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 0.3161774128675461, "epoch": 0.00086, "frac_reward_zero_std": 0.0, "grad_norm": 2.4876537322998047, "kl": 0.5472150244750082, "learning_rate": 9.999999909259504e-06, "loss": -0.0026, "num_tokens": 2256178.0, "reward": -0.11968749761581421, "reward_std": 0.24497605860233307, "rewards/rollout_reward_func/mean": -0.11968749761581421, "rewards/rollout_reward_func/std": 0.3643242418766022, "sampling/importance_sampling_ratio/max": 2.8263914585113525, "sampling/importance_sampling_ratio/mean": 1.0258629322052002, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.7011332511901855, "sampling/sampling_logp_difference/mean": 0.10345172137022018, "step": 43, "step_time": 38.945439303992316 }, { "clip_ratio/high_max": 0.013494318351149559, "clip_ratio/high_mean": 0.01065340917557478, "clip_ratio/low_mean": 0.012890624813735485, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.023544033989310265, "entropy": 0.3204997628927231, "epoch": 0.00088, "grad_norm": 2.031017780303955, "kl": 0.5513593647629023, "learning_rate": 9.9999998814818e-06, "loss": -0.0065, "step": 44, "step_time": 8.607130141004745 }, { "clip_ratio/high_max": 0.025781250093132257, "clip_ratio/high_mean": 0.014843750046566129, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01875000004656613, "completions/clipped_ratio": 0.0, "completions/max_length": 2688.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 2464.5625, "completions/mean_terminated_length": 2464.5625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.35709187015891075, "epoch": 0.0009, "frac_reward_zero_std": 0.0, "grad_norm": 2.1241366863250732, "kl": 0.2615460283122957, "learning_rate": 9.999999850000403e-06, "loss": -0.071, "num_tokens": 2358679.0, "reward": -0.054625004529953, "reward_std": 0.10086569935083389, "rewards/rollout_reward_func/mean": -0.054625004529953, "rewards/rollout_reward_func/std": 0.15416575968265533, "sampling/importance_sampling_ratio/max": 2.5754051208496094, "sampling/importance_sampling_ratio/mean": 1.00569486618042, "sampling/importance_sampling_ratio/min": 0.007187636569142342, "sampling/sampling_logp_difference/max": 3.3626089096069336, "sampling/sampling_logp_difference/mean": 0.10147839784622192, "step": 45, "step_time": 39.38599755401083 }, { "clip_ratio/high_max": 0.025781250093132257, "clip_ratio/high_mean": 0.01679687504656613, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02460937504656613, "entropy": 0.3584505319595337, "epoch": 0.00092, "grad_norm": 1.9520725011825562, "kl": 0.2969521852210164, "learning_rate": 9.999999814815314e-06, "loss": -0.069, "step": 46, "step_time": 8.046390703013458 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 2508.46875, "completions/mean_terminated_length": 2508.46875, "completions/min_length": 2281.0, "completions/min_terminated_length": 2281.0, "entropy": 0.3446594066917896, "epoch": 0.00094, "frac_reward_zero_std": 0.0, "grad_norm": 3.154435873031616, "kl": 0.28098534140735865, "learning_rate": 9.99999977592653e-06, "loss": -0.0095, "num_tokens": 2462728.0, "reward": -0.007187499664723873, "reward_std": 0.05442311242222786, "rewards/rollout_reward_func/mean": -0.007187499664723873, "rewards/rollout_reward_func/std": 0.05979720130562782, "sampling/importance_sampling_ratio/max": 2.8695878982543945, "sampling/importance_sampling_ratio/mean": 0.9559117555618286, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.546647071838379, "sampling/sampling_logp_difference/mean": 0.10615775734186172, "step": 47, "step_time": 41.4313356499988 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.34684762358665466, "epoch": 0.00096, "grad_norm": 3.2517645359039307, "kl": 0.24896481167525053, "learning_rate": 9.999999733334051e-06, "loss": -0.0127, "step": 48, "step_time": 8.04854733600223 }, { "clip_ratio/high_max": 0.01499417726881802, "clip_ratio/high_mean": 0.009450213401578367, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015309588401578367, "completions/clipped_ratio": 0.0, "completions/max_length": 2774.0, "completions/max_terminated_length": 2774.0, "completions/mean_length": 2508.15625, "completions/mean_terminated_length": 2508.15625, "completions/min_length": 1637.0, "completions/min_terminated_length": 1637.0, "entropy": 0.44613758474588394, "epoch": 0.00098, "frac_reward_zero_std": 0.0, "grad_norm": 1.6932541131973267, "kl": 0.1522258846089244, "learning_rate": 9.99999968703788e-06, "loss": -0.0828, "num_tokens": 2566857.0, "reward": -0.01862499676644802, "reward_std": 0.17793361842632294, "rewards/rollout_reward_func/mean": -0.01862499676644802, "rewards/rollout_reward_func/std": 0.2609236538410187, "sampling/importance_sampling_ratio/max": 2.0118746757507324, "sampling/importance_sampling_ratio/mean": 0.6943110227584839, "sampling/importance_sampling_ratio/min": 0.10269977152347565, "sampling/sampling_logp_difference/max": 1.4632587432861328, "sampling/sampling_logp_difference/mean": 0.08679313212633133, "step": 49, "step_time": 38.579419355992286 }, { "clip_ratio/high_max": 0.0390625, "clip_ratio/high_mean": 0.020889945793896914, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.032608695793896914, "entropy": 0.44533828645944595, "epoch": 0.001, "grad_norm": 1.3143709897994995, "kl": 0.13447811640799046, "learning_rate": 9.999999637038016e-06, "loss": -0.085, "step": 50, "step_time": 9.22721716301021 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 2454.53125, "completions/mean_terminated_length": 2454.53125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "entropy": 0.31926462426781654, "epoch": 0.00102, "frac_reward_zero_std": 0.125, "grad_norm": 1.8239920139312744, "kl": 0.1380001651123166, "learning_rate": 9.999999583334458e-06, "loss": 0.0342, "num_tokens": 2668420.0, "reward": -0.10174999386072159, "reward_std": 0.16292209923267365, "rewards/rollout_reward_func/mean": -0.10174999386072159, "rewards/rollout_reward_func/std": 0.30493563413619995, "sampling/importance_sampling_ratio/max": 2.260237216949463, "sampling/importance_sampling_ratio/mean": 0.9161804914474487, "sampling/importance_sampling_ratio/min": 0.1295454055070877, "sampling/sampling_logp_difference/max": 1.4744603633880615, "sampling/sampling_logp_difference/mean": 0.087027907371521, "step": 51, "step_time": 40.09692535500653 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.01953125, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.03125, "entropy": 0.31379691883921623, "epoch": 0.00104, "grad_norm": 1.4041469097137451, "kl": 0.11381018441170454, "learning_rate": 9.999999525927207e-06, "loss": 0.0289, "step": 52, "step_time": 8.14323568900727 }, { "clip_ratio/high_max": 0.011600378900766373, "clip_ratio/high_mean": 0.005800189450383186, "clip_ratio/low_mean": 0.009895833441987634, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01569602289237082, "completions/clipped_ratio": 0.0, "completions/max_length": 2827.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 2525.75, "completions/mean_terminated_length": 2525.75, "completions/min_length": 1651.0, "completions/min_terminated_length": 1651.0, "entropy": 0.3053863197565079, "epoch": 0.00106, "frac_reward_zero_std": 0.0, "grad_norm": 2.0077009201049805, "kl": 0.22236851323395967, "learning_rate": 9.999999464816262e-06, "loss": -0.1174, "num_tokens": 2772803.0, "reward": -0.023000001907348633, "reward_std": 0.2369600236415863, "rewards/rollout_reward_func/mean": -0.023000001907348633, "rewards/rollout_reward_func/std": 0.3612251579761505, "sampling/importance_sampling_ratio/max": 2.7046000957489014, "sampling/importance_sampling_ratio/mean": 0.9783951640129089, "sampling/importance_sampling_ratio/min": 0.022106723859906197, "sampling/sampling_logp_difference/max": 2.115330696105957, "sampling/sampling_logp_difference/mean": 0.07762297242879868, "step": 53, "step_time": 39.43046704398148 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "entropy": 0.3010335825383663, "epoch": 0.00108, "grad_norm": 2.0416150093078613, "kl": 0.24721599649637938, "learning_rate": 9.999999400001624e-06, "loss": -0.1213, "step": 54, "step_time": 8.363815268006874 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011410361854359508, "completions/clipped_ratio": 0.0, "completions/max_length": 2618.0, "completions/max_terminated_length": 2618.0, "completions/mean_length": 2366.34375, "completions/mean_terminated_length": 2366.34375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.33765238150954247, "epoch": 0.0011, "frac_reward_zero_std": 0.0, "grad_norm": 1.974804162979126, "kl": 0.18207258731126785, "learning_rate": 9.999999331483293e-06, "loss": 0.0599, "num_tokens": 2871736.0, "reward": -0.10312500596046448, "reward_std": 0.28904440999031067, "rewards/rollout_reward_func/mean": -0.10312500596046448, "rewards/rollout_reward_func/std": 0.4227745532989502, "sampling/importance_sampling_ratio/max": 2.3023340702056885, "sampling/importance_sampling_ratio/mean": 0.8361672163009644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.0252685546875, "sampling/sampling_logp_difference/mean": 0.1051572933793068, "step": 55, "step_time": 38.05553440601216 }, { "clip_ratio/high_max": 0.022820723708719015, "clip_ratio/high_mean": 0.013363486854359508, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019222861854359508, "entropy": 0.33843234553933144, "epoch": 0.00112, "grad_norm": 2.0378940105438232, "kl": 0.17897878028452396, "learning_rate": 9.999999259261269e-06, "loss": 0.0585, "step": 56, "step_time": 7.9421927529911045 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 2515.8125, "completions/mean_terminated_length": 2515.8125, "completions/min_length": 1867.0, "completions/min_terminated_length": 1867.0, "entropy": 0.32060882076621056, "epoch": 0.00114, "frac_reward_zero_std": 0.0, "grad_norm": 2.0523271560668945, "kl": 0.23650522576645017, "learning_rate": 9.999999183335551e-06, "loss": 0.1251, "num_tokens": 2976047.0, "reward": -0.0755000039935112, "reward_std": 0.1690390408039093, "rewards/rollout_reward_func/mean": -0.0755000039935112, "rewards/rollout_reward_func/std": 0.2713806927204132, "sampling/importance_sampling_ratio/max": 2.124682664871216, "sampling/importance_sampling_ratio/mean": 0.9381376504898071, "sampling/importance_sampling_ratio/min": 0.15374599397182465, "sampling/sampling_logp_difference/max": 1.6065802574157715, "sampling/sampling_logp_difference/mean": 0.07379056513309479, "step": 57, "step_time": 40.01717420799832 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "entropy": 0.3272933140397072, "epoch": 0.00116, "grad_norm": 1.6893479824066162, "kl": 0.20167131815105677, "learning_rate": 9.999999103706142e-06, "loss": 0.1206, "step": 58, "step_time": 8.224727661014185 }, { "clip_ratio/high_max": 0.016927083488553762, "clip_ratio/high_mean": 0.008463541744276881, "clip_ratio/low_mean": 0.003791360300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012254901928827167, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 2365.625, "completions/mean_terminated_length": 2365.625, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "entropy": 0.328445702791214, "epoch": 0.00118, "frac_reward_zero_std": 0.0, "grad_norm": 1.7868247032165527, "kl": 0.2537320605479181, "learning_rate": 9.999999020373038e-06, "loss": -0.0306, "num_tokens": 3074895.0, "reward": -0.13575001060962677, "reward_std": 0.2329138219356537, "rewards/rollout_reward_func/mean": -0.13575001060962677, "rewards/rollout_reward_func/std": 0.3464026153087616, "sampling/importance_sampling_ratio/max": 1.9202526807785034, "sampling/importance_sampling_ratio/mean": 0.8327348232269287, "sampling/importance_sampling_ratio/min": 0.11179676651954651, "sampling/sampling_logp_difference/max": 1.572983741760254, "sampling/sampling_logp_difference/mean": 0.07091033458709717, "step": 59, "step_time": 37.41163979900739 }, { "clip_ratio/high_max": 0.04427083348855376, "clip_ratio/high_mean": 0.02408854174427688, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 0.3374003656208515, "epoch": 0.0012, "grad_norm": 1.3850635290145874, "kl": 0.21838521771132946, "learning_rate": 9.999998933336242e-06, "loss": -0.0349, "step": 60, "step_time": 8.157472340019012 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 2783.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 2484.78125, "completions/mean_terminated_length": 2484.78125, "completions/min_length": 1034.0, "completions/min_terminated_length": 1034.0, "entropy": 0.3523460924625397, "epoch": 0.00122, "frac_reward_zero_std": 0.0, "grad_norm": 2.1732754707336426, "kl": 0.06509718182496727, "learning_rate": 9.999998842595754e-06, "loss": -0.0152, "num_tokens": 3178521.0, "reward": -0.08306249976158142, "reward_std": 0.19620290398597717, "rewards/rollout_reward_func/mean": -0.08306249976158142, "rewards/rollout_reward_func/std": 0.258004754781723, "sampling/importance_sampling_ratio/max": 2.3492777347564697, "sampling/importance_sampling_ratio/mean": 1.0118275880813599, "sampling/importance_sampling_ratio/min": 0.2545172870159149, "sampling/sampling_logp_difference/max": 0.9898586273193359, "sampling/sampling_logp_difference/mean": 0.05905335396528244, "step": 61, "step_time": 41.6817599410133 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.016276041977107525, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.027994791977107525, "entropy": 0.35907527431845665, "epoch": 0.00124, "grad_norm": 1.8089604377746582, "kl": 0.07727690786123276, "learning_rate": 9.999998748151573e-06, "loss": -0.0186, "step": 62, "step_time": 8.292622140004823 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 2538.625, "completions/mean_terminated_length": 2538.625, "completions/min_length": 2284.0, "completions/min_terminated_length": 2284.0, "entropy": 0.3591052442789078, "epoch": 0.00126, "frac_reward_zero_std": 0.0, "grad_norm": 1.542479157447815, "kl": 0.18014016561210155, "learning_rate": 9.999998650003697e-06, "loss": -0.123, "num_tokens": 3283395.0, "reward": 0.009062500670552254, "reward_std": 0.06245460733771324, "rewards/rollout_reward_func/mean": 0.009062500670552254, "rewards/rollout_reward_func/std": 0.06502402573823929, "sampling/importance_sampling_ratio/max": 2.2381744384765625, "sampling/importance_sampling_ratio/mean": 0.9013581871986389, "sampling/importance_sampling_ratio/min": 0.09256737679243088, "sampling/sampling_logp_difference/max": 1.322446346282959, "sampling/sampling_logp_difference/mean": 0.07073262333869934, "step": 63, "step_time": 41.169668218979496 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.3636559210717678, "epoch": 0.00128, "grad_norm": 2.6269946098327637, "kl": 0.17624082788825035, "learning_rate": 9.999998548152132e-06, "loss": -0.125, "step": 64, "step_time": 8.296477131996653 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2689.0, "completions/max_terminated_length": 2689.0, "completions/mean_length": 2548.90625, "completions/mean_terminated_length": 2548.90625, "completions/min_length": 2301.0, "completions/min_terminated_length": 2301.0, "entropy": 0.3906066454946995, "epoch": 0.0013, "frac_reward_zero_std": 0.0, "grad_norm": 2.987762212753296, "kl": 0.13116181548684835, "learning_rate": 9.999998442596872e-06, "loss": 0.016, "num_tokens": 3388504.0, "reward": -0.021375000476837158, "reward_std": 0.125992089509964, "rewards/rollout_reward_func/mean": -0.021375000476837158, "rewards/rollout_reward_func/std": 0.19460426270961761, "sampling/importance_sampling_ratio/max": 2.4538660049438477, "sampling/importance_sampling_ratio/mean": 0.9186806082725525, "sampling/importance_sampling_ratio/min": 0.14261139929294586, "sampling/sampling_logp_difference/max": 1.1358554363250732, "sampling/sampling_logp_difference/mean": 0.07268854975700378, "step": 65, "step_time": 40.21691108200321 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.01953125, "entropy": 0.3885869197547436, "epoch": 0.00132, "grad_norm": 2.8780970573425293, "kl": 0.13104048231616616, "learning_rate": 9.999998333337923e-06, "loss": 0.0126, "step": 66, "step_time": 8.100458256005368 }, { "clip_ratio/high_max": 0.020833333488553762, "clip_ratio/high_mean": 0.01627604174427688, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01627604174427688, "completions/clipped_ratio": 0.0, "completions/max_length": 2643.0, "completions/max_terminated_length": 2643.0, "completions/mean_length": 2450.53125, "completions/mean_terminated_length": 2450.53125, "completions/min_length": 1017.0, "completions/min_terminated_length": 1017.0, "entropy": 0.3505115546286106, "epoch": 0.00134, "frac_reward_zero_std": 0.125, "grad_norm": 1.603503704071045, "kl": 0.17773088440299034, "learning_rate": 9.99999822037528e-06, "loss": -0.0312, "num_tokens": 3490536.0, "reward": -0.06999999284744263, "reward_std": 0.17571845650672913, "rewards/rollout_reward_func/mean": -0.06999999284744263, "rewards/rollout_reward_func/std": 0.2663976848125458, "sampling/importance_sampling_ratio/max": 2.7864060401916504, "sampling/importance_sampling_ratio/mean": 0.9576973915100098, "sampling/importance_sampling_ratio/min": 0.23718009889125824, "sampling/sampling_logp_difference/max": 1.3010015487670898, "sampling/sampling_logp_difference/mean": 0.07168266922235489, "step": 67, "step_time": 40.63574154300295 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.3506194017827511, "epoch": 0.00136, "grad_norm": 1.6443591117858887, "kl": 0.1914054024964571, "learning_rate": 9.999998103708944e-06, "loss": -0.0329, "step": 68, "step_time": 7.992264769010944 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 2423.375, "completions/mean_terminated_length": 2423.375, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "entropy": 0.35105321556329727, "epoch": 0.00138, "frac_reward_zero_std": 0.0, "grad_norm": 1.6509017944335938, "kl": 0.13090949086472392, "learning_rate": 9.999997983338918e-06, "loss": 0.0782, "num_tokens": 3591770.0, "reward": -0.05456249788403511, "reward_std": 0.1812342405319214, "rewards/rollout_reward_func/mean": -0.05456249788403511, "rewards/rollout_reward_func/std": 0.2536933422088623, "sampling/importance_sampling_ratio/max": 2.6188161373138428, "sampling/importance_sampling_ratio/mean": 0.9236185550689697, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9868068695068359, "sampling/sampling_logp_difference/mean": 0.07983951270580292, "step": 69, "step_time": 40.37906555600057 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.005744485300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015510110184550285, "entropy": 0.35209859535098076, "epoch": 0.0014, "grad_norm": 1.7578048706054688, "kl": 0.13869365211576223, "learning_rate": 9.999997859265198e-06, "loss": 0.0777, "step": 70, "step_time": 8.326206992009247 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005468750023283064, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007421875023283064, "completions/clipped_ratio": 0.0, "completions/max_length": 2720.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 2578.5, "completions/mean_terminated_length": 2578.5, "completions/min_length": 2418.0, "completions/min_terminated_length": 2418.0, "entropy": 0.4290778189897537, "epoch": 0.00142, "frac_reward_zero_std": 0.0, "grad_norm": 2.7589776515960693, "kl": 0.14776458498090506, "learning_rate": 9.999997731487788e-06, "loss": 0.114, "num_tokens": 3697547.0, "reward": 0.012937500141561031, "reward_std": 0.07223817706108093, "rewards/rollout_reward_func/mean": 0.012937500141561031, "rewards/rollout_reward_func/std": 0.07497953623533249, "sampling/importance_sampling_ratio/max": 2.384115695953369, "sampling/importance_sampling_ratio/mean": 1.0900431871414185, "sampling/importance_sampling_ratio/min": 0.3165396749973297, "sampling/sampling_logp_difference/max": 1.0836691856384277, "sampling/sampling_logp_difference/mean": 0.07604964077472687, "step": 71, "step_time": 41.12706470600824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.4288586899638176, "epoch": 0.00144, "grad_norm": 2.7190988063812256, "kl": 0.13816983718425035, "learning_rate": 9.999997600006685e-06, "loss": 0.1101, "step": 72, "step_time": 8.669852263999928 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 2492.09375, "completions/mean_terminated_length": 2492.09375, "completions/min_length": 1328.0, "completions/min_terminated_length": 1328.0, "entropy": 0.3525308780372143, "epoch": 0.00146, "frac_reward_zero_std": 0.0, "grad_norm": 2.7203540802001953, "kl": 0.1596079389564693, "learning_rate": 9.999997464821892e-06, "loss": -0.0414, "num_tokens": 3800398.0, "reward": -0.000937499338760972, "reward_std": 0.16524627804756165, "rewards/rollout_reward_func/mean": -0.000937499338760972, "rewards/rollout_reward_func/std": 0.2613305449485779, "sampling/importance_sampling_ratio/max": 2.968574047088623, "sampling/importance_sampling_ratio/mean": 1.0997503995895386, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0971083641052246, "sampling/sampling_logp_difference/mean": 0.07693592458963394, "step": 73, "step_time": 39.97552160199848 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.3504711091518402, "epoch": 0.00148, "grad_norm": 2.504063129425049, "kl": 0.1592545616440475, "learning_rate": 9.999997325933409e-06, "loss": -0.0468, "step": 74, "step_time": 8.246904959989479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 2496.875, "completions/mean_terminated_length": 2496.875, "completions/min_length": 1654.0, "completions/min_terminated_length": 1654.0, "entropy": 0.35697900131344795, "epoch": 0.0015, "frac_reward_zero_std": 0.0, "grad_norm": 2.647608518600464, "kl": 0.0990099674090743, "learning_rate": 9.999997183341233e-06, "loss": 0.0068, "num_tokens": 3903870.0, "reward": -0.09281250834465027, "reward_std": 0.1840982586145401, "rewards/rollout_reward_func/mean": -0.09281250834465027, "rewards/rollout_reward_func/std": 0.3194171190261841, "sampling/importance_sampling_ratio/max": 2.5169003009796143, "sampling/importance_sampling_ratio/mean": 1.1982736587524414, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.017937421798706, "sampling/sampling_logp_difference/mean": 0.05824393406510353, "step": 75, "step_time": 39.21068185700278 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.015625, "entropy": 0.3578021042048931, "epoch": 0.00152, "grad_norm": 2.5704877376556396, "kl": 0.0889095813035965, "learning_rate": 9.999997037045365e-06, "loss": -0.0003, "step": 76, "step_time": 8.127733456996793 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 2542.59375, "completions/mean_terminated_length": 2542.59375, "completions/min_length": 2408.0, "completions/min_terminated_length": 2408.0, "entropy": 0.3645514212548733, "epoch": 0.00154, "frac_reward_zero_std": 0.0, "grad_norm": 2.7248449325561523, "kl": 0.09846360562369227, "learning_rate": 9.999996887045808e-06, "loss": -0.027, "num_tokens": 4008956.0, "reward": -0.0031249993480741978, "reward_std": 0.06556794792413712, "rewards/rollout_reward_func/mean": -0.0031249993480741978, "rewards/rollout_reward_func/std": 0.0677013099193573, "sampling/importance_sampling_ratio/max": 2.925798177719116, "sampling/importance_sampling_ratio/mean": 1.2111601829528809, "sampling/importance_sampling_ratio/min": 0.17253729701042175, "sampling/sampling_logp_difference/max": 1.6732475757598877, "sampling/sampling_logp_difference/mean": 0.06314364075660706, "step": 77, "step_time": 41.08125996801391 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.015625, "entropy": 0.36346615105867386, "epoch": 0.00156, "grad_norm": 2.5180888175964355, "kl": 0.0976226981729269, "learning_rate": 9.99999673334256e-06, "loss": -0.0306, "step": 78, "step_time": 8.931478123995475 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 2443.71875, "completions/mean_terminated_length": 2443.71875, "completions/min_length": 1264.0, "completions/min_terminated_length": 1264.0, "entropy": 0.37226664274930954, "epoch": 0.00158, "frac_reward_zero_std": 0.0, "grad_norm": 1.6087464094161987, "kl": 0.12296244129538536, "learning_rate": 9.99999657593562e-06, "loss": -0.0621, "num_tokens": 4110538.0, "reward": 0.0046875025145709515, "reward_std": 0.1708545833826065, "rewards/rollout_reward_func/mean": 0.0046875025145709515, "rewards/rollout_reward_func/std": 0.2877609431743622, "sampling/importance_sampling_ratio/max": 2.034745931625366, "sampling/importance_sampling_ratio/mean": 0.9870630502700806, "sampling/importance_sampling_ratio/min": 0.13036122918128967, "sampling/sampling_logp_difference/max": 1.4829730987548828, "sampling/sampling_logp_difference/mean": 0.0670691505074501, "step": 79, "step_time": 39.7622713850069 }, { "clip_ratio/high_max": 0.011600378900766373, "clip_ratio/high_mean": 0.005800189450383186, "clip_ratio/low_mean": 0.016295553650707006, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.02209574286825955, "entropy": 0.3671272359788418, "epoch": 0.0016, "grad_norm": 1.653118371963501, "kl": 0.13659503497183323, "learning_rate": 9.99999641482499e-06, "loss": -0.0661, "step": 80, "step_time": 8.155490196986648 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 2522.65625, "completions/mean_terminated_length": 2522.65625, "completions/min_length": 1634.0, "completions/min_terminated_length": 1634.0, "entropy": 0.32682304456830025, "epoch": 0.00162, "frac_reward_zero_std": 0.0, "grad_norm": 1.6380876302719116, "kl": 0.11479341750964522, "learning_rate": 9.999996250010671e-06, "loss": -0.0309, "num_tokens": 4214814.0, "reward": 0.0560000017285347, "reward_std": 0.2510544955730438, "rewards/rollout_reward_func/mean": 0.0560000017285347, "rewards/rollout_reward_func/std": 0.35488805174827576, "sampling/importance_sampling_ratio/max": 2.08979868888855, "sampling/importance_sampling_ratio/mean": 1.054112434387207, "sampling/importance_sampling_ratio/min": 0.2209986299276352, "sampling/sampling_logp_difference/max": 1.3254785537719727, "sampling/sampling_logp_difference/mean": 0.04853574186563492, "step": 81, "step_time": 39.71311388498725 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.017578125, "entropy": 0.3208409249782562, "epoch": 0.00164, "grad_norm": 1.6973153352737427, "kl": 0.11302056722342968, "learning_rate": 9.999996081492662e-06, "loss": -0.0305, "step": 82, "step_time": 8.233296020996931 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 2485.5625, "completions/mean_terminated_length": 2485.5625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "entropy": 0.34682078659534454, "epoch": 0.00166, "frac_reward_zero_std": 0.0, "grad_norm": 1.4546597003936768, "kl": 0.26240994967520237, "learning_rate": 9.999995909270962e-06, "loss": 0.0636, "num_tokens": 4318021.0, "reward": -0.08749999105930328, "reward_std": 0.24293166399002075, "rewards/rollout_reward_func/mean": -0.08749999105930328, "rewards/rollout_reward_func/std": 0.3317111134529114, "sampling/importance_sampling_ratio/max": 2.534839391708374, "sampling/importance_sampling_ratio/mean": 0.9597312211990356, "sampling/importance_sampling_ratio/min": 0.14286302030086517, "sampling/sampling_logp_difference/max": 1.5682034492492676, "sampling/sampling_logp_difference/mean": 0.07494957745075226, "step": 83, "step_time": 40.59216376800032 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.3458695523440838, "epoch": 0.00168, "grad_norm": 1.3503389358520508, "kl": 0.2947566229850054, "learning_rate": 9.999995733345573e-06, "loss": 0.0628, "step": 84, "step_time": 8.696758408012101 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2717.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 2553.5, "completions/mean_terminated_length": 2553.5, "completions/min_length": 2408.0, "completions/min_terminated_length": 2408.0, "entropy": 0.34082989022135735, "epoch": 0.0017, "frac_reward_zero_std": 0.0, "grad_norm": 1.7237926721572876, "kl": 0.215694485232234, "learning_rate": 9.999995553716494e-06, "loss": -0.0164, "num_tokens": 4423460.0, "reward": -0.0023124990984797478, "reward_std": 0.05388234183192253, "rewards/rollout_reward_func/mean": -0.0023124990984797478, "rewards/rollout_reward_func/std": 0.07007481157779694, "sampling/importance_sampling_ratio/max": 2.3099896907806396, "sampling/importance_sampling_ratio/mean": 0.9558588266372681, "sampling/importance_sampling_ratio/min": 0.09475265443325043, "sampling/sampling_logp_difference/max": 1.84321129322052, "sampling/sampling_logp_difference/mean": 0.06895856559276581, "step": 85, "step_time": 41.61511501099449 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.3418670929968357, "epoch": 0.00172, "grad_norm": 1.680446743965149, "kl": 0.21228844951838255, "learning_rate": 9.999995370383725e-06, "loss": -0.0208, "step": 86, "step_time": 8.200968577024469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2714.0, "completions/max_terminated_length": 2714.0, "completions/mean_length": 2394.21875, "completions/mean_terminated_length": 2394.21875, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "entropy": 0.33990855142474174, "epoch": 0.00174, "frac_reward_zero_std": 0.0, "grad_norm": 1.7447688579559326, "kl": 0.22580334031954408, "learning_rate": 9.999995183347268e-06, "loss": -0.0144, "num_tokens": 4523582.0, "reward": -0.10875000059604645, "reward_std": 0.17445093393325806, "rewards/rollout_reward_func/mean": -0.10875000059604645, "rewards/rollout_reward_func/std": 0.2984124720096588, "sampling/importance_sampling_ratio/max": 1.6535972356796265, "sampling/importance_sampling_ratio/mean": 0.8531072735786438, "sampling/importance_sampling_ratio/min": 0.1560264229774475, "sampling/sampling_logp_difference/max": 1.3071918487548828, "sampling/sampling_logp_difference/mean": 0.06737589836120605, "step": 87, "step_time": 38.91260410401446 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.34269775450229645, "epoch": 0.00176, "grad_norm": 1.6040964126586914, "kl": 0.21285296906717122, "learning_rate": 9.999994992607122e-06, "loss": -0.0147, "step": 88, "step_time": 8.108735417001299 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 2733.0, "completions/max_terminated_length": 2733.0, "completions/mean_length": 2498.4375, "completions/mean_terminated_length": 2498.4375, "completions/min_length": 1149.0, "completions/min_terminated_length": 1149.0, "entropy": 0.41232921928167343, "epoch": 0.00178, "frac_reward_zero_std": 0.0, "grad_norm": 2.217099666595459, "kl": 0.15504596289247274, "learning_rate": 9.999994798163286e-06, "loss": -0.0326, "num_tokens": 4627049.0, "reward": -0.045250002294778824, "reward_std": 0.1063895896077156, "rewards/rollout_reward_func/mean": -0.045250002294778824, "rewards/rollout_reward_func/std": 0.19846490025520325, "sampling/importance_sampling_ratio/max": 1.697726845741272, "sampling/importance_sampling_ratio/mean": 0.9777263402938843, "sampling/importance_sampling_ratio/min": 0.22058866918087006, "sampling/sampling_logp_difference/max": 1.5214389562606812, "sampling/sampling_logp_difference/mean": 0.05762651562690735, "step": 89, "step_time": 41.47546880201844 }, { "clip_ratio/high_max": 0.025240384973585606, "clip_ratio/high_mean": 0.016526442486792803, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026292067486792803, "entropy": 0.40969282388687134, "epoch": 0.0018, "grad_norm": 2.034275770187378, "kl": 0.1437628036364913, "learning_rate": 9.999994600015764e-06, "loss": -0.0348, "step": 90, "step_time": 8.164387071010424 }, { "clip_ratio/high_max": 0.020833333488553762, "clip_ratio/high_mean": 0.012369791744276881, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014322916744276881, "completions/clipped_ratio": 0.0, "completions/max_length": 2689.0, "completions/max_terminated_length": 2689.0, "completions/mean_length": 2442.15625, "completions/mean_terminated_length": 2442.15625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "entropy": 0.3805011622607708, "epoch": 0.00182, "frac_reward_zero_std": 0.0, "grad_norm": 1.4305521249771118, "kl": 0.1604838757775724, "learning_rate": 9.99999439816455e-06, "loss": -0.1169, "num_tokens": 4728945.0, "reward": -0.07343750447034836, "reward_std": 0.24067096412181854, "rewards/rollout_reward_func/mean": -0.07343750447034836, "rewards/rollout_reward_func/std": 0.3149354159832001, "sampling/importance_sampling_ratio/max": 2.548600673675537, "sampling/importance_sampling_ratio/mean": 0.9048938155174255, "sampling/importance_sampling_ratio/min": 0.09455844014883041, "sampling/sampling_logp_difference/max": 1.6043697595596313, "sampling/sampling_logp_difference/mean": 0.0698060691356659, "step": 91, "step_time": 40.740327625004284 }, { "clip_ratio/high_max": 0.024739583488553762, "clip_ratio/high_mean": 0.014322916744276881, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014322916744276881, "entropy": 0.3806532882153988, "epoch": 0.00184, "grad_norm": 1.4468754529953003, "kl": 0.16916062962263823, "learning_rate": 9.999994192609649e-06, "loss": -0.1191, "step": 92, "step_time": 8.171024764007598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2754.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 2552.75, "completions/mean_terminated_length": 2552.75, "completions/min_length": 2287.0, "completions/min_terminated_length": 2287.0, "entropy": 0.3542626202106476, "epoch": 0.00186, "frac_reward_zero_std": 0.0, "grad_norm": 1.5995057821273804, "kl": 0.18657434731721878, "learning_rate": 9.99999398335106e-06, "loss": -0.1028, "num_tokens": 4833871.0, "reward": 0.008937498554587364, "reward_std": 0.05844488739967346, "rewards/rollout_reward_func/mean": 0.008937498554587364, "rewards/rollout_reward_func/std": 0.06495653092861176, "sampling/importance_sampling_ratio/max": 1.8651371002197266, "sampling/importance_sampling_ratio/mean": 0.9047431349754333, "sampling/importance_sampling_ratio/min": 0.2518555521965027, "sampling/sampling_logp_difference/max": 1.338068962097168, "sampling/sampling_logp_difference/mean": 0.05908910930156708, "step": 93, "step_time": 41.447705181002675 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.3570367135107517, "epoch": 0.00188, "grad_norm": 1.3854937553405762, "kl": 0.17741655930876732, "learning_rate": 9.999993770388785e-06, "loss": -0.1059, "step": 94, "step_time": 8.301304157990671 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 2492.8125, "completions/mean_terminated_length": 2492.8125, "completions/min_length": 1657.0, "completions/min_terminated_length": 1657.0, "entropy": 0.3486361838877201, "epoch": 0.0019, "frac_reward_zero_std": 0.0, "grad_norm": 1.6948275566101074, "kl": 0.11026190128177404, "learning_rate": 9.99999355372282e-06, "loss": 0.0319, "num_tokens": 4937685.0, "reward": 0.0045624990016222, "reward_std": 0.16633789241313934, "rewards/rollout_reward_func/mean": 0.0045624990016222, "rewards/rollout_reward_func/std": 0.25424712896347046, "sampling/importance_sampling_ratio/max": 2.6231071949005127, "sampling/importance_sampling_ratio/mean": 0.9236529469490051, "sampling/importance_sampling_ratio/min": 0.2035244256258011, "sampling/sampling_logp_difference/max": 1.0063461065292358, "sampling/sampling_logp_difference/mean": 0.05296621471643448, "step": 95, "step_time": 40.5261353790047 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.010044642956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011997767956927419, "entropy": 0.3515312075614929, "epoch": 0.00192, "grad_norm": 1.670708179473877, "kl": 0.1139525892212987, "learning_rate": 9.999993333353169e-06, "loss": 0.0283, "step": 96, "step_time": 8.195346737004002 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 2518.875, "completions/mean_terminated_length": 2518.875, "completions/min_length": 2312.0, "completions/min_terminated_length": 2312.0, "entropy": 0.3493756130337715, "epoch": 0.00194, "frac_reward_zero_std": 0.0, "grad_norm": 2.074188470840454, "kl": 0.12551532126963139, "learning_rate": 9.999993109279829e-06, "loss": -0.0301, "num_tokens": 5041376.0, "reward": -0.060187503695487976, "reward_std": 0.17671775817871094, "rewards/rollout_reward_func/mean": -0.060187503695487976, "rewards/rollout_reward_func/std": 0.270876407623291, "sampling/importance_sampling_ratio/max": 2.0745737552642822, "sampling/importance_sampling_ratio/mean": 1.0269553661346436, "sampling/importance_sampling_ratio/min": 0.3745565414428711, "sampling/sampling_logp_difference/max": 1.2888622283935547, "sampling/sampling_logp_difference/mean": 0.057082515209913254, "step": 97, "step_time": 40.23683304199949 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.3484783321619034, "epoch": 0.00196, "grad_norm": 1.8363243341445923, "kl": 0.1370929814875126, "learning_rate": 9.999992881502803e-06, "loss": -0.035, "step": 98, "step_time": 8.031628980010282 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2773.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 2592.90625, "completions/mean_terminated_length": 2592.90625, "completions/min_length": 1891.0, "completions/min_terminated_length": 1891.0, "entropy": 0.370455052703619, "epoch": 0.00198, "frac_reward_zero_std": 0.0, "grad_norm": 1.6820168495178223, "kl": 0.14302024198696017, "learning_rate": 9.999992650022092e-06, "loss": -0.1369, "num_tokens": 5148461.0, "reward": -0.013124998658895493, "reward_std": 0.13135413825511932, "rewards/rollout_reward_func/mean": -0.013124998658895493, "rewards/rollout_reward_func/std": 0.21369919180870056, "sampling/importance_sampling_ratio/max": 2.303262710571289, "sampling/importance_sampling_ratio/mean": 0.9167919158935547, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0467338562011719, "sampling/sampling_logp_difference/mean": 0.0629487931728363, "step": 99, "step_time": 40.73835782099923 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.3696703165769577, "epoch": 0.002, "grad_norm": 1.7006328105926514, "kl": 0.13471043622121215, "learning_rate": 9.999992414837692e-06, "loss": -0.1399, "step": 100, "step_time": 9.27685954599292 }, { "clip_ratio/high_max": 0.007359307492151856, "clip_ratio/high_mean": 0.003679653746075928, "clip_ratio/low_mean": 0.007582720601931214, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011262374348007143, "completions/clipped_ratio": 0.0, "completions/max_length": 2720.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 2466.6875, "completions/mean_terminated_length": 2466.6875, "completions/min_length": 1575.0, "completions/min_terminated_length": 1575.0, "entropy": 0.33331993594765663, "epoch": 0.00202, "frac_reward_zero_std": 0.0, "grad_norm": 1.541299819946289, "kl": 0.07826073374599218, "learning_rate": 9.999992175949606e-06, "loss": -0.043, "num_tokens": 5250984.0, "reward": 0.07187500596046448, "reward_std": 0.3101885914802551, "rewards/rollout_reward_func/mean": 0.07187500596046448, "rewards/rollout_reward_func/std": 0.41012537479400635, "sampling/importance_sampling_ratio/max": 1.7896367311477661, "sampling/importance_sampling_ratio/mean": 0.9954925775527954, "sampling/importance_sampling_ratio/min": 0.32469430565834045, "sampling/sampling_logp_difference/max": 1.0306782722473145, "sampling/sampling_logp_difference/mean": 0.046852171421051025, "step": 101, "step_time": 39.81217833299888 }, { "clip_ratio/high_max": 0.01997091481462121, "clip_ratio/high_mean": 0.009985457407310605, "clip_ratio/low_mean": 0.009929753025062382, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.01991521066520363, "entropy": 0.33568425104022026, "epoch": 0.00204, "grad_norm": 1.5253196954727173, "kl": 0.0802880369592458, "learning_rate": 9.999991933357835e-06, "loss": -0.0472, "step": 102, "step_time": 8.12889104099304 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003791360300965607, "completions/clipped_ratio": 0.0, "completions/max_length": 2744.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 2527.25, "completions/mean_terminated_length": 2527.25, "completions/min_length": 1738.0, "completions/min_terminated_length": 1738.0, "entropy": 0.34852392226457596, "epoch": 0.00206, "frac_reward_zero_std": 0.0, "grad_norm": 1.3484529256820679, "kl": 0.14044185122475028, "learning_rate": 9.999991687062379e-06, "loss": -0.2352, "num_tokens": 5355804.0, "reward": -0.03231249749660492, "reward_std": 0.11947058886289597, "rewards/rollout_reward_func/mean": -0.03231249749660492, "rewards/rollout_reward_func/std": 0.195997416973114, "sampling/importance_sampling_ratio/max": 2.030506134033203, "sampling/importance_sampling_ratio/mean": 0.9182260632514954, "sampling/importance_sampling_ratio/min": 0.12404513359069824, "sampling/sampling_logp_difference/max": 1.0576603412628174, "sampling/sampling_logp_difference/mean": 0.053935110569000244, "step": 103, "step_time": 40.22734483000386 }, { "clip_ratio/high_max": 0.008370535913854837, "clip_ratio/high_mean": 0.004185267956927419, "clip_ratio/low_mean": 0.011488970601931214, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015674238558858633, "entropy": 0.34627125039696693, "epoch": 0.00208, "grad_norm": 1.265508770942688, "kl": 0.15802726102992892, "learning_rate": 9.999991437063234e-06, "loss": -0.2393, "step": 104, "step_time": 8.25907339999685 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 2569.4375, "completions/mean_terminated_length": 2569.4375, "completions/min_length": 2413.0, "completions/min_terminated_length": 2413.0, "entropy": 0.3400811739265919, "epoch": 0.0021, "frac_reward_zero_std": 0.0, "grad_norm": 1.734525203704834, "kl": 0.14485831279307604, "learning_rate": 9.999991183360406e-06, "loss": -0.0094, "num_tokens": 5461781.0, "reward": -0.05031249672174454, "reward_std": 0.12431386858224869, "rewards/rollout_reward_func/mean": -0.05031249672174454, "rewards/rollout_reward_func/std": 0.25011590123176575, "sampling/importance_sampling_ratio/max": 2.130937337875366, "sampling/importance_sampling_ratio/mean": 1.0058907270431519, "sampling/importance_sampling_ratio/min": 0.20956012606620789, "sampling/sampling_logp_difference/max": 1.1093370914459229, "sampling/sampling_logp_difference/mean": 0.05707106366753578, "step": 105, "step_time": 41.4205684740009 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.33539753779768944, "epoch": 0.00212, "grad_norm": 1.7177813053131104, "kl": 0.14945369446650147, "learning_rate": 9.999990925953894e-06, "loss": -0.0119, "step": 106, "step_time": 8.722428562010464 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 2511.96875, "completions/mean_terminated_length": 2511.96875, "completions/min_length": 2037.0, "completions/min_terminated_length": 2037.0, "entropy": 0.3455476462841034, "epoch": 0.00214, "frac_reward_zero_std": 0.0, "grad_norm": 1.5704108476638794, "kl": 0.14504612796008587, "learning_rate": 9.999990664843696e-06, "loss": -0.1358, "num_tokens": 5565383.0, "reward": -0.06875000149011612, "reward_std": 0.15667273104190826, "rewards/rollout_reward_func/mean": -0.06875000149011612, "rewards/rollout_reward_func/std": 0.2505837082862854, "sampling/importance_sampling_ratio/max": 1.963279128074646, "sampling/importance_sampling_ratio/mean": 0.8803051114082336, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3123910427093506, "sampling/sampling_logp_difference/mean": 0.06202516704797745, "step": 107, "step_time": 39.60044279100839 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017578125, "entropy": 0.338915441185236, "epoch": 0.00216, "grad_norm": 1.1836293935775757, "kl": 0.18176286108791828, "learning_rate": 9.999990400029814e-06, "loss": -0.1392, "step": 108, "step_time": 8.004781586998433 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2700.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 2556.59375, "completions/mean_terminated_length": 2556.59375, "completions/min_length": 2298.0, "completions/min_terminated_length": 2298.0, "entropy": 0.3399314768612385, "epoch": 0.00218, "frac_reward_zero_std": 0.0, "grad_norm": 1.8767611980438232, "kl": 0.19613626692444086, "learning_rate": 9.999990131512245e-06, "loss": -0.1203, "num_tokens": 5671035.0, "reward": 0.030062498524785042, "reward_std": 0.06542174518108368, "rewards/rollout_reward_func/mean": 0.030062498524785042, "rewards/rollout_reward_func/std": 0.06506668031215668, "sampling/importance_sampling_ratio/max": 2.1095526218414307, "sampling/importance_sampling_ratio/mean": 0.9953502416610718, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2929037809371948, "sampling/sampling_logp_difference/mean": 0.0701940506696701, "step": 109, "step_time": 40.07611987899145 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.007697610184550285, "clip_ratio/low_mean": 0.005744485300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01344209536910057, "entropy": 0.33449552208185196, "epoch": 0.0022, "grad_norm": 1.8064794540405273, "kl": 0.22378786839544773, "learning_rate": 9.999989859290995e-06, "loss": -0.1237, "step": 110, "step_time": 8.101552588006598 }, { "clip_ratio/high_max": 0.022248641354963183, "clip_ratio/high_mean": 0.013077445677481592, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015030570677481592, "completions/clipped_ratio": 0.0, "completions/max_length": 2644.0, "completions/max_terminated_length": 2644.0, "completions/mean_length": 2361.90625, "completions/mean_terminated_length": 2361.90625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.32380833476781845, "epoch": 0.00222, "frac_reward_zero_std": 0.0, "grad_norm": 1.7965515851974487, "kl": 0.2527701100334525, "learning_rate": 9.99998958336606e-06, "loss": -0.067, "num_tokens": 5770467.0, "reward": -0.17624998092651367, "reward_std": 0.35133326053619385, "rewards/rollout_reward_func/mean": -0.17624998092651367, "rewards/rollout_reward_func/std": 0.3963848650455475, "sampling/importance_sampling_ratio/max": 2.2881264686584473, "sampling/importance_sampling_ratio/mean": 0.8733131289482117, "sampling/importance_sampling_ratio/min": 0.2075604349374771, "sampling/sampling_logp_difference/max": 1.3090085983276367, "sampling/sampling_logp_difference/mean": 0.07093898952007294, "step": 111, "step_time": 38.757764058995235 }, { "clip_ratio/high_max": 0.017968750093132257, "clip_ratio/high_mean": 0.010937499813735485, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014843749813735485, "entropy": 0.32395435497164726, "epoch": 0.00224, "grad_norm": 1.7358996868133545, "kl": 0.27141671627759933, "learning_rate": 9.999989303737442e-06, "loss": -0.0699, "step": 112, "step_time": 8.030561457002477 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2722.0, "completions/max_terminated_length": 2722.0, "completions/mean_length": 2539.1875, "completions/mean_terminated_length": 2539.1875, "completions/min_length": 2354.0, "completions/min_terminated_length": 2354.0, "entropy": 0.3293020986020565, "epoch": 0.00226, "frac_reward_zero_std": 0.0, "grad_norm": 1.9367733001708984, "kl": 0.30573761742562056, "learning_rate": 9.999989020405141e-06, "loss": 0.0804, "num_tokens": 5875527.0, "reward": -0.009312499314546585, "reward_std": 0.06325964629650116, "rewards/rollout_reward_func/mean": -0.009312499314546585, "rewards/rollout_reward_func/std": 0.07523487508296967, "sampling/importance_sampling_ratio/max": 2.04264760017395, "sampling/importance_sampling_ratio/mean": 0.9071935415267944, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2561190128326416, "sampling/sampling_logp_difference/mean": 0.06177542358636856, "step": 113, "step_time": 39.97751610999694 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.33135970309376717, "epoch": 0.00228, "grad_norm": 1.8863413333892822, "kl": 0.2953490880317986, "learning_rate": 9.999988733369157e-06, "loss": 0.08, "step": 114, "step_time": 8.203803888012771 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 2594.53125, "completions/mean_terminated_length": 2594.53125, "completions/min_length": 2435.0, "completions/min_terminated_length": 2435.0, "entropy": 0.312834270298481, "epoch": 0.0023, "frac_reward_zero_std": 0.0, "grad_norm": 1.5824787616729736, "kl": 0.19389449525624514, "learning_rate": 9.999988442629489e-06, "loss": 0.0764, "num_tokens": 5982430.0, "reward": 0.011749999597668648, "reward_std": 0.06366527080535889, "rewards/rollout_reward_func/mean": 0.011749999597668648, "rewards/rollout_reward_func/std": 0.06497592478990555, "sampling/importance_sampling_ratio/max": 2.5081026554107666, "sampling/importance_sampling_ratio/mean": 1.0248709917068481, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7471835613250732, "sampling/sampling_logp_difference/mean": 0.06724181771278381, "step": 115, "step_time": 40.59423167099885 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.01953125, "entropy": 0.3156875427812338, "epoch": 0.00232, "grad_norm": 1.6595027446746826, "kl": 0.18415676709264517, "learning_rate": 9.99998814818614e-06, "loss": 0.074, "step": 116, "step_time": 8.306354760017712 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 2564.59375, "completions/mean_terminated_length": 2564.59375, "completions/min_length": 2383.0, "completions/min_terminated_length": 2383.0, "entropy": 0.3278175815939903, "epoch": 0.00234, "frac_reward_zero_std": 0.0, "grad_norm": 1.980755090713501, "kl": 0.09133347170427442, "learning_rate": 9.999987850039108e-06, "loss": -0.0312, "num_tokens": 6088349.0, "reward": 0.004375000484287739, "reward_std": 0.041993025690317154, "rewards/rollout_reward_func/mean": 0.004375000484287739, "rewards/rollout_reward_func/std": 0.04905148595571518, "sampling/importance_sampling_ratio/max": 2.285834789276123, "sampling/importance_sampling_ratio/mean": 1.0151498317718506, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7270146608352661, "sampling/sampling_logp_difference/mean": 0.0533914715051651, "step": 117, "step_time": 41.6384668170067 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3312051221728325, "epoch": 0.00236, "grad_norm": 2.001918077468872, "kl": 0.09348489623516798, "learning_rate": 9.999987548188395e-06, "loss": -0.034, "step": 118, "step_time": 8.222705290994782 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2696.0, "completions/max_terminated_length": 2696.0, "completions/mean_length": 2553.4375, "completions/mean_terminated_length": 2553.4375, "completions/min_length": 2060.0, "completions/min_terminated_length": 2060.0, "entropy": 0.32508161664009094, "epoch": 0.00238, "frac_reward_zero_std": 0.0, "grad_norm": 1.7715137004852295, "kl": 0.14226396940648556, "learning_rate": 9.999987242634e-06, "loss": -0.0903, "num_tokens": 6193617.0, "reward": -0.0234375, "reward_std": 0.10268107056617737, "rewards/rollout_reward_func/mean": -0.0234375, "rewards/rollout_reward_func/std": 0.17849929630756378, "sampling/importance_sampling_ratio/max": 2.5179085731506348, "sampling/importance_sampling_ratio/mean": 1.0467031002044678, "sampling/importance_sampling_ratio/min": 0.23064902424812317, "sampling/sampling_logp_difference/max": 1.435530662536621, "sampling/sampling_logp_difference/mean": 0.052883878350257874, "step": 119, "step_time": 40.86630449297809 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.32609760016202927, "epoch": 0.0024, "grad_norm": 1.6669055223464966, "kl": 0.14041694393381476, "learning_rate": 9.999986933375924e-06, "loss": -0.0943, "step": 120, "step_time": 8.225507424009265 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2762.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 2509.9375, "completions/mean_terminated_length": 2509.9375, "completions/min_length": 1604.0, "completions/min_terminated_length": 1604.0, "entropy": 0.34493784606456757, "epoch": 0.00242, "frac_reward_zero_std": 0.0, "grad_norm": 1.7307958602905273, "kl": 0.20689602987840772, "learning_rate": 9.999986620414169e-06, "loss": -0.028, "num_tokens": 6297378.0, "reward": 0.004999998956918716, "reward_std": 0.1766255795955658, "rewards/rollout_reward_func/mean": 0.004999998956918716, "rewards/rollout_reward_func/std": 0.28026482462882996, "sampling/importance_sampling_ratio/max": 2.4770820140838623, "sampling/importance_sampling_ratio/mean": 1.0248966217041016, "sampling/importance_sampling_ratio/min": 0.1811486929655075, "sampling/sampling_logp_difference/max": 1.7409013509750366, "sampling/sampling_logp_difference/mean": 0.06154333055019379, "step": 121, "step_time": 39.13895379099267 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.008091517956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011997767724096775, "entropy": 0.34469663724303246, "epoch": 0.00244, "grad_norm": 1.7192891836166382, "kl": 0.21292951330542564, "learning_rate": 9.999986303748731e-06, "loss": -0.0298, "step": 122, "step_time": 9.244061531986517 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2753.0, "completions/max_terminated_length": 2753.0, "completions/mean_length": 2531.1875, "completions/mean_terminated_length": 2531.1875, "completions/min_length": 1462.0, "completions/min_terminated_length": 1462.0, "entropy": 0.33442598581314087, "epoch": 0.00246, "frac_reward_zero_std": 0.0, "grad_norm": 1.6036715507507324, "kl": 0.15767440851777792, "learning_rate": 9.999985983379614e-06, "loss": -0.0708, "num_tokens": 6402259.0, "reward": -0.02025000751018524, "reward_std": 0.12010502070188522, "rewards/rollout_reward_func/mean": -0.02025000751018524, "rewards/rollout_reward_func/std": 0.2211160957813263, "sampling/importance_sampling_ratio/max": 2.1161069869995117, "sampling/importance_sampling_ratio/mean": 0.9873582720756531, "sampling/importance_sampling_ratio/min": 0.13119953870773315, "sampling/sampling_logp_difference/max": 1.1497358083724976, "sampling/sampling_logp_difference/mean": 0.06182745844125748, "step": 123, "step_time": 40.470048752991715 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.33676064386963844, "epoch": 0.00248, "grad_norm": 1.6138527393341064, "kl": 0.1523943403735757, "learning_rate": 9.999985659306817e-06, "loss": -0.0723, "step": 124, "step_time": 8.273418343997037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "completions/clipped_ratio": 0.0, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 2546.28125, "completions/mean_terminated_length": 2546.28125, "completions/min_length": 1617.0, "completions/min_terminated_length": 1617.0, "entropy": 0.3453407287597656, "epoch": 0.0025, "frac_reward_zero_std": 0.0, "grad_norm": 2.0110528469085693, "kl": 0.170327290892601, "learning_rate": 9.999985331530339e-06, "loss": -0.1445, "num_tokens": 6507286.0, "reward": 0.050312504172325134, "reward_std": 0.11686505377292633, "rewards/rollout_reward_func/mean": 0.050312504172325134, "rewards/rollout_reward_func/std": 0.19619746506214142, "sampling/importance_sampling_ratio/max": 2.439424991607666, "sampling/importance_sampling_ratio/mean": 0.8885781764984131, "sampling/importance_sampling_ratio/min": 0.27067622542381287, "sampling/sampling_logp_difference/max": 1.2129557132720947, "sampling/sampling_logp_difference/mean": 0.059535570442676544, "step": 125, "step_time": 40.06854912801646 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.3458435758948326, "epoch": 0.00252, "grad_norm": 2.0043210983276367, "kl": 0.19776208233088255, "learning_rate": 9.999985000050181e-06, "loss": -0.1484, "step": 126, "step_time": 8.200360839000496 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 2440.0625, "completions/mean_terminated_length": 2440.0625, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "entropy": 0.3563276380300522, "epoch": 0.00254, "frac_reward_zero_std": 0.125, "grad_norm": 1.6714493036270142, "kl": 0.10458008944988251, "learning_rate": 9.999984664866347e-06, "loss": -0.0615, "num_tokens": 6608666.0, "reward": -0.027812499552965164, "reward_std": 0.24362991750240326, "rewards/rollout_reward_func/mean": -0.027812499552965164, "rewards/rollout_reward_func/std": 0.3529391288757324, "sampling/importance_sampling_ratio/max": 2.106961727142334, "sampling/importance_sampling_ratio/mean": 1.0129467248916626, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7839922904968262, "sampling/sampling_logp_difference/mean": 0.05322550982236862, "step": 127, "step_time": 38.439512436008954 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.35617511719465256, "epoch": 0.00256, "grad_norm": 1.5934431552886963, "kl": 0.10316136293113232, "learning_rate": 9.999984325978833e-06, "loss": -0.0651, "step": 128, "step_time": 8.525172622001264 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 2538.875, "completions/mean_terminated_length": 2538.875, "completions/min_length": 2183.0, "completions/min_terminated_length": 2183.0, "entropy": 0.3401091955602169, "epoch": 0.00258, "frac_reward_zero_std": 0.0, "grad_norm": 1.5359450578689575, "kl": 0.23094982374459505, "learning_rate": 9.99998398338764e-06, "loss": 0.0811, "num_tokens": 6713463.0, "reward": 0.006437500007450581, "reward_std": 0.045757949352264404, "rewards/rollout_reward_func/mean": 0.006437500007450581, "rewards/rollout_reward_func/std": 0.06285206973552704, "sampling/importance_sampling_ratio/max": 1.7960189580917358, "sampling/importance_sampling_ratio/mean": 0.829826831817627, "sampling/importance_sampling_ratio/min": 0.19573816657066345, "sampling/sampling_logp_difference/max": 1.1058683395385742, "sampling/sampling_logp_difference/mean": 0.06072615832090378, "step": 129, "step_time": 39.87553574399499 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.021484375, "entropy": 0.3394704796373844, "epoch": 0.0026, "grad_norm": 1.2305470705032349, "kl": 0.20992374047636986, "learning_rate": 9.99998363709277e-06, "loss": 0.0796, "step": 130, "step_time": 8.169040031993063 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2687.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 2560.53125, "completions/mean_terminated_length": 2560.53125, "completions/min_length": 2391.0, "completions/min_terminated_length": 2391.0, "entropy": 0.3369971923530102, "epoch": 0.00262, "frac_reward_zero_std": 0.0, "grad_norm": 1.8376662731170654, "kl": 0.6742809629067779, "learning_rate": 9.999983287094222e-06, "loss": -0.1498, "num_tokens": 6818761.0, "reward": -0.012437498196959496, "reward_std": 0.10733288526535034, "rewards/rollout_reward_func/mean": -0.012437498196959496, "rewards/rollout_reward_func/std": 0.18053805828094482, "sampling/importance_sampling_ratio/max": 2.5355255603790283, "sampling/importance_sampling_ratio/mean": 0.9010251760482788, "sampling/importance_sampling_ratio/min": 0.15409857034683228, "sampling/sampling_logp_difference/max": 1.7746939659118652, "sampling/sampling_logp_difference/mean": 0.06549885869026184, "step": 131, "step_time": 40.05877101999795 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017578125, "entropy": 0.3364224396646023, "epoch": 0.00264, "grad_norm": 1.5281078815460205, "kl": 0.5020047454163432, "learning_rate": 9.999982933391998e-06, "loss": -0.1526, "step": 132, "step_time": 8.129041669002618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 2431.9375, "completions/mean_terminated_length": 2431.9375, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "entropy": 0.3256659470498562, "epoch": 0.00266, "frac_reward_zero_std": 0.0, "grad_norm": 1.5615955591201782, "kl": 0.11903299344703555, "learning_rate": 9.999982575986095e-06, "loss": -0.1114, "num_tokens": 6919971.0, "reward": 0.049687501043081284, "reward_std": 0.26549428701400757, "rewards/rollout_reward_func/mean": 0.049687501043081284, "rewards/rollout_reward_func/std": 0.3750826418399811, "sampling/importance_sampling_ratio/max": 2.790414571762085, "sampling/importance_sampling_ratio/mean": 1.039184808731079, "sampling/importance_sampling_ratio/min": 0.362961083650589, "sampling/sampling_logp_difference/max": 0.7517553567886353, "sampling/sampling_logp_difference/mean": 0.05017857998609543, "step": 133, "step_time": 40.47903727200901 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.32507436349987984, "epoch": 0.00268, "grad_norm": 1.4362146854400635, "kl": 0.11510731559246778, "learning_rate": 9.999982214876516e-06, "loss": -0.1129, "step": 134, "step_time": 8.01312270500057 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 2670.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 2546.34375, "completions/mean_terminated_length": 2546.34375, "completions/min_length": 2309.0, "completions/min_terminated_length": 2309.0, "entropy": 0.340265478938818, "epoch": 0.0027, "frac_reward_zero_std": 0.0, "grad_norm": 2.0948877334594727, "kl": 0.26178126875311136, "learning_rate": 9.999981850063262e-06, "loss": 0.0991, "num_tokens": 7025087.0, "reward": 0.0013749999925494194, "reward_std": 0.06908594816923141, "rewards/rollout_reward_func/mean": 0.0013749999925494194, "rewards/rollout_reward_func/std": 0.07447785884141922, "sampling/importance_sampling_ratio/max": 2.6498475074768066, "sampling/importance_sampling_ratio/mean": 1.0591254234313965, "sampling/importance_sampling_ratio/min": 0.1885811686515808, "sampling/sampling_logp_difference/max": 1.3395788669586182, "sampling/sampling_logp_difference/mean": 0.05880427360534668, "step": 135, "step_time": 41.25988252200477 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125, "entropy": 0.3363325670361519, "epoch": 0.00272, "grad_norm": 1.810211181640625, "kl": 0.27428784500807524, "learning_rate": 9.99998148154633e-06, "loss": 0.0962, "step": 136, "step_time": 8.050127039008657 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2729.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 2506.28125, "completions/mean_terminated_length": 2506.28125, "completions/min_length": 2034.0, "completions/min_terminated_length": 2034.0, "entropy": 0.3250386714935303, "epoch": 0.00274, "frac_reward_zero_std": 0.0, "grad_norm": 1.5487583875656128, "kl": 0.2483032587915659, "learning_rate": 9.999981109325725e-06, "loss": -0.0189, "num_tokens": 7129006.0, "reward": -0.02031249925494194, "reward_std": 0.10975626111030579, "rewards/rollout_reward_func/mean": -0.02031249925494194, "rewards/rollout_reward_func/std": 0.1898808479309082, "sampling/importance_sampling_ratio/max": 1.8540207147598267, "sampling/importance_sampling_ratio/mean": 0.9523411989212036, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.999842643737793, "sampling/sampling_logp_difference/mean": 0.048174045979976654, "step": 137, "step_time": 40.41093556799751 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.32194047793745995, "epoch": 0.00276, "grad_norm": 1.530614972114563, "kl": 0.25904428493231535, "learning_rate": 9.999980733401442e-06, "loss": -0.0239, "step": 138, "step_time": 8.193952220994106 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2705.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 2533.875, "completions/mean_terminated_length": 2533.875, "completions/min_length": 2022.0, "completions/min_terminated_length": 2022.0, "entropy": 0.34514220058918, "epoch": 0.00278, "frac_reward_zero_std": 0.0, "grad_norm": 1.8153398036956787, "kl": 0.20840087812393904, "learning_rate": 9.999980353773486e-06, "loss": -0.0015, "num_tokens": 7233542.0, "reward": -0.022499997168779373, "reward_std": 0.11739690601825714, "rewards/rollout_reward_func/mean": -0.022499997168779373, "rewards/rollout_reward_func/std": 0.20528501272201538, "sampling/importance_sampling_ratio/max": 2.392735242843628, "sampling/importance_sampling_ratio/mean": 0.9636785984039307, "sampling/importance_sampling_ratio/min": 0.17225754261016846, "sampling/sampling_logp_difference/max": 1.1478333473205566, "sampling/sampling_logp_difference/mean": 0.05775437504053116, "step": 139, "step_time": 41.039759424988006 }, { "clip_ratio/high_max": 0.023697916883975267, "clip_ratio/high_mean": 0.01575520820915699, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01770833320915699, "entropy": 0.34427405893802643, "epoch": 0.0028, "grad_norm": 1.8111648559570312, "kl": 0.20970841869711876, "learning_rate": 9.999979970441856e-06, "loss": -0.0059, "step": 140, "step_time": 8.234236395001062 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004036458441987634, "completions/clipped_ratio": 0.0, "completions/max_length": 2671.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 2401.71875, "completions/mean_terminated_length": 2401.71875, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "entropy": 0.30447014793753624, "epoch": 0.00282, "frac_reward_zero_std": 0.125, "grad_norm": 1.1894261837005615, "kl": 0.3195039564743638, "learning_rate": 9.999979583406551e-06, "loss": 0.0382, "num_tokens": 7333627.0, "reward": 0.06937500089406967, "reward_std": 0.3690631687641144, "rewards/rollout_reward_func/mean": 0.06937500089406967, "rewards/rollout_reward_func/std": 0.46480610966682434, "sampling/importance_sampling_ratio/max": 2.1372768878936768, "sampling/importance_sampling_ratio/mean": 0.9657130837440491, "sampling/importance_sampling_ratio/min": 0.05373276770114899, "sampling/sampling_logp_difference/max": 1.0698232650756836, "sampling/sampling_logp_difference/mean": 0.05248340219259262, "step": 141, "step_time": 37.874881055002334 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.3037009723484516, "epoch": 0.00284, "grad_norm": 1.1916241645812988, "kl": 0.2727260245010257, "learning_rate": 9.999979192667574e-06, "loss": 0.0377, "step": 142, "step_time": 7.981406920000154 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2700.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 2406.1875, "completions/mean_terminated_length": 2406.1875, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "entropy": 0.2937978897243738, "epoch": 0.00286, "frac_reward_zero_std": 0.0, "grad_norm": 1.4525103569030762, "kl": 0.12783690728247166, "learning_rate": 9.999978798224922e-06, "loss": -0.1101, "num_tokens": 7433724.0, "reward": -0.04500000178813934, "reward_std": 0.307174950838089, "rewards/rollout_reward_func/mean": -0.04500000178813934, "rewards/rollout_reward_func/std": 0.41188785433769226, "sampling/importance_sampling_ratio/max": 2.6429364681243896, "sampling/importance_sampling_ratio/mean": 1.0415544509887695, "sampling/importance_sampling_ratio/min": 0.36408287286758423, "sampling/sampling_logp_difference/max": 1.1776275634765625, "sampling/sampling_logp_difference/mean": 0.0455252081155777, "step": 143, "step_time": 39.24893150100979 }, { "clip_ratio/high_max": 0.012276785913854837, "clip_ratio/high_mean": 0.006138392956927419, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008091517956927419, "entropy": 0.2940533272922039, "epoch": 0.00288, "grad_norm": 1.2668437957763672, "kl": 0.1362289022654295, "learning_rate": 9.999978400078598e-06, "loss": -0.113, "step": 144, "step_time": 8.001272381996387 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2717.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 2491.625, "completions/mean_terminated_length": 2491.625, "completions/min_length": 1870.0, "completions/min_terminated_length": 1870.0, "entropy": 0.32774606347084045, "epoch": 0.0029, "frac_reward_zero_std": 0.0, "grad_norm": 1.4663074016571045, "kl": 0.2525828080251813, "learning_rate": 9.9999779982286e-06, "loss": -0.1612, "num_tokens": 7537148.0, "reward": -0.03437500074505806, "reward_std": 0.12437704205513, "rewards/rollout_reward_func/mean": -0.03437500074505806, "rewards/rollout_reward_func/std": 0.20053055882453918, "sampling/importance_sampling_ratio/max": 2.752513885498047, "sampling/importance_sampling_ratio/mean": 0.8781605958938599, "sampling/importance_sampling_ratio/min": 0.1707814782857895, "sampling/sampling_logp_difference/max": 0.9282255172729492, "sampling/sampling_logp_difference/mean": 0.05459430813789368, "step": 145, "step_time": 41.338485607011535 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.3271145783364773, "epoch": 0.00292, "grad_norm": 1.5428147315979004, "kl": 0.2190783964470029, "learning_rate": 9.999977592674933e-06, "loss": -0.162, "step": 146, "step_time": 8.144251859994256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2694.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 2471.40625, "completions/mean_terminated_length": 2471.40625, "completions/min_length": 1665.0, "completions/min_terminated_length": 1665.0, "entropy": 0.2840565647929907, "epoch": 0.00294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5519449710845947, "kl": 0.3378815334290266, "learning_rate": 9.999977183417593e-06, "loss": -0.0958, "num_tokens": 7640114.0, "reward": 0.03281249850988388, "reward_std": 0.15429173409938812, "rewards/rollout_reward_func/mean": 0.03281249850988388, "rewards/rollout_reward_func/std": 0.2244165688753128, "sampling/importance_sampling_ratio/max": 2.7828152179718018, "sampling/importance_sampling_ratio/mean": 1.010582685470581, "sampling/importance_sampling_ratio/min": 0.2255764752626419, "sampling/sampling_logp_difference/max": 1.4769656658172607, "sampling/sampling_logp_difference/mean": 0.05251479893922806, "step": 147, "step_time": 39.98198530799709 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.009765625, "entropy": 0.28489716723561287, "epoch": 0.00296, "grad_norm": 1.4790749549865723, "kl": 0.3447922170162201, "learning_rate": 9.999976770456581e-06, "loss": -0.0975, "step": 148, "step_time": 8.059031150987721 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2717.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 2457.5625, "completions/mean_terminated_length": 2457.5625, "completions/min_length": 1340.0, "completions/min_terminated_length": 1340.0, "entropy": 0.330892838537693, "epoch": 0.00298, "frac_reward_zero_std": 0.0, "grad_norm": 1.9201385974884033, "kl": 0.13803628273308277, "learning_rate": 9.999976353791898e-06, "loss": 0.0257, "num_tokens": 7742168.0, "reward": -0.06718750298023224, "reward_std": 0.21596568822860718, "rewards/rollout_reward_func/mean": -0.06718750298023224, "rewards/rollout_reward_func/std": 0.3021240234375, "sampling/importance_sampling_ratio/max": 2.435490369796753, "sampling/importance_sampling_ratio/mean": 1.1018002033233643, "sampling/importance_sampling_ratio/min": 0.3883662223815918, "sampling/sampling_logp_difference/max": 0.6033191680908203, "sampling/sampling_logp_difference/mean": 0.05041055008769035, "step": 149, "step_time": 39.8079579510013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3302891068160534, "epoch": 0.003, "grad_norm": 1.9833990335464478, "kl": 0.14259828627109528, "learning_rate": 9.999975933423546e-06, "loss": 0.022, "step": 150, "step_time": 8.779339686996536 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 2507.3125, "completions/mean_terminated_length": 2507.3125, "completions/min_length": 1950.0, "completions/min_terminated_length": 1950.0, "entropy": 0.3101547583937645, "epoch": 0.00302, "frac_reward_zero_std": 0.0, "grad_norm": 1.518326759338379, "kl": 0.200548829510808, "learning_rate": 9.999975509351522e-06, "loss": -0.0199, "num_tokens": 7846003.0, "reward": 0.09437499940395355, "reward_std": 0.1686856895685196, "rewards/rollout_reward_func/mean": 0.09437499940395355, "rewards/rollout_reward_func/std": 0.26759305596351624, "sampling/importance_sampling_ratio/max": 2.2425479888916016, "sampling/importance_sampling_ratio/mean": 0.9671791791915894, "sampling/importance_sampling_ratio/min": 0.1983872652053833, "sampling/sampling_logp_difference/max": 0.8811240196228027, "sampling/sampling_logp_difference/mean": 0.056956760585308075, "step": 151, "step_time": 39.684381083003245 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0037913601845502853, "clip_ratio/low_min": 0.0036764706019312143, "clip_ratio/region_mean": 0.005744485184550285, "entropy": 0.3110111430287361, "epoch": 0.00304, "grad_norm": 1.555120587348938, "kl": 0.21855803951621056, "learning_rate": 9.99997508157583e-06, "loss": -0.0236, "step": 152, "step_time": 8.083289796006284 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2671.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 2501.78125, "completions/mean_terminated_length": 2501.78125, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "entropy": 0.30790841951966286, "epoch": 0.00306, "frac_reward_zero_std": 0.0, "grad_norm": 1.939740777015686, "kl": 0.13009591028094292, "learning_rate": 9.999974650096467e-06, "loss": -0.0409, "num_tokens": 7949583.0, "reward": -0.06224999576807022, "reward_std": 0.18093439936637878, "rewards/rollout_reward_func/mean": -0.06224999576807022, "rewards/rollout_reward_func/std": 0.2806392312049866, "sampling/importance_sampling_ratio/max": 2.5082690715789795, "sampling/importance_sampling_ratio/mean": 1.0172232389450073, "sampling/importance_sampling_ratio/min": 0.11460259556770325, "sampling/sampling_logp_difference/max": 1.0105078220367432, "sampling/sampling_logp_difference/mean": 0.05251069366931915, "step": 153, "step_time": 39.95415290000528 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0071614584885537624, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.009114583488553762, "entropy": 0.30542241409420967, "epoch": 0.00308, "grad_norm": 1.721543312072754, "kl": 0.1375604411587119, "learning_rate": 9.999974214913438e-06, "loss": -0.0413, "step": 154, "step_time": 8.1919258509879 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 2505.8125, "completions/mean_terminated_length": 2505.8125, "completions/min_length": 2028.0, "completions/min_terminated_length": 2028.0, "entropy": 0.31059348583221436, "epoch": 0.0031, "frac_reward_zero_std": 0.0, "grad_norm": 2.3106508255004883, "kl": 0.2599359452724457, "learning_rate": 9.99997377602674e-06, "loss": -0.0716, "num_tokens": 8053769.0, "reward": 0.03125, "reward_std": 0.1753545105457306, "rewards/rollout_reward_func/mean": 0.03125, "rewards/rollout_reward_func/std": 0.2829064428806305, "sampling/importance_sampling_ratio/max": 2.14701509475708, "sampling/importance_sampling_ratio/mean": 0.9985713362693787, "sampling/importance_sampling_ratio/min": 0.2423524558544159, "sampling/sampling_logp_difference/max": 0.7375087738037109, "sampling/sampling_logp_difference/mean": 0.05255991220474243, "step": 155, "step_time": 39.86996693698893 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.31180923245847225, "epoch": 0.00312, "grad_norm": 2.528000593185425, "kl": 0.2466150252148509, "learning_rate": 9.999973333436373e-06, "loss": -0.0732, "step": 156, "step_time": 8.722005063995312 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "completions/clipped_ratio": 0.0, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 2445.84375, "completions/mean_terminated_length": 2445.84375, "completions/min_length": 1298.0, "completions/min_terminated_length": 1298.0, "entropy": 0.3062910810112953, "epoch": 0.00314, "frac_reward_zero_std": 0.0, "grad_norm": 1.596366047859192, "kl": 0.2805585265159607, "learning_rate": 9.999972887142338e-06, "loss": 0.0232, "num_tokens": 8155656.0, "reward": -0.050624996423721313, "reward_std": 0.18446293473243713, "rewards/rollout_reward_func/mean": -0.050624996423721313, "rewards/rollout_reward_func/std": 0.28230544924736023, "sampling/importance_sampling_ratio/max": 1.973318099975586, "sampling/importance_sampling_ratio/mean": 0.9094000458717346, "sampling/importance_sampling_ratio/min": 0.2735903561115265, "sampling/sampling_logp_difference/max": 1.0791234970092773, "sampling/sampling_logp_difference/mean": 0.059729140251874924, "step": 157, "step_time": 38.55100798999047 }, { "clip_ratio/high_max": 0.028245192486792803, "clip_ratio/high_mean": 0.0219350962433964, "clip_ratio/low_mean": 0.008263221010565758, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.030198317021131516, "entropy": 0.30654633045196533, "epoch": 0.00316, "grad_norm": 1.384443759918213, "kl": 0.27098785899579525, "learning_rate": 9.999972437144638e-06, "loss": 0.0171, "step": 158, "step_time": 8.019240472996898 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 2569.59375, "completions/mean_terminated_length": 2569.59375, "completions/min_length": 2401.0, "completions/min_terminated_length": 2401.0, "entropy": 0.3133653476834297, "epoch": 0.00318, "frac_reward_zero_std": 0.0, "grad_norm": 1.126905918121338, "kl": 0.4180504083633423, "learning_rate": 9.99997198344327e-06, "loss": -0.0072, "num_tokens": 8261326.0, "reward": 0.022437501698732376, "reward_std": 0.04600679874420166, "rewards/rollout_reward_func/mean": 0.022437501698732376, "rewards/rollout_reward_func/std": 0.06265468150377274, "sampling/importance_sampling_ratio/max": 1.8128973245620728, "sampling/importance_sampling_ratio/mean": 0.8245859146118164, "sampling/importance_sampling_ratio/min": 0.15137887001037598, "sampling/sampling_logp_difference/max": 1.5356993675231934, "sampling/sampling_logp_difference/mean": 0.06314520537853241, "step": 159, "step_time": 40.667295111990825 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.3113729450851679, "epoch": 0.0032, "grad_norm": 1.070225715637207, "kl": 0.386994413100183, "learning_rate": 9.999971526038236e-06, "loss": -0.0099, "step": 160, "step_time": 8.18899186798808 }, { "clip_ratio/high_max": 0.014436141354963183, "clip_ratio/high_mean": 0.007218070677481592, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009171195677481592, "completions/clipped_ratio": 0.0, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 2458.0, "completions/mean_terminated_length": 2458.0, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "entropy": 0.28068962320685387, "epoch": 0.00322, "frac_reward_zero_std": 0.0, "grad_norm": 1.5223960876464844, "kl": 0.1943189101293683, "learning_rate": 9.999971064929537e-06, "loss": 0.0709, "num_tokens": 8363537.0, "reward": -0.04000000283122063, "reward_std": 0.17396603524684906, "rewards/rollout_reward_func/mean": -0.04000000283122063, "rewards/rollout_reward_func/std": 0.27076953649520874, "sampling/importance_sampling_ratio/max": 2.4551634788513184, "sampling/importance_sampling_ratio/mean": 0.9484375715255737, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0188813209533691, "sampling/sampling_logp_difference/mean": 0.058578573167324066, "step": 161, "step_time": 41.353516772003786 }, { "clip_ratio/high_max": 0.010529891354963183, "clip_ratio/high_mean": 0.005264945677481592, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007218070677481592, "entropy": 0.27924087457358837, "epoch": 0.00324, "grad_norm": 1.2550345659255981, "kl": 0.21171990223228931, "learning_rate": 9.999970600117172e-06, "loss": 0.0684, "step": 162, "step_time": 8.227772283993545 }, { "clip_ratio/high_max": 0.017968750093132257, "clip_ratio/high_mean": 0.008984375046566129, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "completions/clipped_ratio": 0.0, "completions/max_length": 2698.0, "completions/max_terminated_length": 2698.0, "completions/mean_length": 2467.15625, "completions/mean_terminated_length": 2467.15625, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "entropy": 0.29255248606204987, "epoch": 0.00326, "frac_reward_zero_std": 0.0, "grad_norm": 1.8392517566680908, "kl": 0.22120904922485352, "learning_rate": 9.999970131601143e-06, "loss": -0.0554, "num_tokens": 8466834.0, "reward": 0.02281249687075615, "reward_std": 0.17198839783668518, "rewards/rollout_reward_func/mean": 0.02281249687075615, "rewards/rollout_reward_func/std": 0.27481648325920105, "sampling/importance_sampling_ratio/max": 2.9966037273406982, "sampling/importance_sampling_ratio/mean": 1.031958818435669, "sampling/importance_sampling_ratio/min": 0.21707558631896973, "sampling/sampling_logp_difference/max": 1.1190013885498047, "sampling/sampling_logp_difference/mean": 0.0631747841835022, "step": 163, "step_time": 40.34314790100325 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.008091517724096775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008091517724096775, "entropy": 0.29565080255270004, "epoch": 0.00328, "grad_norm": 1.9357725381851196, "kl": 0.2302815355360508, "learning_rate": 9.99996965938145e-06, "loss": -0.0556, "step": 164, "step_time": 8.167869026008702 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 2552.15625, "completions/mean_terminated_length": 2552.15625, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "entropy": 0.30549056455492973, "epoch": 0.0033, "frac_reward_zero_std": 0.0, "grad_norm": 2.3971192836761475, "kl": 0.3436704585328698, "learning_rate": 9.999969183458093e-06, "loss": -0.0144, "num_tokens": 8572678.0, "reward": 0.005937501788139343, "reward_std": 0.11331214010715485, "rewards/rollout_reward_func/mean": 0.005937501788139343, "rewards/rollout_reward_func/std": 0.19420406222343445, "sampling/importance_sampling_ratio/max": 2.7245781421661377, "sampling/importance_sampling_ratio/mean": 0.9707354307174683, "sampling/importance_sampling_ratio/min": 0.354126900434494, "sampling/sampling_logp_difference/max": 1.1669249534606934, "sampling/sampling_logp_difference/mean": 0.06437948346138, "step": 165, "step_time": 40.59159922699473 }, { "clip_ratio/high_max": 0.026154891354963183, "clip_ratio/high_mean": 0.013077445677481592, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02088994567748159, "entropy": 0.30958276242017746, "epoch": 0.00332, "grad_norm": 2.078913450241089, "kl": 0.30393845308572054, "learning_rate": 9.999968703831072e-06, "loss": -0.0176, "step": 166, "step_time": 8.28843492600572 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.013671875, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2732.0, "completions/max_terminated_length": 2732.0, "completions/mean_length": 2549.9375, "completions/mean_terminated_length": 2549.9375, "completions/min_length": 2373.0, "completions/min_terminated_length": 2373.0, "entropy": 0.3152090422809124, "epoch": 0.00334, "frac_reward_zero_std": 0.0, "grad_norm": 1.8376330137252808, "kl": 0.5024630185216665, "learning_rate": 9.999968220500388e-06, "loss": -0.0287, "num_tokens": 8678291.0, "reward": 0.012812500819563866, "reward_std": 0.03377830609679222, "rewards/rollout_reward_func/mean": 0.012812500819563866, "rewards/rollout_reward_func/std": 0.05075141042470932, "sampling/importance_sampling_ratio/max": 1.9678910970687866, "sampling/importance_sampling_ratio/mean": 0.9133018255233765, "sampling/importance_sampling_ratio/min": 0.3517675995826721, "sampling/sampling_logp_difference/max": 1.355534553527832, "sampling/sampling_logp_difference/mean": 0.06513936817646027, "step": 167, "step_time": 41.46052245599276 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.013671875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021484375, "entropy": 0.31694196164608, "epoch": 0.00336, "grad_norm": 1.6731157302856445, "kl": 0.4716432988643646, "learning_rate": 9.99996773346604e-06, "loss": -0.0335, "step": 168, "step_time": 8.273968965004315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2680.0, "completions/max_terminated_length": 2680.0, "completions/mean_length": 2482.625, "completions/mean_terminated_length": 2482.625, "completions/min_length": 1437.0, "completions/min_terminated_length": 1437.0, "entropy": 0.3059420734643936, "epoch": 0.00338, "frac_reward_zero_std": 0.0, "grad_norm": 1.5976994037628174, "kl": 0.37691117636859417, "learning_rate": 9.999967242728034e-06, "loss": -0.0395, "num_tokens": 8781391.0, "reward": 0.04149999842047691, "reward_std": 0.15404483675956726, "rewards/rollout_reward_func/mean": 0.04149999842047691, "rewards/rollout_reward_func/std": 0.25512754917144775, "sampling/importance_sampling_ratio/max": 2.3652420043945312, "sampling/importance_sampling_ratio/mean": 0.9670310020446777, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1599547863006592, "sampling/sampling_logp_difference/mean": 0.06705337762832642, "step": 169, "step_time": 39.37707168101042 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125, "entropy": 0.3081233687698841, "epoch": 0.0034, "grad_norm": 1.4047455787658691, "kl": 0.37214960530400276, "learning_rate": 9.999966748286364e-06, "loss": -0.0409, "step": 170, "step_time": 8.105316379995202 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 2585.71875, "completions/mean_terminated_length": 2585.71875, "completions/min_length": 2429.0, "completions/min_terminated_length": 2429.0, "entropy": 0.3306998014450073, "epoch": 0.00342, "frac_reward_zero_std": 0.0, "grad_norm": 1.908154845237732, "kl": 0.2468932829797268, "learning_rate": 9.999966250141033e-06, "loss": -0.0225, "num_tokens": 8887896.0, "reward": 0.024937499314546585, "reward_std": 0.04533165693283081, "rewards/rollout_reward_func/mean": 0.024937499314546585, "rewards/rollout_reward_func/std": 0.05147622898221016, "sampling/importance_sampling_ratio/max": 2.204131841659546, "sampling/importance_sampling_ratio/mean": 0.9951881170272827, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8882501125335693, "sampling/sampling_logp_difference/mean": 0.06724405288696289, "step": 171, "step_time": 41.23671616201318 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.33214887976646423, "epoch": 0.00344, "grad_norm": 1.7956488132476807, "kl": 0.26035095751285553, "learning_rate": 9.999965748292042e-06, "loss": -0.022, "step": 172, "step_time": 8.822865209003794 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.007134885177947581, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01299426017794758, "completions/clipped_ratio": 0.0, "completions/max_length": 2734.0, "completions/max_terminated_length": 2734.0, "completions/mean_length": 2594.1875, "completions/mean_terminated_length": 2594.1875, "completions/min_length": 2444.0, "completions/min_terminated_length": 2444.0, "entropy": 0.3237563855946064, "epoch": 0.00346, "frac_reward_zero_std": 0.0, "grad_norm": 1.5233908891677856, "kl": 0.5627510249614716, "learning_rate": 9.999965242739394e-06, "loss": -0.0095, "num_tokens": 8994761.0, "reward": 0.01656249910593033, "reward_std": 0.06224355101585388, "rewards/rollout_reward_func/mean": 0.01656249910593033, "rewards/rollout_reward_func/std": 0.06737517565488815, "sampling/importance_sampling_ratio/max": 2.673325777053833, "sampling/importance_sampling_ratio/mean": 0.9175304770469666, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.002936363220215, "sampling/sampling_logp_difference/mean": 0.09262394905090332, "step": 173, "step_time": 41.60413134700502 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005181760177947581, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00908801017794758, "entropy": 0.32536034286022186, "epoch": 0.00348, "grad_norm": 1.4691952466964722, "kl": 0.5308522153645754, "learning_rate": 9.999964733483082e-06, "loss": -0.0117, "step": 174, "step_time": 8.211871264997171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004557291744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004557291744276881, "completions/clipped_ratio": 0.0, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 2492.5, "completions/mean_terminated_length": 2492.5, "completions/min_length": 1020.0, "completions/min_terminated_length": 1020.0, "entropy": 0.3340105786919594, "epoch": 0.0035, "frac_reward_zero_std": 0.0, "grad_norm": 1.8298368453979492, "kl": 0.3679938018321991, "learning_rate": 9.999964220523113e-06, "loss": -0.0572, "num_tokens": 9098126.0, "reward": -0.07824999839067459, "reward_std": 0.1797890067100525, "rewards/rollout_reward_func/mean": -0.07824999839067459, "rewards/rollout_reward_func/std": 0.31904685497283936, "sampling/importance_sampling_ratio/max": 1.841526985168457, "sampling/importance_sampling_ratio/mean": 0.9063608050346375, "sampling/importance_sampling_ratio/min": 0.19973398745059967, "sampling/sampling_logp_difference/max": 1.0407519340515137, "sampling/sampling_logp_difference/mean": 0.06615821272134781, "step": 175, "step_time": 39.59234012100205 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0072180707938969135, "clip_ratio/low_mean": 0.004557291744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011775362538173795, "entropy": 0.33797262609004974, "epoch": 0.00352, "grad_norm": 1.5984530448913574, "kl": 0.36107587814331055, "learning_rate": 9.999963703859486e-06, "loss": -0.0569, "step": 176, "step_time": 8.214162019983632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2752.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 2572.03125, "completions/mean_terminated_length": 2572.03125, "completions/min_length": 2314.0, "completions/min_terminated_length": 2314.0, "entropy": 0.3543061949312687, "epoch": 0.00354, "frac_reward_zero_std": 0.0, "grad_norm": 1.5375295877456665, "kl": 0.2578230034559965, "learning_rate": 9.999963183492201e-06, "loss": -0.1343, "num_tokens": 9203871.0, "reward": -0.027187500149011612, "reward_std": 0.11736124753952026, "rewards/rollout_reward_func/mean": -0.027187500149011612, "rewards/rollout_reward_func/std": 0.2549792528152466, "sampling/importance_sampling_ratio/max": 2.8757715225219727, "sampling/importance_sampling_ratio/mean": 0.8799307346343994, "sampling/importance_sampling_ratio/min": 0.1571628898382187, "sampling/sampling_logp_difference/max": 1.2581915855407715, "sampling/sampling_logp_difference/mean": 0.07375901937484741, "step": 177, "step_time": 41.12113772200246 }, { "clip_ratio/high_max": 0.015506628900766373, "clip_ratio/high_mean": 0.007753314450383186, "clip_ratio/low_mean": 0.0038470644503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011600378900766373, "entropy": 0.3563588745892048, "epoch": 0.00356, "grad_norm": 1.5801756381988525, "kl": 0.24894549511373043, "learning_rate": 9.999962659421257e-06, "loss": -0.1372, "step": 178, "step_time": 9.163750717998482 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.03125, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 2544.84375, "completions/mean_terminated_length": 2544.0, "completions/min_length": 2408.0, "completions/min_terminated_length": 2408.0, "entropy": 0.35032501444220543, "epoch": 0.00358, "frac_reward_zero_std": 0.0, "grad_norm": 1.473603367805481, "kl": 0.3438793905079365, "learning_rate": 9.999962131646657e-06, "loss": -0.0244, "num_tokens": 9308232.0, "reward": 0.026249999180436134, "reward_std": 0.0644674301147461, "rewards/rollout_reward_func/mean": 0.026249999180436134, "rewards/rollout_reward_func/std": 0.06804884225130081, "sampling/importance_sampling_ratio/max": 2.7217800617218018, "sampling/importance_sampling_ratio/mean": 1.1029736995697021, "sampling/importance_sampling_ratio/min": 0.15165719389915466, "sampling/sampling_logp_difference/max": 1.3789944648742676, "sampling/sampling_logp_difference/mean": 0.06541239470243454, "step": 179, "step_time": 40.59504781400028 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.35000326856970787, "epoch": 0.0036, "grad_norm": 1.4767123460769653, "kl": 0.3850640542805195, "learning_rate": 9.999961600168402e-06, "loss": -0.0251, "step": 180, "step_time": 8.215517988006468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 2507.625, "completions/mean_terminated_length": 2507.625, "completions/min_length": 1636.0, "completions/min_terminated_length": 1636.0, "entropy": 0.3719157911837101, "epoch": 0.00362, "frac_reward_zero_std": 0.0, "grad_norm": 1.799222469329834, "kl": 0.2718762047588825, "learning_rate": 9.99996106498649e-06, "loss": -0.0178, "num_tokens": 9412169.0, "reward": 0.062187496572732925, "reward_std": 0.12166280299425125, "rewards/rollout_reward_func/mean": 0.062187496572732925, "rewards/rollout_reward_func/std": 0.23215307295322418, "sampling/importance_sampling_ratio/max": 2.1873276233673096, "sampling/importance_sampling_ratio/mean": 1.0135772228240967, "sampling/importance_sampling_ratio/min": 0.2740500867366791, "sampling/sampling_logp_difference/max": 0.9779013991355896, "sampling/sampling_logp_difference/mean": 0.061763741075992584, "step": 181, "step_time": 41.3249060379967 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.013671875, "clip_ratio/low_mean": 0.00569196417927742, "clip_ratio/low_min": 0.0035714285913854837, "clip_ratio/region_mean": 0.01936383917927742, "entropy": 0.3769003711640835, "epoch": 0.00364, "grad_norm": 1.6409553289413452, "kl": 0.26755072735249996, "learning_rate": 9.999960526100922e-06, "loss": -0.0218, "step": 182, "step_time": 8.139846026009764 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 2519.4375, "completions/mean_terminated_length": 2519.4375, "completions/min_length": 2406.0, "completions/min_terminated_length": 2406.0, "entropy": 0.3503524139523506, "epoch": 0.00366, "frac_reward_zero_std": 0.0, "grad_norm": 1.1418262720108032, "kl": 0.27561422996222973, "learning_rate": 9.9999599835117e-06, "loss": -0.0738, "num_tokens": 9516483.0, "reward": 0.021562498062849045, "reward_std": 0.03244706243276596, "rewards/rollout_reward_func/mean": 0.021562498062849045, "rewards/rollout_reward_func/std": 0.04205291345715523, "sampling/importance_sampling_ratio/max": 1.8527733087539673, "sampling/importance_sampling_ratio/mean": 0.8473471403121948, "sampling/importance_sampling_ratio/min": 0.31423965096473694, "sampling/sampling_logp_difference/max": 0.8821654319763184, "sampling/sampling_logp_difference/mean": 0.0577574148774147, "step": 183, "step_time": 40.83836308600439 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.3548332415521145, "epoch": 0.00368, "grad_norm": 1.1759247779846191, "kl": 0.2614702060818672, "learning_rate": 9.999959437218823e-06, "loss": -0.0759, "step": 184, "step_time": 8.55310592699243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00463598920032382, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00463598920032382, "completions/clipped_ratio": 0.0, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 2476.6875, "completions/mean_terminated_length": 2476.6875, "completions/min_length": 1306.0, "completions/min_terminated_length": 1306.0, "entropy": 0.3436601832509041, "epoch": 0.0037, "frac_reward_zero_std": 0.0, "grad_norm": 1.3706753253936768, "kl": 0.30720300413668156, "learning_rate": 9.999958887222293e-06, "loss": 0.0834, "num_tokens": 9619759.0, "reward": 0.03656249865889549, "reward_std": 0.24551644921302795, "rewards/rollout_reward_func/mean": 0.03656249865889549, "rewards/rollout_reward_func/std": 0.34222203493118286, "sampling/importance_sampling_ratio/max": 2.1588592529296875, "sampling/importance_sampling_ratio/mean": 0.9483487606048584, "sampling/importance_sampling_ratio/min": 0.1842896193265915, "sampling/sampling_logp_difference/max": 0.9215145111083984, "sampling/sampling_logp_difference/mean": 0.05946381390094757, "step": 185, "step_time": 38.045946442005516 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.013671875, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015510110300965607, "entropy": 0.35040333122015, "epoch": 0.00372, "grad_norm": 1.1458492279052734, "kl": 0.2852174621075392, "learning_rate": 9.999958333522109e-06, "loss": 0.0825, "step": 186, "step_time": 8.192382211011136 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 2569.78125, "completions/mean_terminated_length": 2569.78125, "completions/min_length": 2393.0, "completions/min_terminated_length": 2393.0, "entropy": 0.3721434064209461, "epoch": 0.00374, "frac_reward_zero_std": 0.0, "grad_norm": 1.8181395530700684, "kl": 0.2232329212129116, "learning_rate": 9.999957776118273e-06, "loss": -0.0295, "num_tokens": 9726042.0, "reward": 0.03468749672174454, "reward_std": 0.06791973114013672, "rewards/rollout_reward_func/mean": 0.03468749672174454, "rewards/rollout_reward_func/std": 0.07025227695703506, "sampling/importance_sampling_ratio/max": 1.8577862977981567, "sampling/importance_sampling_ratio/mean": 1.1377999782562256, "sampling/importance_sampling_ratio/min": 0.3496271073818207, "sampling/sampling_logp_difference/max": 0.7724575996398926, "sampling/sampling_logp_difference/mean": 0.05710921436548233, "step": 187, "step_time": 41.54417477098468 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.37350033223629, "epoch": 0.00376, "grad_norm": 1.7502919435501099, "kl": 0.22246569395065308, "learning_rate": 9.999957215010786e-06, "loss": -0.0304, "step": 188, "step_time": 8.093648672998825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 2539.53125, "completions/mean_terminated_length": 2536.806396484375, "completions/min_length": 1820.0, "completions/min_terminated_length": 1820.0, "entropy": 0.36657416820526123, "epoch": 0.00378, "frac_reward_zero_std": 0.0, "grad_norm": 1.3249930143356323, "kl": 0.2342336755245924, "learning_rate": 9.999956650199647e-06, "loss": -0.1187, "num_tokens": 9830471.0, "reward": -0.054999999701976776, "reward_std": 0.1698419749736786, "rewards/rollout_reward_func/mean": -0.054999999701976776, "rewards/rollout_reward_func/std": 0.27874141931533813, "sampling/importance_sampling_ratio/max": 1.967926025390625, "sampling/importance_sampling_ratio/mean": 0.9288737773895264, "sampling/importance_sampling_ratio/min": 0.30164018273353577, "sampling/sampling_logp_difference/max": 0.8526759147644043, "sampling/sampling_logp_difference/mean": 0.05662436783313751, "step": 189, "step_time": 39.74040631599928 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.36383185535669327, "epoch": 0.0038, "grad_norm": 1.2447773218154907, "kl": 0.2557512894272804, "learning_rate": 9.999956081684854e-06, "loss": -0.1228, "step": 190, "step_time": 8.122944991009717 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 2498.125, "completions/mean_terminated_length": 2498.125, "completions/min_length": 1619.0, "completions/min_terminated_length": 1619.0, "entropy": 0.35211504995822906, "epoch": 0.00382, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515205383300781, "kl": 0.22487179283052683, "learning_rate": 9.999955509466414e-06, "loss": -0.0227, "num_tokens": 9933907.0, "reward": 0.04243750125169754, "reward_std": 0.2674861550331116, "rewards/rollout_reward_func/mean": 0.04243750125169754, "rewards/rollout_reward_func/std": 0.3605915307998657, "sampling/importance_sampling_ratio/max": 2.468160629272461, "sampling/importance_sampling_ratio/mean": 0.975672721862793, "sampling/importance_sampling_ratio/min": 0.19411441683769226, "sampling/sampling_logp_difference/max": 0.8760786056518555, "sampling/sampling_logp_difference/mean": 0.05464397370815277, "step": 191, "step_time": 39.224735490002786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.3466275744140148, "epoch": 0.00384, "grad_norm": 1.3361579179763794, "kl": 0.23413463309407234, "learning_rate": 9.999954933544324e-06, "loss": -0.025, "step": 192, "step_time": 8.201387897010136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2755.0, "completions/max_terminated_length": 2755.0, "completions/mean_length": 2555.09375, "completions/mean_terminated_length": 2555.09375, "completions/min_length": 2369.0, "completions/min_terminated_length": 2369.0, "entropy": 0.35569561645388603, "epoch": 0.00386, "frac_reward_zero_std": 0.0, "grad_norm": 1.7639473676681519, "kl": 0.23214801028370857, "learning_rate": 9.999954353918583e-06, "loss": 0.005, "num_tokens": 10039509.0, "reward": 0.02437499910593033, "reward_std": 0.04079621285200119, "rewards/rollout_reward_func/mean": 0.02437499910593033, "rewards/rollout_reward_func/std": 0.04737887531518936, "sampling/importance_sampling_ratio/max": 2.841304063796997, "sampling/importance_sampling_ratio/mean": 1.0419715642929077, "sampling/importance_sampling_ratio/min": 0.37415075302124023, "sampling/sampling_logp_difference/max": 0.682642936706543, "sampling/sampling_logp_difference/mean": 0.055786363780498505, "step": 193, "step_time": 41.76466753900604 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.3506021201610565, "epoch": 0.00388, "grad_norm": 1.7473597526550293, "kl": 0.23999202623963356, "learning_rate": 9.999953770589195e-06, "loss": 0.0053, "step": 194, "step_time": 8.263513790996512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 2538.0625, "completions/mean_terminated_length": 2538.0625, "completions/min_length": 1788.0, "completions/min_terminated_length": 1788.0, "entropy": 0.37049463018774986, "epoch": 0.0039, "frac_reward_zero_std": 0.0, "grad_norm": 1.8461344242095947, "kl": 0.36803081817924976, "learning_rate": 9.999953183556157e-06, "loss": -0.0015, "num_tokens": 10144288.0, "reward": -0.03406250476837158, "reward_std": 0.18202264606952667, "rewards/rollout_reward_func/mean": -0.03406250476837158, "rewards/rollout_reward_func/std": 0.2601347267627716, "sampling/importance_sampling_ratio/max": 2.4161949157714844, "sampling/importance_sampling_ratio/mean": 1.04921293258667, "sampling/importance_sampling_ratio/min": 0.33067557215690613, "sampling/sampling_logp_difference/max": 0.6486377716064453, "sampling/sampling_logp_difference/mean": 0.059844404458999634, "step": 195, "step_time": 41.45346045300539 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.36830903589725494, "epoch": 0.00392, "grad_norm": 1.9099000692367554, "kl": 0.3604978770017624, "learning_rate": 9.999952592819472e-06, "loss": -0.0066, "step": 196, "step_time": 8.190971479001746 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2717.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 2420.9375, "completions/mean_terminated_length": 2420.9375, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "entropy": 0.35077469423413277, "epoch": 0.00394, "frac_reward_zero_std": 0.0, "grad_norm": 2.002080202102661, "kl": 0.3194318525493145, "learning_rate": 9.999951998379141e-06, "loss": -0.2282, "num_tokens": 10245383.0, "reward": -0.0068125054240226746, "reward_std": 0.19143234193325043, "rewards/rollout_reward_func/mean": -0.0068125054240226746, "rewards/rollout_reward_func/std": 0.3449188470840454, "sampling/importance_sampling_ratio/max": 2.887979507446289, "sampling/importance_sampling_ratio/mean": 1.1211514472961426, "sampling/importance_sampling_ratio/min": 0.1755070835351944, "sampling/sampling_logp_difference/max": 1.1164432764053345, "sampling/sampling_logp_difference/mean": 0.06820495426654816, "step": 197, "step_time": 38.74268743399443 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.3482266962528229, "epoch": 0.00396, "grad_norm": 1.8061574697494507, "kl": 0.3401306103914976, "learning_rate": 9.999951400235163e-06, "loss": -0.2299, "step": 198, "step_time": 8.126102788002754 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 2516.75, "completions/mean_terminated_length": 2516.75, "completions/min_length": 2034.0, "completions/min_terminated_length": 2034.0, "entropy": 0.342928946018219, "epoch": 0.00398, "frac_reward_zero_std": 0.0, "grad_norm": 1.2324050664901733, "kl": 0.3182620648294687, "learning_rate": 9.999950798387541e-06, "loss": -0.0755, "num_tokens": 10349358.0, "reward": 0.11312500387430191, "reward_std": 0.31299662590026855, "rewards/rollout_reward_func/mean": 0.11312500387430191, "rewards/rollout_reward_func/std": 0.39529845118522644, "sampling/importance_sampling_ratio/max": 2.2911791801452637, "sampling/importance_sampling_ratio/mean": 0.9121253490447998, "sampling/importance_sampling_ratio/min": 0.1949577033519745, "sampling/sampling_logp_difference/max": 1.2254157066345215, "sampling/sampling_logp_difference/mean": 0.05940123647451401, "step": 199, "step_time": 38.75293362401135 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.33788008987903595, "epoch": 0.004, "grad_norm": 1.184295415878296, "kl": 0.3464464507997036, "learning_rate": 9.999950192836272e-06, "loss": -0.0787, "step": 200, "step_time": 8.536211900005583 }, { "clip_ratio/high_max": 0.015395220601931214, "clip_ratio/high_mean": 0.007697610300965607, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011603860300965607, "completions/clipped_ratio": 0.0, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 2531.75, "completions/mean_terminated_length": 2531.75, "completions/min_length": 2288.0, "completions/min_terminated_length": 2288.0, "entropy": 0.3628573678433895, "epoch": 0.00402, "frac_reward_zero_std": 0.0, "grad_norm": 1.393696665763855, "kl": 0.4442015439271927, "learning_rate": 9.999949583581358e-06, "loss": -0.1296, "num_tokens": 10453571.0, "reward": 0.03125, "reward_std": 0.04873351752758026, "rewards/rollout_reward_func/mean": 0.03125, "rewards/rollout_reward_func/std": 0.060147665441036224, "sampling/importance_sampling_ratio/max": 2.2512755393981934, "sampling/importance_sampling_ratio/mean": 0.9395947456359863, "sampling/importance_sampling_ratio/min": 0.24555900692939758, "sampling/sampling_logp_difference/max": 0.942570686340332, "sampling/sampling_logp_difference/mean": 0.07334975898265839, "step": 201, "step_time": 42.58854235501349 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00746193912345916, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01332131412345916, "entropy": 0.3568928651511669, "epoch": 0.00404, "grad_norm": 1.1539993286132812, "kl": 0.446804903447628, "learning_rate": 9.999948970622801e-06, "loss": -0.1318, "step": 202, "step_time": 8.253893647997756 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2730.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 2574.71875, "completions/mean_terminated_length": 2574.71875, "completions/min_length": 2381.0, "completions/min_terminated_length": 2381.0, "entropy": 0.3213490657508373, "epoch": 0.00406, "frac_reward_zero_std": 0.0, "grad_norm": 1.6174315214157104, "kl": 0.386203370988369, "learning_rate": 9.9999483539606e-06, "loss": -0.0016, "num_tokens": 10559322.0, "reward": 0.046687498688697815, "reward_std": 0.05125562474131584, "rewards/rollout_reward_func/mean": 0.046687498688697815, "rewards/rollout_reward_func/std": 0.062241170555353165, "sampling/importance_sampling_ratio/max": 2.171849012374878, "sampling/importance_sampling_ratio/mean": 0.9831439852714539, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.944068431854248, "sampling/sampling_logp_difference/mean": 0.06549233198165894, "step": 203, "step_time": 40.6149429290017 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "entropy": 0.3156820274889469, "epoch": 0.00408, "grad_norm": 1.6070283651351929, "kl": 0.4013451524078846, "learning_rate": 9.999947733594757e-06, "loss": -0.0065, "step": 204, "step_time": 8.203901119995862 }, { "clip_ratio/high_max": 0.007694128900766373, "clip_ratio/high_mean": 0.0038470644503831863, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009706439450383186, "completions/clipped_ratio": 0.0, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 2486.21875, "completions/mean_terminated_length": 2486.21875, "completions/min_length": 1017.0, "completions/min_terminated_length": 1017.0, "entropy": 0.3160020150244236, "epoch": 0.0041, "frac_reward_zero_std": 0.0, "grad_norm": 1.2670780420303345, "kl": 0.41101067140698433, "learning_rate": 9.99994710952527e-06, "loss": -0.1007, "num_tokens": 10662392.0, "reward": -0.006562499329447746, "reward_std": 0.11399046331644058, "rewards/rollout_reward_func/mean": -0.006562499329447746, "rewards/rollout_reward_func/std": 0.19189396500587463, "sampling/importance_sampling_ratio/max": 2.0264596939086914, "sampling/importance_sampling_ratio/mean": 0.8527935147285461, "sampling/importance_sampling_ratio/min": 0.07731558382511139, "sampling/sampling_logp_difference/max": 1.2514522075653076, "sampling/sampling_logp_difference/mean": 0.07297109067440033, "step": 205, "step_time": 40.33754772198881 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31296201795339584, "epoch": 0.00412, "grad_norm": 1.2826589345932007, "kl": 0.4051324315369129, "learning_rate": 9.999946481752143e-06, "loss": -0.1028, "step": 206, "step_time": 8.621249777992489 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.006138392956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008091517956927419, "completions/clipped_ratio": 0.03125, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 2496.40625, "completions/mean_terminated_length": 2496.386962890625, "completions/min_length": 1286.0, "completions/min_terminated_length": 1286.0, "entropy": 0.30267368629574776, "epoch": 0.00414, "frac_reward_zero_std": 0.0, "grad_norm": 1.4600504636764526, "kl": 0.24276616983115673, "learning_rate": 9.999945850275376e-06, "loss": 0.0199, "num_tokens": 10765406.0, "reward": 0.02281249687075615, "reward_std": 0.2987138628959656, "rewards/rollout_reward_func/mean": 0.02281249687075615, "rewards/rollout_reward_func/std": 0.40383031964302063, "sampling/importance_sampling_ratio/max": 2.8048996925354004, "sampling/importance_sampling_ratio/mean": 1.0050692558288574, "sampling/importance_sampling_ratio/min": 0.34381619095802307, "sampling/sampling_logp_difference/max": 0.9117152690887451, "sampling/sampling_logp_difference/mean": 0.0614398792386055, "step": 207, "step_time": 39.424753070990846 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.010044642956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01981026795692742, "entropy": 0.3010764829814434, "epoch": 0.00416, "grad_norm": 1.2748208045959473, "kl": 0.2418710347265005, "learning_rate": 9.999945215094968e-06, "loss": 0.016, "step": 208, "step_time": 8.264535922004143 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005989583441987634, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011848958441987634, "completions/clipped_ratio": 0.0, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 2515.71875, "completions/mean_terminated_length": 2515.71875, "completions/min_length": 1839.0, "completions/min_terminated_length": 1839.0, "entropy": 0.29903779923915863, "epoch": 0.00418, "frac_reward_zero_std": 0.0, "grad_norm": 1.4785264730453491, "kl": 0.3483623880892992, "learning_rate": 9.99994457621092e-06, "loss": -0.0908, "num_tokens": 10869297.0, "reward": 0.005625000223517418, "reward_std": 0.10760631412267685, "rewards/rollout_reward_func/mean": 0.005625000223517418, "rewards/rollout_reward_func/std": 0.19536462426185608, "sampling/importance_sampling_ratio/max": 2.326657772064209, "sampling/importance_sampling_ratio/mean": 1.0822689533233643, "sampling/importance_sampling_ratio/min": 0.0710102990269661, "sampling/sampling_logp_difference/max": 1.217003583908081, "sampling/sampling_logp_difference/mean": 0.0704931765794754, "step": 209, "step_time": 41.126067208992026 }, { "clip_ratio/high_max": 0.01907169120386243, "clip_ratio/high_mean": 0.011488970601931214, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015395220601931214, "entropy": 0.295239444822073, "epoch": 0.0042, "grad_norm": 2.135641574859619, "kl": 0.35493278689682484, "learning_rate": 9.999943933623233e-06, "loss": -0.0943, "step": 210, "step_time": 8.173974077988532 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.006138392956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010044642956927419, "completions/clipped_ratio": 0.0, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 2539.84375, "completions/mean_terminated_length": 2539.84375, "completions/min_length": 1788.0, "completions/min_terminated_length": 1788.0, "entropy": 0.2810978293418884, "epoch": 0.00422, "frac_reward_zero_std": 0.0, "grad_norm": 1.446002721786499, "kl": 0.5180637389421463, "learning_rate": 9.999943287331909e-06, "loss": -0.0502, "num_tokens": 10973915.0, "reward": -0.04500000178813934, "reward_std": 0.17551785707473755, "rewards/rollout_reward_func/mean": -0.04500000178813934, "rewards/rollout_reward_func/std": 0.26568230986595154, "sampling/importance_sampling_ratio/max": 1.8341037034988403, "sampling/importance_sampling_ratio/mean": 0.8888924717903137, "sampling/importance_sampling_ratio/min": 0.3128281831741333, "sampling/sampling_logp_difference/max": 1.6654303073883057, "sampling/sampling_logp_difference/mean": 0.07020728290081024, "step": 211, "step_time": 39.432713571011845 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.014229910913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018136160913854837, "entropy": 0.2733327057212591, "epoch": 0.00424, "grad_norm": 1.2415473461151123, "kl": 0.5663974024355412, "learning_rate": 9.999942637336943e-06, "loss": -0.0513, "step": 212, "step_time": 8.662085884992848 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005078124813735485, "completions/clipped_ratio": 0.0, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 2471.5625, "completions/mean_terminated_length": 2471.5625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "entropy": 0.25025843642652035, "epoch": 0.00426, "frac_reward_zero_std": 0.0, "grad_norm": 2.2709338665008545, "kl": 0.821235241368413, "learning_rate": 9.999941983638343e-06, "loss": -0.0337, "num_tokens": 11077168.0, "reward": 0.01850000023841858, "reward_std": 0.2203158140182495, "rewards/rollout_reward_func/mean": 0.01850000023841858, "rewards/rollout_reward_func/std": 0.31408196687698364, "sampling/importance_sampling_ratio/max": 2.0936169624328613, "sampling/importance_sampling_ratio/mean": 0.9659475088119507, "sampling/importance_sampling_ratio/min": 0.06496711075305939, "sampling/sampling_logp_difference/max": 2.544590473175049, "sampling/sampling_logp_difference/mean": 0.059280212968587875, "step": 213, "step_time": 38.46859342800599 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.24745017662644386, "epoch": 0.00428, "grad_norm": 1.4903650283813477, "kl": 0.577656701207161, "learning_rate": 9.999941326236106e-06, "loss": -0.037, "step": 214, "step_time": 8.155131259009067 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 2575.0625, "completions/mean_terminated_length": 2575.0625, "completions/min_length": 2392.0, "completions/min_terminated_length": 2392.0, "entropy": 0.2778208311647177, "epoch": 0.0043, "frac_reward_zero_std": 0.0, "grad_norm": 1.7159686088562012, "kl": 0.7443384360522032, "learning_rate": 9.999940665130233e-06, "loss": 0.0089, "num_tokens": 11182822.0, "reward": -0.024375002831220627, "reward_std": 0.10185873508453369, "rewards/rollout_reward_func/mean": -0.024375002831220627, "rewards/rollout_reward_func/std": 0.18804490566253662, "sampling/importance_sampling_ratio/max": 2.0679190158843994, "sampling/importance_sampling_ratio/mean": 0.9192431569099426, "sampling/importance_sampling_ratio/min": 0.14288972318172455, "sampling/sampling_logp_difference/max": 1.4988486766815186, "sampling/sampling_logp_difference/mean": 0.08602313697338104, "step": 215, "step_time": 40.16992479900364 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.28077778965234756, "epoch": 0.00432, "grad_norm": 1.4250431060791016, "kl": 0.6058767847716808, "learning_rate": 9.999940000320726e-06, "loss": 0.006, "step": 216, "step_time": 8.206848283996806 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 2536.84375, "completions/mean_terminated_length": 2536.84375, "completions/min_length": 2381.0, "completions/min_terminated_length": 2381.0, "entropy": 0.28082339093089104, "epoch": 0.00434, "frac_reward_zero_std": 0.125, "grad_norm": 1.5246869325637817, "kl": 0.5135170202702284, "learning_rate": 9.999939331807582e-06, "loss": -0.1763, "num_tokens": 11287279.0, "reward": 0.017500000074505806, "reward_std": 0.04132015258073807, "rewards/rollout_reward_func/mean": 0.017500000074505806, "rewards/rollout_reward_func/std": 0.052915021777153015, "sampling/importance_sampling_ratio/max": 2.8221874237060547, "sampling/importance_sampling_ratio/mean": 0.8796664476394653, "sampling/importance_sampling_ratio/min": 0.07916179299354553, "sampling/sampling_logp_difference/max": 1.6334364414215088, "sampling/sampling_logp_difference/mean": 0.0864379033446312, "step": 217, "step_time": 41.50737186798506 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.2852325811982155, "epoch": 0.00436, "grad_norm": 1.5693947076797485, "kl": 0.46704200468957424, "learning_rate": 9.999938659590807e-06, "loss": -0.1787, "step": 218, "step_time": 8.133669030990859 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 2510.1875, "completions/mean_terminated_length": 2510.1875, "completions/min_length": 1017.0, "completions/min_terminated_length": 1017.0, "entropy": 0.28106593526899815, "epoch": 0.00438, "frac_reward_zero_std": 0.0, "grad_norm": 1.543866753578186, "kl": 0.582334304228425, "learning_rate": 9.999937983670399e-06, "loss": -0.004, "num_tokens": 11391012.0, "reward": 0.00937499850988388, "reward_std": 0.11015652120113373, "rewards/rollout_reward_func/mean": 0.00937499850988388, "rewards/rollout_reward_func/std": 0.19029581546783447, "sampling/importance_sampling_ratio/max": 2.0876622200012207, "sampling/importance_sampling_ratio/mean": 0.9376651048660278, "sampling/importance_sampling_ratio/min": 0.13859038054943085, "sampling/sampling_logp_difference/max": 1.4258623123168945, "sampling/sampling_logp_difference/mean": 0.07284407317638397, "step": 219, "step_time": 40.03303195400076 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.013671875, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125, "entropy": 0.282447412610054, "epoch": 0.0044, "grad_norm": 1.4197849035263062, "kl": 0.6128968577831984, "learning_rate": 9.999937304046356e-06, "loss": -0.0071, "step": 220, "step_time": 8.333434138992743 }, { "clip_ratio/high_max": 0.008072916883975267, "clip_ratio/high_mean": 0.005989583441987634, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005989583441987634, "completions/clipped_ratio": 0.0, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 2418.09375, "completions/mean_terminated_length": 2418.09375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "entropy": 0.2881895024329424, "epoch": 0.00442, "frac_reward_zero_std": 0.0, "grad_norm": 1.5842020511627197, "kl": 0.5021013058722019, "learning_rate": 9.99993662071868e-06, "loss": 0.0788, "num_tokens": 11492112.0, "reward": 0.0020000003278255463, "reward_std": 0.23710773885250092, "rewards/rollout_reward_func/mean": 0.0020000003278255463, "rewards/rollout_reward_func/std": 0.3416566848754883, "sampling/importance_sampling_ratio/max": 2.047086715698242, "sampling/importance_sampling_ratio/mean": 0.8008227944374084, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.5975122451782227, "sampling/sampling_logp_difference/mean": 0.07265321910381317, "step": 221, "step_time": 39.7871799089844 }, { "clip_ratio/high_max": 0.0360576924867928, "clip_ratio/high_mean": 0.025841346010565758, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025841346010565758, "entropy": 0.2943236604332924, "epoch": 0.00444, "grad_norm": 1.1485141515731812, "kl": 0.37606994807720184, "learning_rate": 9.999935933687375e-06, "loss": 0.0736, "step": 222, "step_time": 8.64790611598437 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2734.0, "completions/max_terminated_length": 2734.0, "completions/mean_length": 2529.90625, "completions/mean_terminated_length": 2529.90625, "completions/min_length": 1315.0, "completions/min_terminated_length": 1315.0, "entropy": 0.3002755120396614, "epoch": 0.00446, "frac_reward_zero_std": 0.0, "grad_norm": 1.6034713983535767, "kl": 0.38438911363482475, "learning_rate": 9.99993524295244e-06, "loss": 0.0235, "num_tokens": 11596899.0, "reward": -0.016437498852610588, "reward_std": 0.10589368641376495, "rewards/rollout_reward_func/mean": -0.016437498852610588, "rewards/rollout_reward_func/std": 0.19297447800636292, "sampling/importance_sampling_ratio/max": 2.1702961921691895, "sampling/importance_sampling_ratio/mean": 0.8773471117019653, "sampling/importance_sampling_ratio/min": 0.1552795171737671, "sampling/sampling_logp_difference/max": 1.5045547485351562, "sampling/sampling_logp_difference/mean": 0.07015751302242279, "step": 223, "step_time": 41.07358841999667 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.013671875, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.025390625, "entropy": 0.30388905853033066, "epoch": 0.00448, "grad_norm": 1.6556518077850342, "kl": 0.3493770435452461, "learning_rate": 9.999934548513875e-06, "loss": 0.0229, "step": 224, "step_time": 8.234722809000232 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.03125, "completions/max_length": 2728.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 2476.4375, "completions/mean_terminated_length": 2477.51611328125, "completions/min_length": 1948.0, "completions/min_terminated_length": 1948.0, "entropy": 0.28977199271321297, "epoch": 0.0045, "frac_reward_zero_std": 0.0, "grad_norm": 1.9037975072860718, "kl": 0.3731118068099022, "learning_rate": 9.999933850371681e-06, "loss": -0.0358, "num_tokens": 11699859.0, "reward": 0.030812501907348633, "reward_std": 0.24771924316883087, "rewards/rollout_reward_func/mean": 0.030812501907348633, "rewards/rollout_reward_func/std": 0.3897912800312042, "sampling/importance_sampling_ratio/max": 2.3635151386260986, "sampling/importance_sampling_ratio/mean": 0.9108861684799194, "sampling/importance_sampling_ratio/min": 0.016115080565214157, "sampling/sampling_logp_difference/max": 1.8015650510787964, "sampling/sampling_logp_difference/mean": 0.06312668323516846, "step": 225, "step_time": 39.49097987600544 }, { "clip_ratio/high_max": 0.039583333767950535, "clip_ratio/high_mean": 0.021875000093132257, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.033593749860301614, "entropy": 0.28937546350061893, "epoch": 0.00452, "grad_norm": 1.3335111141204834, "kl": 0.38635273836553097, "learning_rate": 9.999933148525858e-06, "loss": -0.0415, "step": 226, "step_time": 8.190281943003356 }, { "clip_ratio/high_max": 0.018342391354963183, "clip_ratio/high_mean": 0.009171195677481592, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009171195677481592, "completions/clipped_ratio": 0.0, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 2423.1875, "completions/mean_terminated_length": 2423.1875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.32319737412035465, "epoch": 0.00454, "frac_reward_zero_std": 0.0, "grad_norm": 1.7308000326156616, "kl": 0.4490476939827204, "learning_rate": 9.999932442976408e-06, "loss": -0.046, "num_tokens": 11801299.0, "reward": -0.07231250405311584, "reward_std": 0.22803862392902374, "rewards/rollout_reward_func/mean": -0.07231250405311584, "rewards/rollout_reward_func/std": 0.3053916096687317, "sampling/importance_sampling_ratio/max": 2.4876017570495605, "sampling/importance_sampling_ratio/mean": 0.8915628790855408, "sampling/importance_sampling_ratio/min": 0.22262075543403625, "sampling/sampling_logp_difference/max": 1.2082533836364746, "sampling/sampling_logp_difference/mean": 0.05457788705825806, "step": 227, "step_time": 39.01180349101196 }, { "clip_ratio/high_max": 0.026154891354963183, "clip_ratio/high_mean": 0.015030570677481592, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020889945793896914, "entropy": 0.322303120046854, "epoch": 0.00456, "grad_norm": 1.5136457681655884, "kl": 0.46418991312384605, "learning_rate": 9.99993173372333e-06, "loss": -0.0493, "step": 228, "step_time": 9.05183739597851 }, { "clip_ratio/high_max": 0.009114583488553762, "clip_ratio/high_mean": 0.006510416744276881, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006510416744276881, "completions/clipped_ratio": 0.0, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 2484.28125, "completions/mean_terminated_length": 2484.28125, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "entropy": 0.28009452298283577, "epoch": 0.00458, "frac_reward_zero_std": 0.0, "grad_norm": 1.4699045419692993, "kl": 0.3860022109001875, "learning_rate": 9.999931020766626e-06, "loss": 0.0266, "num_tokens": 11904330.0, "reward": 0.022187501192092896, "reward_std": 0.1748497486114502, "rewards/rollout_reward_func/mean": 0.022187501192092896, "rewards/rollout_reward_func/std": 0.28647732734680176, "sampling/importance_sampling_ratio/max": 2.1559696197509766, "sampling/importance_sampling_ratio/mean": 0.9133785963058472, "sampling/importance_sampling_ratio/min": 0.10093791782855988, "sampling/sampling_logp_difference/max": 1.0922698974609375, "sampling/sampling_logp_difference/mean": 0.05431393161416054, "step": 229, "step_time": 39.775225694000255 }, { "clip_ratio/high_max": 0.028645833488553762, "clip_ratio/high_mean": 0.014322916744276881, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01627604174427688, "entropy": 0.2788592744618654, "epoch": 0.0046, "grad_norm": 1.1860452890396118, "kl": 0.34408247936517, "learning_rate": 9.999930304106296e-06, "loss": 0.0194, "step": 230, "step_time": 8.248418390990992 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 2515.53125, "completions/mean_terminated_length": 2515.53125, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "entropy": 0.27757587283849716, "epoch": 0.00462, "frac_reward_zero_std": 0.0, "grad_norm": 1.5308306217193604, "kl": 0.2734272815287113, "learning_rate": 9.99992958374234e-06, "loss": -0.1329, "num_tokens": 12008684.0, "reward": -0.022812500596046448, "reward_std": 0.12868362665176392, "rewards/rollout_reward_func/mean": -0.022812500596046448, "rewards/rollout_reward_func/std": 0.18984195590019226, "sampling/importance_sampling_ratio/max": 1.9966732263565063, "sampling/importance_sampling_ratio/mean": 0.9174094796180725, "sampling/importance_sampling_ratio/min": 0.1465490758419037, "sampling/sampling_logp_difference/max": 1.04158353805542, "sampling/sampling_logp_difference/mean": 0.05047091096639633, "step": 231, "step_time": 41.81639621800423 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.0038470644503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013612689450383186, "entropy": 0.27455075457692146, "epoch": 0.00464, "grad_norm": 1.48916494846344, "kl": 0.2888593841344118, "learning_rate": 9.999928859674762e-06, "loss": -0.135, "step": 232, "step_time": 8.280193610000424 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2694.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 2548.03125, "completions/mean_terminated_length": 2548.03125, "completions/min_length": 2429.0, "completions/min_terminated_length": 2429.0, "entropy": 0.29389649629592896, "epoch": 0.00466, "frac_reward_zero_std": 0.0, "grad_norm": 1.9648756980895996, "kl": 0.4565126970410347, "learning_rate": 9.999928131903557e-06, "loss": -0.0874, "num_tokens": 12114168.0, "reward": 0.008375001139938831, "reward_std": 0.0767504870891571, "rewards/rollout_reward_func/mean": 0.008375001139938831, "rewards/rollout_reward_func/std": 0.08380843698978424, "sampling/importance_sampling_ratio/max": 1.532104253768921, "sampling/importance_sampling_ratio/mean": 0.8359918594360352, "sampling/importance_sampling_ratio/min": 0.06460016965866089, "sampling/sampling_logp_difference/max": 1.3589129447937012, "sampling/sampling_logp_difference/mean": 0.05609213560819626, "step": 233, "step_time": 42.033232877001865 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.013556985184550285, "clip_ratio/low_mean": 0.01953125, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.033088235184550285, "entropy": 0.2880493104457855, "epoch": 0.00468, "grad_norm": 1.3010988235473633, "kl": 0.5239596497267485, "learning_rate": 9.999927400428733e-06, "loss": -0.0924, "step": 234, "step_time": 8.519997568997496 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 2655.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 2469.875, "completions/mean_terminated_length": 2469.875, "completions/min_length": 1325.0, "completions/min_terminated_length": 1325.0, "entropy": 0.25798671692609787, "epoch": 0.0047, "frac_reward_zero_std": 0.0, "grad_norm": 1.7982852458953857, "kl": 0.40759785287082195, "learning_rate": 9.999926665250287e-06, "loss": -0.0376, "num_tokens": 12216592.0, "reward": 0.03531249985098839, "reward_std": 0.16736529767513275, "rewards/rollout_reward_func/mean": 0.03531249985098839, "rewards/rollout_reward_func/std": 0.2541143298149109, "sampling/importance_sampling_ratio/max": 2.9104549884796143, "sampling/importance_sampling_ratio/mean": 0.8951402902603149, "sampling/importance_sampling_ratio/min": 0.101445771753788, "sampling/sampling_logp_difference/max": 1.7230520248413086, "sampling/sampling_logp_difference/mean": 0.0640086904168129, "step": 235, "step_time": 40.341744027995446 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.016075721010565758, "clip_ratio/low_mean": 0.01953125, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.03560697101056576, "entropy": 0.25308277271687984, "epoch": 0.00472, "grad_norm": 1.467527985572815, "kl": 0.4257641229778528, "learning_rate": 9.999925926368217e-06, "loss": -0.042, "step": 236, "step_time": 7.9956631579989335 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 2712.0, "completions/max_terminated_length": 2712.0, "completions/mean_length": 2452.53125, "completions/mean_terminated_length": 2452.53125, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.2688504420220852, "epoch": 0.00474, "frac_reward_zero_std": 0.0, "grad_norm": 1.3559819459915161, "kl": 0.18561278842389584, "learning_rate": 9.999925183782528e-06, "loss": -0.0083, "num_tokens": 12318451.0, "reward": -0.0006874985992908478, "reward_std": 0.11364820599555969, "rewards/rollout_reward_func/mean": -0.0006874985992908478, "rewards/rollout_reward_func/std": 0.19639894366264343, "sampling/importance_sampling_ratio/max": 1.7619540691375732, "sampling/importance_sampling_ratio/mean": 1.017667293548584, "sampling/importance_sampling_ratio/min": 0.13964326679706573, "sampling/sampling_logp_difference/max": 0.9589061737060547, "sampling/sampling_logp_difference/mean": 0.045998990535736084, "step": 237, "step_time": 40.39693818700471 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.26928373239934444, "epoch": 0.00476, "grad_norm": 1.3246057033538818, "kl": 0.19937893375754356, "learning_rate": 9.99992443749322e-06, "loss": -0.0084, "step": 238, "step_time": 8.14953701399645 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2671.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 2486.78125, "completions/mean_terminated_length": 2486.78125, "completions/min_length": 1348.0, "completions/min_terminated_length": 1348.0, "entropy": 0.2892954871058464, "epoch": 0.00478, "frac_reward_zero_std": 0.0, "grad_norm": 1.781704306602478, "kl": 0.2630540318787098, "learning_rate": 9.99992368750029e-06, "loss": -0.0765, "num_tokens": 12421757.0, "reward": 0.01875000074505806, "reward_std": 0.18447396159172058, "rewards/rollout_reward_func/mean": 0.01875000074505806, "rewards/rollout_reward_func/std": 0.2730207145214081, "sampling/importance_sampling_ratio/max": 1.9122931957244873, "sampling/importance_sampling_ratio/mean": 1.0855708122253418, "sampling/importance_sampling_ratio/min": 0.31768321990966797, "sampling/sampling_logp_difference/max": 1.4190378189086914, "sampling/sampling_logp_difference/mean": 0.05036480724811554, "step": 239, "step_time": 41.20987309400516 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.010216346243396401, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.014122596243396401, "entropy": 0.28818782418966293, "epoch": 0.0048, "grad_norm": 1.4414920806884766, "kl": 0.2751742023974657, "learning_rate": 9.999922933803743e-06, "loss": -0.0806, "step": 240, "step_time": 8.008296779989905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2711.0, "completions/max_terminated_length": 2711.0, "completions/mean_length": 2487.875, "completions/mean_terminated_length": 2487.875, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "entropy": 0.24784068576991558, "epoch": 0.00482, "frac_reward_zero_std": 0.0, "grad_norm": 1.8702000379562378, "kl": 0.5258154030889273, "learning_rate": 9.999922176403579e-06, "loss": -0.098, "num_tokens": 12524685.0, "reward": 0.015625, "reward_std": 0.08874667435884476, "rewards/rollout_reward_func/mean": 0.015625, "rewards/rollout_reward_func/std": 0.15146461129188538, "sampling/importance_sampling_ratio/max": 2.4689791202545166, "sampling/importance_sampling_ratio/mean": 1.0352541208267212, "sampling/importance_sampling_ratio/min": 0.2149888277053833, "sampling/sampling_logp_difference/max": 1.6170353889465332, "sampling/sampling_logp_difference/mean": 0.051200881600379944, "step": 241, "step_time": 40.59959981197608 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.24527825601398945, "epoch": 0.00484, "grad_norm": 1.7831230163574219, "kl": 0.5612409617751837, "learning_rate": 9.999921415299796e-06, "loss": -0.1007, "step": 242, "step_time": 8.120403088985768 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.03125, "completions/max_length": 2669.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 2423.9375, "completions/mean_terminated_length": 2422.54833984375, "completions/min_length": 1038.0, "completions/min_terminated_length": 1038.0, "entropy": 0.27891671285033226, "epoch": 0.00486, "frac_reward_zero_std": 0.0, "grad_norm": 1.3307502269744873, "kl": 0.5300094112753868, "learning_rate": 9.9999206504924e-06, "loss": -0.1379, "num_tokens": 12626151.0, "reward": -0.0572500005364418, "reward_std": 0.22322788834571838, "rewards/rollout_reward_func/mean": -0.0572500005364418, "rewards/rollout_reward_func/std": 0.306132048368454, "sampling/importance_sampling_ratio/max": 2.0466506481170654, "sampling/importance_sampling_ratio/mean": 0.8789876699447632, "sampling/importance_sampling_ratio/min": 0.191788911819458, "sampling/sampling_logp_difference/max": 0.933627724647522, "sampling/sampling_logp_difference/mean": 0.05595602095127106, "step": 243, "step_time": 39.27026320499135 }, { "clip_ratio/high_max": 0.019644474843516946, "clip_ratio/high_mean": 0.009822237421758473, "clip_ratio/low_mean": 0.010416666744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020238904166035354, "entropy": 0.27759014815092087, "epoch": 0.00488, "grad_norm": 1.225545883178711, "kl": 0.5295771025121212, "learning_rate": 9.999919881981385e-06, "loss": -0.1401, "step": 244, "step_time": 8.500708511011908 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 2497.65625, "completions/mean_terminated_length": 2497.65625, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "entropy": 0.250592777505517, "epoch": 0.0049, "frac_reward_zero_std": 0.0, "grad_norm": 1.694082498550415, "kl": 0.4345118338242173, "learning_rate": 9.99991910976676e-06, "loss": -0.082, "num_tokens": 12729532.0, "reward": 0.0031249974854290485, "reward_std": 0.1167527362704277, "rewards/rollout_reward_func/mean": 0.0031249974854290485, "rewards/rollout_reward_func/std": 0.20330238342285156, "sampling/importance_sampling_ratio/max": 2.3724825382232666, "sampling/importance_sampling_ratio/mean": 0.9662559032440186, "sampling/importance_sampling_ratio/min": 0.043532487004995346, "sampling/sampling_logp_difference/max": 1.2320823669433594, "sampling/sampling_logp_difference/mean": 0.06254242360591888, "step": 245, "step_time": 41.82365344599384 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.013671875, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.021484375, "entropy": 0.24536587297916412, "epoch": 0.00492, "grad_norm": 1.4003206491470337, "kl": 0.4776889346539974, "learning_rate": 9.999918333848517e-06, "loss": -0.0839, "step": 246, "step_time": 8.143517907003115 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 2504.3125, "completions/mean_terminated_length": 2504.3125, "completions/min_length": 1564.0, "completions/min_terminated_length": 1564.0, "entropy": 0.23114917986094952, "epoch": 0.00494, "frac_reward_zero_std": 0.0, "grad_norm": 1.211030125617981, "kl": 0.3891689907759428, "learning_rate": 9.999917554226663e-06, "loss": -0.1096, "num_tokens": 12833664.0, "reward": 0.07406249642372131, "reward_std": 0.11376680433750153, "rewards/rollout_reward_func/mean": 0.07406249642372131, "rewards/rollout_reward_func/std": 0.2270122915506363, "sampling/importance_sampling_ratio/max": 2.0455029010772705, "sampling/importance_sampling_ratio/mean": 0.9785677194595337, "sampling/importance_sampling_ratio/min": 0.29773226380348206, "sampling/sampling_logp_difference/max": 1.1002979278564453, "sampling/sampling_logp_difference/mean": 0.04772745072841644, "step": 247, "step_time": 41.15195580800355 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.22527994960546494, "epoch": 0.00496, "grad_norm": 1.1739858388900757, "kl": 0.4161613564938307, "learning_rate": 9.999916770901197e-06, "loss": -0.1122, "step": 248, "step_time": 8.022777628990298 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2729.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 2477.5, "completions/mean_terminated_length": 2477.5, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "entropy": 0.24676493927836418, "epoch": 0.00498, "frac_reward_zero_std": 0.0, "grad_norm": 1.6627272367477417, "kl": 0.24644476640969515, "learning_rate": 9.999915983872118e-06, "loss": -0.0664, "num_tokens": 12936851.0, "reward": 0.002499997615814209, "reward_std": 0.11950606107711792, "rewards/rollout_reward_func/mean": 0.002499997615814209, "rewards/rollout_reward_func/std": 0.20879067480564117, "sampling/importance_sampling_ratio/max": 2.1652746200561523, "sampling/importance_sampling_ratio/mean": 1.0998907089233398, "sampling/importance_sampling_ratio/min": 0.40092340111732483, "sampling/sampling_logp_difference/max": 0.9734196662902832, "sampling/sampling_logp_difference/mean": 0.04127663001418114, "step": 249, "step_time": 41.39586626898381 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.24513536132872105, "epoch": 0.005, "grad_norm": 1.5830745697021484, "kl": 0.23460615891963243, "learning_rate": 9.99991519313943e-06, "loss": -0.0699, "step": 250, "step_time": 8.650949458977266 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "completions/clipped_ratio": 0.0, "completions/max_length": 2699.0, "completions/max_terminated_length": 2699.0, "completions/mean_length": 2560.125, "completions/mean_terminated_length": 2560.125, "completions/min_length": 2440.0, "completions/min_terminated_length": 2440.0, "entropy": 0.24511336907744408, "epoch": 0.00502, "frac_reward_zero_std": 0.0, "grad_norm": 1.6060280799865723, "kl": 0.24437487684190273, "learning_rate": 9.999914398703129e-06, "loss": -0.0899, "num_tokens": 13042591.0, "reward": 0.03218749910593033, "reward_std": 0.07532116770744324, "rewards/rollout_reward_func/mean": 0.03218749910593033, "rewards/rollout_reward_func/std": 0.0803048387169838, "sampling/importance_sampling_ratio/max": 2.953709363937378, "sampling/importance_sampling_ratio/mean": 1.006150245666504, "sampling/importance_sampling_ratio/min": 0.2181449681520462, "sampling/sampling_logp_difference/max": 1.0847293138504028, "sampling/sampling_logp_difference/mean": 0.05197053402662277, "step": 251, "step_time": 41.052220668010705 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.01953125, "entropy": 0.24010136537253857, "epoch": 0.00504, "grad_norm": 1.5573564767837524, "kl": 0.24994135182350874, "learning_rate": 9.99991360056322e-06, "loss": -0.0979, "step": 252, "step_time": 8.086292344996764 }, { "clip_ratio/high_max": 0.013247282709926367, "clip_ratio/high_mean": 0.006623641354963183, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010529891354963183, "completions/clipped_ratio": 0.03125, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 2552.59375, "completions/mean_terminated_length": 2551.61279296875, "completions/min_length": 2362.0, "completions/min_terminated_length": 2362.0, "entropy": 0.26262959092855453, "epoch": 0.00506, "frac_reward_zero_std": 0.0, "grad_norm": 1.5103635787963867, "kl": 0.4165810886770487, "learning_rate": 9.999912798719703e-06, "loss": -0.0666, "num_tokens": 13147490.0, "reward": 0.036249998956918716, "reward_std": 0.07718061655759811, "rewards/rollout_reward_func/mean": 0.036249998956918716, "rewards/rollout_reward_func/std": 0.07884284108877182, "sampling/importance_sampling_ratio/max": 2.046628952026367, "sampling/importance_sampling_ratio/mean": 1.0505952835083008, "sampling/importance_sampling_ratio/min": 0.12941874563694, "sampling/sampling_logp_difference/max": 1.3702301979064941, "sampling/sampling_logp_difference/mean": 0.05030977725982666, "step": 253, "step_time": 40.4351113330049 }, { "clip_ratio/high_max": 0.006623641354963183, "clip_ratio/high_mean": 0.0033118206774815917, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005264945677481592, "entropy": 0.2602510582655668, "epoch": 0.00508, "grad_norm": 1.5389606952667236, "kl": 0.46528979297727346, "learning_rate": 9.999911993172577e-06, "loss": -0.0669, "step": 254, "step_time": 8.200509453003178 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.03125, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 2551.5, "completions/mean_terminated_length": 2547.0322265625, "completions/min_length": 2381.0, "completions/min_terminated_length": 2381.0, "entropy": 0.26974271424114704, "epoch": 0.0051, "frac_reward_zero_std": 0.0, "grad_norm": 1.5606977939605713, "kl": 0.6086874194443226, "learning_rate": 9.999911183921846e-06, "loss": -0.053, "num_tokens": 13253080.0, "reward": 0.027187500149011612, "reward_std": 0.05690932646393776, "rewards/rollout_reward_func/mean": 0.027187500149011612, "rewards/rollout_reward_func/std": 0.07035551965236664, "sampling/importance_sampling_ratio/max": 1.5175119638442993, "sampling/importance_sampling_ratio/mean": 0.8066157698631287, "sampling/importance_sampling_ratio/min": 0.0783872902393341, "sampling/sampling_logp_difference/max": 1.5892443656921387, "sampling/sampling_logp_difference/mean": 0.057283081114292145, "step": 255, "step_time": 40.19062403999851 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.011124320677481592, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.01893682067748159, "entropy": 0.26865614764392376, "epoch": 0.00512, "grad_norm": 1.0609331130981445, "kl": 0.5949009079486132, "learning_rate": 9.999910370967508e-06, "loss": -0.0575, "step": 256, "step_time": 9.087063239996496 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 2566.0, "completions/mean_terminated_length": 2566.0, "completions/min_length": 2383.0, "completions/min_terminated_length": 2383.0, "entropy": 0.26759802363812923, "epoch": 0.00514, "frac_reward_zero_std": 0.0, "grad_norm": 1.5336532592773438, "kl": 0.2863121032714844, "learning_rate": 9.999909554309565e-06, "loss": 0.0539, "num_tokens": 13358742.0, "reward": 0.027187500149011612, "reward_std": 0.05224156379699707, "rewards/rollout_reward_func/mean": 0.027187500149011612, "rewards/rollout_reward_func/std": 0.052254609763622284, "sampling/importance_sampling_ratio/max": 2.2579076290130615, "sampling/importance_sampling_ratio/mean": 1.145774006843567, "sampling/importance_sampling_ratio/min": 0.29213541746139526, "sampling/sampling_logp_difference/max": 1.19516921043396, "sampling/sampling_logp_difference/mean": 0.04962974041700363, "step": 257, "step_time": 40.889848869999696 }, { "clip_ratio/high_max": 0.018342391354963183, "clip_ratio/high_mean": 0.009171195677481592, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.01698369567748159, "entropy": 0.27176199294626713, "epoch": 0.00516, "grad_norm": 1.3151822090148926, "kl": 0.2789830360561609, "learning_rate": 9.999908733948019e-06, "loss": 0.0496, "step": 258, "step_time": 8.165247033000924 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 2521.90625, "completions/mean_terminated_length": 2521.90625, "completions/min_length": 2107.0, "completions/min_terminated_length": 2107.0, "entropy": 0.23622582480311394, "epoch": 0.00518, "frac_reward_zero_std": 0.0, "grad_norm": 1.5044795274734497, "kl": 0.44032028317451477, "learning_rate": 9.999907909882866e-06, "loss": 0.0241, "num_tokens": 13463276.0, "reward": -0.002250001300126314, "reward_std": 0.08689141273498535, "rewards/rollout_reward_func/mean": -0.002250001300126314, "rewards/rollout_reward_func/std": 0.15675972402095795, "sampling/importance_sampling_ratio/max": 2.597571611404419, "sampling/importance_sampling_ratio/mean": 0.8917824029922485, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9829277992248535, "sampling/sampling_logp_difference/mean": 0.048439737409353256, "step": 259, "step_time": 40.84053165099613 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.23765388131141663, "epoch": 0.0052, "grad_norm": 1.459328293800354, "kl": 0.42800300754606724, "learning_rate": 9.999907082114113e-06, "loss": 0.023, "step": 260, "step_time": 8.04916648499784 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 2504.9375, "completions/mean_terminated_length": 2504.9375, "completions/min_length": 2209.0, "completions/min_terminated_length": 2209.0, "entropy": 0.2611371409147978, "epoch": 0.00522, "frac_reward_zero_std": 0.0, "grad_norm": 1.9348623752593994, "kl": 0.8414626121520996, "learning_rate": 9.999906250641757e-06, "loss": 0.0554, "num_tokens": 13566928.0, "reward": 0.03687499463558197, "reward_std": 0.17472779750823975, "rewards/rollout_reward_func/mean": 0.03687499463558197, "rewards/rollout_reward_func/std": 0.25037309527397156, "sampling/importance_sampling_ratio/max": 2.3028323650360107, "sampling/importance_sampling_ratio/mean": 0.8207236528396606, "sampling/importance_sampling_ratio/min": 0.11133048683404922, "sampling/sampling_logp_difference/max": 1.8047242164611816, "sampling/sampling_logp_difference/mean": 0.06321839988231659, "step": 261, "step_time": 40.17522978800116 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.017578125, "entropy": 0.266588669270277, "epoch": 0.00524, "grad_norm": 1.7961900234222412, "kl": 0.6727431304752827, "learning_rate": 9.9999054154658e-06, "loss": 0.0517, "step": 262, "step_time": 8.483183490003285 }, { "clip_ratio/high_max": 0.010529891354963183, "clip_ratio/high_mean": 0.005264945677481592, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007218070677481592, "completions/clipped_ratio": 0.03125, "completions/max_length": 2704.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 2549.53125, "completions/mean_terminated_length": 2544.54833984375, "completions/min_length": 2336.0, "completions/min_terminated_length": 2336.0, "entropy": 0.2517882902175188, "epoch": 0.00526, "frac_reward_zero_std": 0.0, "grad_norm": 2.2822296619415283, "kl": 0.27465381287038326, "learning_rate": 9.999904576586242e-06, "loss": -0.2304, "num_tokens": 13672548.0, "reward": 0.017374996095895767, "reward_std": 0.15025976300239563, "rewards/rollout_reward_func/mean": 0.017374996095895767, "rewards/rollout_reward_func/std": 0.2624521553516388, "sampling/importance_sampling_ratio/max": 2.4090371131896973, "sampling/importance_sampling_ratio/mean": 1.0326308012008667, "sampling/importance_sampling_ratio/min": 0.15842844545841217, "sampling/sampling_logp_difference/max": 1.7556763887405396, "sampling/sampling_logp_difference/mean": 0.037125516682863235, "step": 263, "step_time": 40.47858677999466 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.013077445793896914, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013077445793896914, "entropy": 0.2592135164886713, "epoch": 0.00528, "grad_norm": 1.7404944896697998, "kl": 0.2305660918354988, "learning_rate": 9.999903734003084e-06, "loss": -0.2328, "step": 264, "step_time": 8.174356406998413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 2463.1875, "completions/mean_terminated_length": 2463.1875, "completions/min_length": 1167.0, "completions/min_terminated_length": 1167.0, "entropy": 0.2684826646000147, "epoch": 0.0053, "frac_reward_zero_std": 0.125, "grad_norm": 1.1070209741592407, "kl": 0.22240010928362608, "learning_rate": 9.999902887716329e-06, "loss": -0.0723, "num_tokens": 13774961.0, "reward": 0.0507499985396862, "reward_std": 0.1717434823513031, "rewards/rollout_reward_func/mean": 0.0507499985396862, "rewards/rollout_reward_func/std": 0.29026561975479126, "sampling/importance_sampling_ratio/max": 2.1808736324310303, "sampling/importance_sampling_ratio/mean": 1.1364681720733643, "sampling/importance_sampling_ratio/min": 0.30863240361213684, "sampling/sampling_logp_difference/max": 0.7152066230773926, "sampling/sampling_logp_difference/mean": 0.039849814027547836, "step": 265, "step_time": 40.26420498000516 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.2720201928168535, "epoch": 0.00532, "grad_norm": 0.9975273609161377, "kl": 0.2078150687739253, "learning_rate": 9.999902037725978e-06, "loss": -0.0747, "step": 266, "step_time": 8.240107650999562 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 2549.78125, "completions/mean_terminated_length": 2549.78125, "completions/min_length": 2220.0, "completions/min_terminated_length": 2220.0, "entropy": 0.2624861355870962, "epoch": 0.00534, "frac_reward_zero_std": 0.125, "grad_norm": 0.9669392108917236, "kl": 0.37778835743665695, "learning_rate": 9.999901184032026e-06, "loss": 0.0082, "num_tokens": 13880362.0, "reward": 0.037312500178813934, "reward_std": 0.043149061501026154, "rewards/rollout_reward_func/mean": 0.037312500178813934, "rewards/rollout_reward_func/std": 0.0625389888882637, "sampling/importance_sampling_ratio/max": 1.9471007585525513, "sampling/importance_sampling_ratio/mean": 0.9066208004951477, "sampling/importance_sampling_ratio/min": 0.1761004775762558, "sampling/sampling_logp_difference/max": 1.6079485416412354, "sampling/sampling_logp_difference/mean": 0.04763280600309372, "step": 267, "step_time": 42.28432625199639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0033118206774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033118206774815917, "entropy": 0.2652330882847309, "epoch": 0.00536, "grad_norm": 1.0828042030334473, "kl": 0.37003960087895393, "learning_rate": 9.999900326634479e-06, "loss": 0.0064, "step": 268, "step_time": 8.174676374001137 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0033118206774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005264945677481592, "completions/clipped_ratio": 0.03125, "completions/max_length": 2634.0, "completions/max_terminated_length": 2634.0, "completions/mean_length": 2555.71875, "completions/mean_terminated_length": 2556.64501953125, "completions/min_length": 2424.0, "completions/min_terminated_length": 2424.0, "entropy": 0.3060521185398102, "epoch": 0.00538, "frac_reward_zero_std": 0.0, "grad_norm": 1.796020269393921, "kl": 0.24016336910426617, "learning_rate": 9.999899465533338e-06, "loss": -0.1678, "num_tokens": 13985568.0, "reward": 0.03531249985098839, "reward_std": 0.060174889862537384, "rewards/rollout_reward_func/mean": 0.03531249985098839, "rewards/rollout_reward_func/std": 0.06242333725094795, "sampling/importance_sampling_ratio/max": 2.0808072090148926, "sampling/importance_sampling_ratio/mean": 0.9158138632774353, "sampling/importance_sampling_ratio/min": 0.30511394143104553, "sampling/sampling_logp_difference/max": 1.0109217166900635, "sampling/sampling_logp_difference/mean": 0.03978275507688522, "step": 269, "step_time": 41.190066035007476 }, { "clip_ratio/high_max": 0.009341032709926367, "clip_ratio/high_mean": 0.004670516354963183, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010529891587793827, "entropy": 0.30568893253803253, "epoch": 0.0054, "grad_norm": 1.3609435558319092, "kl": 0.2305306103080511, "learning_rate": 9.999898600728599e-06, "loss": -0.1692, "step": 270, "step_time": 8.029273982981977 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2813.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 2546.625, "completions/mean_terminated_length": 2546.625, "completions/min_length": 2405.0, "completions/min_terminated_length": 2405.0, "entropy": 0.2608827296644449, "epoch": 0.00542, "frac_reward_zero_std": 0.0, "grad_norm": 1.5197490453720093, "kl": 0.23819277435541153, "learning_rate": 9.99989773222027e-06, "loss": -0.0306, "num_tokens": 14090773.0, "reward": 0.02968749962747097, "reward_std": 0.03806859254837036, "rewards/rollout_reward_func/mean": 0.02968749962747097, "rewards/rollout_reward_func/std": 0.04582465440034866, "sampling/importance_sampling_ratio/max": 1.835666298866272, "sampling/importance_sampling_ratio/mean": 1.0339916944503784, "sampling/importance_sampling_ratio/min": 0.5572738647460938, "sampling/sampling_logp_difference/max": 0.5970277786254883, "sampling/sampling_logp_difference/mean": 0.03422776237130165, "step": 271, "step_time": 41.47874712299381 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.015625, "entropy": 0.25844705663621426, "epoch": 0.00544, "grad_norm": 1.385913372039795, "kl": 0.2487698830664158, "learning_rate": 9.999896860008346e-06, "loss": -0.0337, "step": 272, "step_time": 9.234016997012077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2687.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 2528.03125, "completions/mean_terminated_length": 2528.03125, "completions/min_length": 1631.0, "completions/min_terminated_length": 1631.0, "entropy": 0.2662625387310982, "epoch": 0.00546, "frac_reward_zero_std": 0.0, "grad_norm": 1.4977853298187256, "kl": 0.3419014122337103, "learning_rate": 9.999895984092831e-06, "loss": 0.0226, "num_tokens": 14195896.0, "reward": 0.055937498807907104, "reward_std": 0.11488151550292969, "rewards/rollout_reward_func/mean": 0.055937498807907104, "rewards/rollout_reward_func/std": 0.19372175633907318, "sampling/importance_sampling_ratio/max": 2.1736485958099365, "sampling/importance_sampling_ratio/mean": 0.8548949956893921, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2168569564819336, "sampling/sampling_logp_difference/mean": 0.05159320309758186, "step": 273, "step_time": 40.65139246901526 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.26714046485722065, "epoch": 0.00548, "grad_norm": 1.2127920389175415, "kl": 0.3403823059052229, "learning_rate": 9.999895104473725e-06, "loss": 0.0179, "step": 274, "step_time": 8.083952489992953 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2748.0, "completions/max_terminated_length": 2748.0, "completions/mean_length": 2558.0, "completions/mean_terminated_length": 2558.0, "completions/min_length": 2395.0, "completions/min_terminated_length": 2395.0, "entropy": 0.25967661291360855, "epoch": 0.0055, "frac_reward_zero_std": 0.0, "grad_norm": 1.3464138507843018, "kl": 0.2503334507346153, "learning_rate": 9.99989422115103e-06, "loss": -0.0259, "num_tokens": 14300978.0, "reward": 0.0003124997019767761, "reward_std": 0.05055317282676697, "rewards/rollout_reward_func/mean": 0.0003124997019767761, "rewards/rollout_reward_func/std": 0.052883580327034, "sampling/importance_sampling_ratio/max": 1.650503158569336, "sampling/importance_sampling_ratio/mean": 1.0017483234405518, "sampling/importance_sampling_ratio/min": 0.39458227157592773, "sampling/sampling_logp_difference/max": 0.954500675201416, "sampling/sampling_logp_difference/mean": 0.03852980583906174, "step": 275, "step_time": 40.93569000100979 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.009765625, "entropy": 0.259623683989048, "epoch": 0.00552, "grad_norm": 1.112290859222412, "kl": 0.2553240805864334, "learning_rate": 9.999893334124745e-06, "loss": -0.0344, "step": 276, "step_time": 8.276769702002639 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2681.0, "completions/max_terminated_length": 2681.0, "completions/mean_length": 2450.15625, "completions/mean_terminated_length": 2450.15625, "completions/min_length": 1182.0, "completions/min_terminated_length": 1182.0, "entropy": 0.2840597964823246, "epoch": 0.00554, "frac_reward_zero_std": 0.0, "grad_norm": 2.009488105773926, "kl": 0.8314682450145483, "learning_rate": 9.99989244339487e-06, "loss": -0.0156, "num_tokens": 14402590.0, "reward": -0.04656250402331352, "reward_std": 0.17522019147872925, "rewards/rollout_reward_func/mean": -0.04656250402331352, "rewards/rollout_reward_func/std": 0.2510700821876526, "sampling/importance_sampling_ratio/max": 2.424591302871704, "sampling/importance_sampling_ratio/mean": 1.0289294719696045, "sampling/importance_sampling_ratio/min": 0.3011532127857208, "sampling/sampling_logp_difference/max": 0.9531664848327637, "sampling/sampling_logp_difference/mean": 0.04903682321310043, "step": 277, "step_time": 38.54316767599812 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "entropy": 0.289629552513361, "epoch": 0.00556, "grad_norm": 1.7072014808654785, "kl": 0.6213064789772034, "learning_rate": 9.999891548961409e-06, "loss": -0.0216, "step": 278, "step_time": 8.533860239993373 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.013671875, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017578125, "completions/clipped_ratio": 0.0, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 2451.25, "completions/mean_terminated_length": 2451.25, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "entropy": 0.27815776132047176, "epoch": 0.00558, "frac_reward_zero_std": 0.0, "grad_norm": 1.5860378742218018, "kl": 0.4317518621683121, "learning_rate": 9.999890650824362e-06, "loss": -0.1737, "num_tokens": 14504584.0, "reward": -0.03437500074505806, "reward_std": 0.18133552372455597, "rewards/rollout_reward_func/mean": -0.03437500074505806, "rewards/rollout_reward_func/std": 0.2963099777698517, "sampling/importance_sampling_ratio/max": 2.2448441982269287, "sampling/importance_sampling_ratio/mean": 0.9443175792694092, "sampling/importance_sampling_ratio/min": 0.33127740025520325, "sampling/sampling_logp_difference/max": 1.2472968101501465, "sampling/sampling_logp_difference/mean": 0.04774777963757515, "step": 279, "step_time": 40.06899703200179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0047940341755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0047940341755747795, "entropy": 0.28123702481389046, "epoch": 0.0056, "grad_norm": 1.623889684677124, "kl": 0.3846174022182822, "learning_rate": 9.999889748983727e-06, "loss": -0.1747, "step": 280, "step_time": 8.234413726997445 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 2562.5, "completions/mean_terminated_length": 2562.5, "completions/min_length": 2294.0, "completions/min_terminated_length": 2294.0, "entropy": 0.28916312009096146, "epoch": 0.00562, "frac_reward_zero_std": 0.0, "grad_norm": 1.611228108406067, "kl": 0.36890734918415546, "learning_rate": 9.999888843439508e-06, "loss": -0.0793, "num_tokens": 14610050.0, "reward": 0.07218749821186066, "reward_std": 0.12699973583221436, "rewards/rollout_reward_func/mean": 0.07218749821186066, "rewards/rollout_reward_func/std": 0.22096173465251923, "sampling/importance_sampling_ratio/max": 2.3326196670532227, "sampling/importance_sampling_ratio/mean": 1.0808019638061523, "sampling/importance_sampling_ratio/min": 0.31041672825813293, "sampling/sampling_logp_difference/max": 0.9732460975646973, "sampling/sampling_logp_difference/mean": 0.04236404597759247, "step": 281, "step_time": 39.99579230199015 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.288146510720253, "epoch": 0.00564, "grad_norm": 1.490877628326416, "kl": 0.3540782444179058, "learning_rate": 9.999887934191706e-06, "loss": -0.0836, "step": 282, "step_time": 8.19389802900696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2688.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 2480.78125, "completions/mean_terminated_length": 2480.78125, "completions/min_length": 1477.0, "completions/min_terminated_length": 1477.0, "entropy": 0.285388445481658, "epoch": 0.00566, "frac_reward_zero_std": 0.0, "grad_norm": 1.3937627077102661, "kl": 0.27888352051377296, "learning_rate": 9.99988702124032e-06, "loss": -0.0913, "num_tokens": 14713203.0, "reward": -0.02093750238418579, "reward_std": 0.17884045839309692, "rewards/rollout_reward_func/mean": -0.02093750238418579, "rewards/rollout_reward_func/std": 0.2822188436985016, "sampling/importance_sampling_ratio/max": 1.673972725868225, "sampling/importance_sampling_ratio/mean": 0.9629006385803223, "sampling/importance_sampling_ratio/min": 0.18777649104595184, "sampling/sampling_logp_difference/max": 1.1442310810089111, "sampling/sampling_logp_difference/mean": 0.04829523712396622, "step": 283, "step_time": 40.134448617005546 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.28673238307237625, "epoch": 0.00568, "grad_norm": 1.3932827711105347, "kl": 0.26963402703404427, "learning_rate": 9.99988610458535e-06, "loss": -0.0945, "step": 284, "step_time": 9.01298976700491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2680.0, "completions/max_terminated_length": 2680.0, "completions/mean_length": 2563.0625, "completions/mean_terminated_length": 2563.0625, "completions/min_length": 2370.0, "completions/min_terminated_length": 2370.0, "entropy": 0.2974665407091379, "epoch": 0.0057, "frac_reward_zero_std": 0.0, "grad_norm": 1.531980276107788, "kl": 0.2993941828608513, "learning_rate": 9.999885184226803e-06, "loss": -0.0469, "num_tokens": 14818454.0, "reward": 0.046937502920627594, "reward_std": 0.06454025954008102, "rewards/rollout_reward_func/mean": 0.046937502920627594, "rewards/rollout_reward_func/std": 0.07435742765665054, "sampling/importance_sampling_ratio/max": 1.6543657779693604, "sampling/importance_sampling_ratio/mean": 0.9675144553184509, "sampling/importance_sampling_ratio/min": 0.44254976511001587, "sampling/sampling_logp_difference/max": 0.790473461151123, "sampling/sampling_logp_difference/mean": 0.05086861550807953, "step": 285, "step_time": 40.94515013400087 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.2969541698694229, "epoch": 0.00572, "grad_norm": 1.4943524599075317, "kl": 0.2985868602991104, "learning_rate": 9.999884260164672e-06, "loss": -0.0504, "step": 286, "step_time": 8.124935107989586 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0625, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 2444.1875, "completions/mean_terminated_length": 2441.60009765625, "completions/min_length": 1153.0, "completions/min_terminated_length": 1153.0, "entropy": 0.38077646121382713, "epoch": 0.00574, "frac_reward_zero_std": 0.0, "grad_norm": 2.0460994243621826, "kl": 0.5203682221472263, "learning_rate": 9.999883332398963e-06, "loss": -0.2072, "num_tokens": 14919885.0, "reward": -0.10437499731779099, "reward_std": 0.21541458368301392, "rewards/rollout_reward_func/mean": -0.10437499731779099, "rewards/rollout_reward_func/std": 0.38278213143348694, "sampling/importance_sampling_ratio/max": 2.872018575668335, "sampling/importance_sampling_ratio/mean": 0.9405064582824707, "sampling/importance_sampling_ratio/min": 1.4718626317744565e-10, "sampling/sampling_logp_difference/max": 16.81309700012207, "sampling/sampling_logp_difference/mean": 0.09047360718250275, "step": 287, "step_time": 38.90498008499708 }, { "clip_ratio/high_max": 0.01678685937076807, "clip_ratio/high_mean": 0.008393429685384035, "clip_ratio/low_mean": 0.005085495300590992, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013478924985975027, "entropy": 0.3831331916153431, "epoch": 0.00576, "grad_norm": 1.58086359500885, "kl": 0.4781217612326145, "learning_rate": 9.999882400929674e-06, "loss": -0.2108, "step": 288, "step_time": 8.091997119998268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0072180707938969135, "clip_ratio/low_min": 0.0027173913549631834, "clip_ratio/region_mean": 0.0072180707938969135, "completions/clipped_ratio": 0.0, "completions/max_length": 2729.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 2518.25, "completions/mean_terminated_length": 2518.25, "completions/min_length": 1788.0, "completions/min_terminated_length": 1788.0, "entropy": 0.3387117236852646, "epoch": 0.00578, "frac_reward_zero_std": 0.0, "grad_norm": 1.3513823747634888, "kl": 0.9334820155054331, "learning_rate": 9.99988146575681e-06, "loss": -0.0198, "num_tokens": 15024124.0, "reward": 0.01056249625980854, "reward_std": 0.18217787146568298, "rewards/rollout_reward_func/mean": 0.01056249625980854, "rewards/rollout_reward_func/std": 0.29733407497406006, "sampling/importance_sampling_ratio/max": 1.9554623365402222, "sampling/importance_sampling_ratio/mean": 0.7435547113418579, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4795942306518555, "sampling/sampling_logp_difference/mean": 0.057336658239364624, "step": 289, "step_time": 39.99209722400701 }, { "clip_ratio/high_max": 0.017574606114067137, "clip_ratio/high_mean": 0.008787303057033569, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01014599873451516, "entropy": 0.3432500846683979, "epoch": 0.0058, "grad_norm": 1.1709140539169312, "kl": 0.8469271510839462, "learning_rate": 9.999880526880366e-06, "loss": -0.0209, "step": 290, "step_time": 8.202411102996848 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.006023503257893026, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009929753257893026, "completions/clipped_ratio": 0.0, "completions/max_length": 2665.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 2454.25, "completions/mean_terminated_length": 2454.25, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "entropy": 0.30111620761454105, "epoch": 0.00582, "frac_reward_zero_std": 0.125, "grad_norm": 1.09559166431427, "kl": 0.43403775803744793, "learning_rate": 9.999879584300349e-06, "loss": 0.035, "num_tokens": 15125746.0, "reward": 0.028999999165534973, "reward_std": 0.17057111859321594, "rewards/rollout_reward_func/mean": 0.028999999165534973, "rewards/rollout_reward_func/std": 0.28665322065353394, "sampling/importance_sampling_ratio/max": 1.8856747150421143, "sampling/importance_sampling_ratio/mean": 1.0690852403640747, "sampling/importance_sampling_ratio/min": 0.49591487646102905, "sampling/sampling_logp_difference/max": 1.3662643432617188, "sampling/sampling_logp_difference/mean": 0.047344714403152466, "step": 291, "step_time": 39.35214727100538 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3075068034231663, "epoch": 0.00584, "grad_norm": 1.0851396322250366, "kl": 0.39455585554242134, "learning_rate": 9.999878638016756e-06, "loss": 0.0324, "step": 292, "step_time": 8.023776346992236 }, { "clip_ratio/high_max": 0.0037878789007663727, "clip_ratio/high_mean": 0.0018939394503831863, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037321747513487935, "completions/clipped_ratio": 0.0, "completions/max_length": 2712.0, "completions/max_terminated_length": 2712.0, "completions/mean_length": 2457.25, "completions/mean_terminated_length": 2457.25, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "entropy": 0.2864312566816807, "epoch": 0.00586, "frac_reward_zero_std": 0.0, "grad_norm": 1.0765045881271362, "kl": 0.3095027208328247, "learning_rate": 9.99987768802959e-06, "loss": -0.0325, "num_tokens": 15228123.0, "reward": 0.029375001788139343, "reward_std": 0.31879091262817383, "rewards/rollout_reward_func/mean": 0.029375001788139343, "rewards/rollout_reward_func/std": 0.41879963874816895, "sampling/importance_sampling_ratio/max": 1.6113137006759644, "sampling/importance_sampling_ratio/mean": 0.8610400557518005, "sampling/importance_sampling_ratio/min": 0.25993117690086365, "sampling/sampling_logp_difference/max": 0.8391194343566895, "sampling/sampling_logp_difference/mean": 0.04624336585402489, "step": 293, "step_time": 38.63772508600232 }, { "clip_ratio/high_max": 0.015885416883975267, "clip_ratio/high_mean": 0.007942708441987634, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011848958441987634, "entropy": 0.2931174673140049, "epoch": 0.00588, "grad_norm": 0.8959990739822388, "kl": 0.2937803156673908, "learning_rate": 9.99987673433885e-06, "loss": -0.0353, "step": 294, "step_time": 8.147136625986604 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2764.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 2505.09375, "completions/mean_terminated_length": 2505.09375, "completions/min_length": 1295.0, "completions/min_terminated_length": 1295.0, "entropy": 0.3480229079723358, "epoch": 0.0059, "frac_reward_zero_std": 0.0, "grad_norm": 1.3411896228790283, "kl": 0.24798605777323246, "learning_rate": 9.999875776944539e-06, "loss": 0.0081, "num_tokens": 15332692.0, "reward": -0.0015624996740370989, "reward_std": 0.07828617095947266, "rewards/rollout_reward_func/mean": -0.0015624996740370989, "rewards/rollout_reward_func/std": 0.0892300084233284, "sampling/importance_sampling_ratio/max": 1.658194899559021, "sampling/importance_sampling_ratio/mean": 0.9416136741638184, "sampling/importance_sampling_ratio/min": 0.3543452024459839, "sampling/sampling_logp_difference/max": 0.45158815383911133, "sampling/sampling_logp_difference/mean": 0.042671412229537964, "step": 295, "step_time": 41.45644505799282 }, { "clip_ratio/high_max": 0.015395220601931214, "clip_ratio/high_mean": 0.007697610300965607, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007697610300965607, "entropy": 0.35514455288648605, "epoch": 0.00592, "grad_norm": 1.2621599435806274, "kl": 0.23556838184595108, "learning_rate": 9.999874815846656e-06, "loss": 0.0039, "step": 296, "step_time": 8.289278871998249 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2792.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 2558.21875, "completions/mean_terminated_length": 2558.21875, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "entropy": 0.331636942923069, "epoch": 0.00594, "frac_reward_zero_std": 0.0, "grad_norm": 1.0715786218643188, "kl": 0.319051219150424, "learning_rate": 9.999873851045202e-06, "loss": -0.0962, "num_tokens": 15438054.0, "reward": -0.015625, "reward_std": 0.12047843635082245, "rewards/rollout_reward_func/mean": -0.015625, "rewards/rollout_reward_func/std": 0.1917649358510971, "sampling/importance_sampling_ratio/max": 1.5377081632614136, "sampling/importance_sampling_ratio/mean": 0.8829039335250854, "sampling/importance_sampling_ratio/min": 0.27864915132522583, "sampling/sampling_logp_difference/max": 0.9243254661560059, "sampling/sampling_logp_difference/mean": 0.05171851068735123, "step": 297, "step_time": 40.14633480698831 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.005800189450383186, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017518939450383186, "entropy": 0.3420854024589062, "epoch": 0.00596, "grad_norm": 0.9368278384208679, "kl": 0.321561723947525, "learning_rate": 9.999872882540181e-06, "loss": -0.1013, "step": 298, "step_time": 8.38549888000125 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0033118206774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005264945677481592, "completions/clipped_ratio": 0.0, "completions/max_length": 2659.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 2429.96875, "completions/mean_terminated_length": 2429.96875, "completions/min_length": 1326.0, "completions/min_terminated_length": 1326.0, "entropy": 0.3992190286517143, "epoch": 0.00598, "frac_reward_zero_std": 0.0, "grad_norm": 2.0931684970855713, "kl": 0.2717692907899618, "learning_rate": 9.999871910331592e-06, "loss": 0.0356, "num_tokens": 15539177.0, "reward": 0.006874997168779373, "reward_std": 0.13145233690738678, "rewards/rollout_reward_func/mean": 0.006874997168779373, "rewards/rollout_reward_func/std": 0.227219358086586, "sampling/importance_sampling_ratio/max": 1.9978420734405518, "sampling/importance_sampling_ratio/mean": 1.1109166145324707, "sampling/importance_sampling_ratio/min": 0.28789734840393066, "sampling/sampling_logp_difference/max": 1.2725002765655518, "sampling/sampling_logp_difference/mean": 0.04605109244585037, "step": 299, "step_time": 39.261059892996855 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.004185267956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010044642956927419, "entropy": 0.4111786112189293, "epoch": 0.006, "grad_norm": 1.949076771736145, "kl": 0.263056967407465, "learning_rate": 9.999870934419434e-06, "loss": 0.0301, "step": 300, "step_time": 8.51909788800549 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 2695.0, "completions/max_terminated_length": 2695.0, "completions/mean_length": 2472.09375, "completions/mean_terminated_length": 2472.09375, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "entropy": 0.3823564685881138, "epoch": 0.00602, "frac_reward_zero_std": 0.0, "grad_norm": 0.9711835980415344, "kl": 0.24662657268345356, "learning_rate": 9.999869954803708e-06, "loss": 0.0073, "num_tokens": 15642001.0, "reward": -0.02225000225007534, "reward_std": 0.2383129745721817, "rewards/rollout_reward_func/mean": -0.02225000225007534, "rewards/rollout_reward_func/std": 0.34792760014533997, "sampling/importance_sampling_ratio/max": 2.2050437927246094, "sampling/importance_sampling_ratio/mean": 0.9682941436767578, "sampling/importance_sampling_ratio/min": 0.21761596202850342, "sampling/sampling_logp_difference/max": 0.6958246231079102, "sampling/sampling_logp_difference/mean": 0.04852741211652756, "step": 301, "step_time": 39.94850182400842 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.3918728120625019, "epoch": 0.00604, "grad_norm": 1.0577794313430786, "kl": 0.24203552678227425, "learning_rate": 9.999868971484418e-06, "loss": 0.0044, "step": 302, "step_time": 8.059815880988026 }, { "clip_ratio/high_max": 0.010937500046566129, "clip_ratio/high_mean": 0.005468750023283064, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007421875023283064, "completions/clipped_ratio": 0.03125, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 2496.375, "completions/mean_terminated_length": 2498.806396484375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.4163655452430248, "epoch": 0.00606, "frac_reward_zero_std": 0.0, "grad_norm": 1.3073053359985352, "kl": 0.26006307639181614, "learning_rate": 9.999867984461565e-06, "loss": -0.0813, "num_tokens": 15745188.0, "reward": -0.010625004768371582, "reward_std": 0.12208487838506699, "rewards/rollout_reward_func/mean": -0.010625004768371582, "rewards/rollout_reward_func/std": 0.20892871916294098, "sampling/importance_sampling_ratio/max": 2.2923381328582764, "sampling/importance_sampling_ratio/mean": 1.1277439594268799, "sampling/importance_sampling_ratio/min": 0.47326311469078064, "sampling/sampling_logp_difference/max": 0.8685226440429688, "sampling/sampling_logp_difference/mean": 0.044995859265327454, "step": 303, "step_time": 40.56155824400048 }, { "clip_ratio/high_max": 0.013654891401529312, "clip_ratio/high_mean": 0.006827445700764656, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006827445700764656, "entropy": 0.42382679879665375, "epoch": 0.00608, "grad_norm": 1.2154861688613892, "kl": 0.26275861263275146, "learning_rate": 9.999866993735148e-06, "loss": -0.0856, "step": 304, "step_time": 8.188002009002957 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2720.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 2543.78125, "completions/mean_terminated_length": 2543.78125, "completions/min_length": 1320.0, "completions/min_terminated_length": 1320.0, "entropy": 0.4170003570616245, "epoch": 0.0061, "frac_reward_zero_std": 0.0, "grad_norm": 0.9974532723426819, "kl": 0.4499119780957699, "learning_rate": 9.99986599930517e-06, "loss": -0.0162, "num_tokens": 15849934.0, "reward": 0.06724999845027924, "reward_std": 0.10333256423473358, "rewards/rollout_reward_func/mean": 0.06724999845027924, "rewards/rollout_reward_func/std": 0.19078291952610016, "sampling/importance_sampling_ratio/max": 1.4692877531051636, "sampling/importance_sampling_ratio/mean": 0.8283271789550781, "sampling/importance_sampling_ratio/min": 0.20920588076114655, "sampling/sampling_logp_difference/max": 1.1342777013778687, "sampling/sampling_logp_difference/mean": 0.0569009929895401, "step": 305, "step_time": 40.22573580900644 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.42669572681188583, "epoch": 0.00612, "grad_norm": 0.966021716594696, "kl": 0.4435073509812355, "learning_rate": 9.999865001171628e-06, "loss": -0.0204, "step": 306, "step_time": 9.38175861498894 }, { "clip_ratio/high_max": 0.006623641354963183, "clip_ratio/high_mean": 0.0033118206774815917, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033118206774815917, "completions/clipped_ratio": 0.0, "completions/max_length": 2688.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 2498.71875, "completions/mean_terminated_length": 2498.71875, "completions/min_length": 1621.0, "completions/min_terminated_length": 1621.0, "entropy": 0.4614461474120617, "epoch": 0.00614, "frac_reward_zero_std": 0.0, "grad_norm": 1.7010865211486816, "kl": 0.26754089444875717, "learning_rate": 9.999863999334527e-06, "loss": -0.0394, "num_tokens": 15953702.0, "reward": -0.018437497317790985, "reward_std": 0.13519792258739471, "rewards/rollout_reward_func/mean": -0.018437497317790985, "rewards/rollout_reward_func/std": 0.22020678222179413, "sampling/importance_sampling_ratio/max": 1.7836391925811768, "sampling/importance_sampling_ratio/mean": 1.0035184621810913, "sampling/importance_sampling_ratio/min": 0.5491339564323425, "sampling/sampling_logp_difference/max": 0.7282247543334961, "sampling/sampling_logp_difference/mean": 0.037387143820524216, "step": 307, "step_time": 39.49807976799639 }, { "clip_ratio/high_max": 0.006623641354963183, "clip_ratio/high_mean": 0.0033118206774815917, "clip_ratio/low_mean": 0.0033735795877873898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006685400381684303, "entropy": 0.46664348989725113, "epoch": 0.00616, "grad_norm": 1.189290165901184, "kl": 0.26736610010266304, "learning_rate": 9.999862993793865e-06, "loss": -0.0438, "step": 308, "step_time": 8.11864329600212 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 2704.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 2507.71875, "completions/mean_terminated_length": 2507.71875, "completions/min_length": 1357.0, "completions/min_terminated_length": 1357.0, "entropy": 0.45517005771398544, "epoch": 0.00618, "frac_reward_zero_std": 0.0, "grad_norm": 1.906209111213684, "kl": 1.1019243709743023, "learning_rate": 9.999861984549646e-06, "loss": 0.0762, "num_tokens": 16057213.0, "reward": 0.02031249925494194, "reward_std": 0.07454593479633331, "rewards/rollout_reward_func/mean": 0.02031249925494194, "rewards/rollout_reward_func/std": 0.08433743566274643, "sampling/importance_sampling_ratio/max": 2.71134352684021, "sampling/importance_sampling_ratio/mean": 1.1158506870269775, "sampling/importance_sampling_ratio/min": 0.40567341446876526, "sampling/sampling_logp_difference/max": 0.7820019721984863, "sampling/sampling_logp_difference/mean": 0.05114158242940903, "step": 309, "step_time": 39.111676544001966 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.45416729897260666, "epoch": 0.0062, "grad_norm": 1.8273950815200806, "kl": 0.976007841527462, "learning_rate": 9.99986097160187e-06, "loss": 0.0695, "step": 310, "step_time": 8.081985718003125 }, { "clip_ratio/high_max": 0.0033783784601837397, "clip_ratio/high_mean": 0.0016891892300918698, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00364231423009187, "completions/clipped_ratio": 0.0, "completions/max_length": 2718.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 2470.25, "completions/mean_terminated_length": 2470.25, "completions/min_length": 1356.0, "completions/min_terminated_length": 1356.0, "entropy": 0.47122444957494736, "epoch": 0.00622, "frac_reward_zero_std": 0.0, "grad_norm": 2.186075210571289, "kl": 0.27417293563485146, "learning_rate": 9.999859954950535e-06, "loss": -0.0863, "num_tokens": 16159452.0, "reward": 0.0234375, "reward_std": 0.0810212641954422, "rewards/rollout_reward_func/mean": 0.0234375, "rewards/rollout_reward_func/std": 0.09953470528125763, "sampling/importance_sampling_ratio/max": 1.6803293228149414, "sampling/importance_sampling_ratio/mean": 0.907636821269989, "sampling/importance_sampling_ratio/min": 0.40213456749916077, "sampling/sampling_logp_difference/max": 0.9505100250244141, "sampling/sampling_logp_difference/mean": 0.048430029302835464, "step": 311, "step_time": 38.429599290000624 }, { "clip_ratio/high_max": 0.006756756920367479, "clip_ratio/high_mean": 0.0033783784601837397, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00728462846018374, "entropy": 0.4736868105828762, "epoch": 0.00624, "grad_norm": 1.6169250011444092, "kl": 0.2743623908609152, "learning_rate": 9.999858934595648e-06, "loss": -0.0915, "step": 312, "step_time": 9.370438503989135 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2758.0, "completions/max_terminated_length": 2758.0, "completions/mean_length": 2513.5625, "completions/mean_terminated_length": 2513.5625, "completions/min_length": 1564.0, "completions/min_terminated_length": 1564.0, "entropy": 0.4444139339029789, "epoch": 0.00626, "frac_reward_zero_std": 0.0, "grad_norm": 0.9361081719398499, "kl": 0.29342188127338886, "learning_rate": 9.999857910537204e-06, "loss": -0.0681, "num_tokens": 16263096.0, "reward": 0.0018750019371509552, "reward_std": 0.11465057730674744, "rewards/rollout_reward_func/mean": 0.0018750019371509552, "rewards/rollout_reward_func/std": 0.18235798180103302, "sampling/importance_sampling_ratio/max": 1.9316868782043457, "sampling/importance_sampling_ratio/mean": 0.9139528274536133, "sampling/importance_sampling_ratio/min": 0.3404541313648224, "sampling/sampling_logp_difference/max": 0.85491943359375, "sampling/sampling_logp_difference/mean": 0.04442521184682846, "step": 313, "step_time": 40.23155441199924 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.44232723861932755, "epoch": 0.00628, "grad_norm": 0.871970534324646, "kl": 0.30031659826636314, "learning_rate": 9.999856882775207e-06, "loss": -0.0709, "step": 314, "step_time": 8.274247516004834 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 2775.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 2577.53125, "completions/mean_terminated_length": 2577.53125, "completions/min_length": 2379.0, "completions/min_terminated_length": 2379.0, "entropy": 0.45178864896297455, "epoch": 0.0063, "frac_reward_zero_std": 0.0, "grad_norm": 1.0735334157943726, "kl": 0.28461507335305214, "learning_rate": 9.999855851309658e-06, "loss": 0.0263, "num_tokens": 16369024.0, "reward": 0.028999999165534973, "reward_std": 0.05209977924823761, "rewards/rollout_reward_func/mean": 0.028999999165534973, "rewards/rollout_reward_func/std": 0.0565662607550621, "sampling/importance_sampling_ratio/max": 1.7128064632415771, "sampling/importance_sampling_ratio/mean": 1.0174338817596436, "sampling/importance_sampling_ratio/min": 0.26667365431785583, "sampling/sampling_logp_difference/max": 0.8575940132141113, "sampling/sampling_logp_difference/mean": 0.04983839392662048, "step": 315, "step_time": 42.039271956993616 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.4506670571863651, "epoch": 0.00632, "grad_norm": 1.147428274154663, "kl": 0.28853642009198666, "learning_rate": 9.999854816140558e-06, "loss": 0.0242, "step": 316, "step_time": 8.311607102994458 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 2640.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 2452.21875, "completions/mean_terminated_length": 2452.21875, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "entropy": 0.43626048415899277, "epoch": 0.00634, "frac_reward_zero_std": 0.0, "grad_norm": 1.334524393081665, "kl": 0.26437416300177574, "learning_rate": 9.999853777267907e-06, "loss": -0.0384, "num_tokens": 16470702.0, "reward": 0.012187500484287739, "reward_std": 0.11599558591842651, "rewards/rollout_reward_func/mean": 0.012187500484287739, "rewards/rollout_reward_func/std": 0.19708740711212158, "sampling/importance_sampling_ratio/max": 1.745671033859253, "sampling/importance_sampling_ratio/mean": 1.029480218887329, "sampling/importance_sampling_ratio/min": 0.3974767029285431, "sampling/sampling_logp_difference/max": 0.8084192276000977, "sampling/sampling_logp_difference/mean": 0.046191953122615814, "step": 317, "step_time": 40.42614766400948 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004356971010565758, "entropy": 0.4329594671726227, "epoch": 0.00636, "grad_norm": 1.2568414211273193, "kl": 0.2660627197474241, "learning_rate": 9.999852734691707e-06, "loss": -0.0389, "step": 318, "step_time": 8.729695830006676 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.03125, "completions/max_length": 2713.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 2463.28125, "completions/mean_terminated_length": 2459.48388671875, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "entropy": 0.4093780815601349, "epoch": 0.00638, "frac_reward_zero_std": 0.0, "grad_norm": 0.9679552316665649, "kl": 0.25920648127794266, "learning_rate": 9.999851688411959e-06, "loss": -0.1556, "num_tokens": 16573266.0, "reward": -0.044062502682209015, "reward_std": 0.18410122394561768, "rewards/rollout_reward_func/mean": -0.044062502682209015, "rewards/rollout_reward_func/std": 0.27064865827560425, "sampling/importance_sampling_ratio/max": 1.572900414466858, "sampling/importance_sampling_ratio/mean": 0.9186617136001587, "sampling/importance_sampling_ratio/min": 0.25353381037712097, "sampling/sampling_logp_difference/max": 0.6579174995422363, "sampling/sampling_logp_difference/mean": 0.040446434170007706, "step": 319, "step_time": 38.13957930600009 }, { "clip_ratio/high_max": 0.006623641354963183, "clip_ratio/high_mean": 0.0033118206774815917, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033118206774815917, "entropy": 0.40444113314151764, "epoch": 0.0064, "grad_norm": 0.9774654507637024, "kl": 0.2562381671741605, "learning_rate": 9.999850638428662e-06, "loss": -0.158, "step": 320, "step_time": 8.146959678997518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 3095.0, "completions/max_terminated_length": 3095.0, "completions/mean_length": 2866.9375, "completions/mean_terminated_length": 2866.9375, "completions/min_length": 1676.0, "completions/min_terminated_length": 1676.0, "entropy": 0.40554892271757126, "epoch": 0.00642, "frac_reward_zero_std": 0.125, "grad_norm": 1.0908348560333252, "kl": 0.2836337350308895, "learning_rate": 9.99984958474182e-06, "loss": -0.1248, "num_tokens": 16688566.0, "reward": -0.08843749761581421, "reward_std": 0.16875624656677246, "rewards/rollout_reward_func/mean": -0.08843749761581421, "rewards/rollout_reward_func/std": 0.2703416049480438, "sampling/importance_sampling_ratio/max": 2.3541653156280518, "sampling/importance_sampling_ratio/mean": 0.9503089189529419, "sampling/importance_sampling_ratio/min": 0.2944200336933136, "sampling/sampling_logp_difference/max": 0.7107000350952148, "sampling/sampling_logp_difference/mean": 0.045195046812295914, "step": 321, "step_time": 41.59577051900851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005642361124046147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005642361124046147, "entropy": 0.4032099135220051, "epoch": 0.00644, "grad_norm": 0.9083360433578491, "kl": 0.2763382289558649, "learning_rate": 9.999848527351434e-06, "loss": -0.1272, "step": 322, "step_time": 9.127111081004841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3129.0, "completions/max_terminated_length": 3129.0, "completions/mean_length": 2868.71875, "completions/mean_terminated_length": 2868.71875, "completions/min_length": 2418.0, "completions/min_terminated_length": 2418.0, "entropy": 0.4014575108885765, "epoch": 0.00646, "frac_reward_zero_std": 0.125, "grad_norm": 1.3745849132537842, "kl": 0.3217862006276846, "learning_rate": 9.999847466257501e-06, "loss": -0.0052, "num_tokens": 16803971.0, "reward": 0.03343750163912773, "reward_std": 0.08887212723493576, "rewards/rollout_reward_func/mean": 0.03343750163912773, "rewards/rollout_reward_func/std": 0.18239639699459076, "sampling/importance_sampling_ratio/max": 2.078562021255493, "sampling/importance_sampling_ratio/mean": 1.028127670288086, "sampling/importance_sampling_ratio/min": 0.30963262915611267, "sampling/sampling_logp_difference/max": 0.856985330581665, "sampling/sampling_logp_difference/mean": 0.04022815078496933, "step": 323, "step_time": 42.87567100799788 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.4006517715752125, "epoch": 0.00648, "grad_norm": 1.3022223711013794, "kl": 0.32581575214862823, "learning_rate": 9.999846401460027e-06, "loss": -0.01, "step": 324, "step_time": 9.157112452005094 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 3087.0, "completions/max_terminated_length": 3087.0, "completions/mean_length": 2870.625, "completions/mean_terminated_length": 2870.625, "completions/min_length": 1981.0, "completions/min_terminated_length": 1981.0, "entropy": 0.39831674844026566, "epoch": 0.0065, "frac_reward_zero_std": 0.125, "grad_norm": 1.0668267011642456, "kl": 0.2798405773937702, "learning_rate": 9.999845332959009e-06, "loss": -0.1367, "num_tokens": 16919173.0, "reward": -0.03593750298023224, "reward_std": 0.10618676245212555, "rewards/rollout_reward_func/mean": -0.03593750298023224, "rewards/rollout_reward_func/std": 0.2160026729106903, "sampling/importance_sampling_ratio/max": 2.6769375801086426, "sampling/importance_sampling_ratio/mean": 1.0329153537750244, "sampling/importance_sampling_ratio/min": 0.25040295720100403, "sampling/sampling_logp_difference/max": 0.8099632263183594, "sampling/sampling_logp_difference/mean": 0.047026216983795166, "step": 325, "step_time": 42.815426508008386 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005310457549057901, "entropy": 0.39466894418001175, "epoch": 0.00652, "grad_norm": 1.083456039428711, "kl": 0.2784327268600464, "learning_rate": 9.999844260754452e-06, "loss": -0.1412, "step": 326, "step_time": 9.105896988992754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3089.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 2871.90625, "completions/mean_terminated_length": 2871.90625, "completions/min_length": 2233.0, "completions/min_terminated_length": 2233.0, "entropy": 0.46241355314850807, "epoch": 0.00654, "frac_reward_zero_std": 0.25, "grad_norm": 1.365126132965088, "kl": 0.4783841446042061, "learning_rate": 9.999843184846355e-06, "loss": 0.0579, "num_tokens": 17034934.0, "reward": -0.004687502980232239, "reward_std": 0.2152300775051117, "rewards/rollout_reward_func/mean": -0.004687502980232239, "rewards/rollout_reward_func/std": 0.3430412709712982, "sampling/importance_sampling_ratio/max": 2.176283121109009, "sampling/importance_sampling_ratio/mean": 0.9271606802940369, "sampling/importance_sampling_ratio/min": 0.1929233819246292, "sampling/sampling_logp_difference/max": 1.0973834991455078, "sampling/sampling_logp_difference/mean": 0.04590904712677002, "step": 327, "step_time": 40.807620658990345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "entropy": 0.4616936966776848, "epoch": 0.00656, "grad_norm": 1.2530312538146973, "kl": 0.4540918581187725, "learning_rate": 9.999842105234718e-06, "loss": 0.0566, "step": 328, "step_time": 9.13476190099027 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013586956774815917, "completions/clipped_ratio": 0.0, "completions/max_length": 3089.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 2894.5, "completions/mean_terminated_length": 2894.5, "completions/min_length": 2710.0, "completions/min_terminated_length": 2710.0, "entropy": 0.4044684022665024, "epoch": 0.00658, "frac_reward_zero_std": 0.125, "grad_norm": 1.2801345586776733, "kl": 0.29569943621754646, "learning_rate": 9.999841021919543e-06, "loss": 0.028, "num_tokens": 17150980.0, "reward": 0.008749999105930328, "reward_std": 0.033390406519174576, "rewards/rollout_reward_func/mean": 0.008749999105930328, "rewards/rollout_reward_func/std": 0.03669842332601547, "sampling/importance_sampling_ratio/max": 1.9134129285812378, "sampling/importance_sampling_ratio/mean": 0.9784555435180664, "sampling/importance_sampling_ratio/min": 7.799136336750223e-10, "sampling/sampling_logp_difference/max": 9.550741195678711, "sampling/sampling_logp_difference/mean": 0.07563318312168121, "step": 329, "step_time": 42.932895781996194 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.004151570028625429, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0058876811526715755, "entropy": 0.40519747510552406, "epoch": 0.0066, "grad_norm": 1.2309134006500244, "kl": 0.3003210015594959, "learning_rate": 9.999839934900832e-06, "loss": 0.0236, "step": 330, "step_time": 9.125532052996277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3089.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 2796.0625, "completions/mean_terminated_length": 2796.0625, "completions/min_length": 1274.0, "completions/min_terminated_length": 1274.0, "entropy": 0.38685447722673416, "epoch": 0.00662, "frac_reward_zero_std": 0.125, "grad_norm": 0.919028639793396, "kl": 0.37493203580379486, "learning_rate": 9.999838844178584e-06, "loss": -0.0388, "num_tokens": 17263747.0, "reward": -0.0508125014603138, "reward_std": 0.14278432726860046, "rewards/rollout_reward_func/mean": -0.0508125014603138, "rewards/rollout_reward_func/std": 0.251789391040802, "sampling/importance_sampling_ratio/max": 1.6912277936935425, "sampling/importance_sampling_ratio/mean": 0.9828731417655945, "sampling/importance_sampling_ratio/min": 0.2559169828891754, "sampling/sampling_logp_difference/max": 1.3397047519683838, "sampling/sampling_logp_difference/mean": 0.04227147251367569, "step": 331, "step_time": 42.20254211300198 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "entropy": 0.38628678396344185, "epoch": 0.00664, "grad_norm": 0.8906185030937195, "kl": 0.354989618062973, "learning_rate": 9.999837749752804e-06, "loss": -0.0404, "step": 332, "step_time": 9.063552910993167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0016447368543595076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016447368543595076, "completions/clipped_ratio": 0.0, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 2776.28125, "completions/mean_terminated_length": 2776.28125, "completions/min_length": 1117.0, "completions/min_terminated_length": 1117.0, "entropy": 0.415100060403347, "epoch": 0.00666, "frac_reward_zero_std": 0.0, "grad_norm": 1.906931757926941, "kl": 0.371683020144701, "learning_rate": 9.999836651623489e-06, "loss": 0.0841, "num_tokens": 17375797.0, "reward": -0.04500000178813934, "reward_std": 0.11221498996019363, "rewards/rollout_reward_func/mean": -0.04500000178813934, "rewards/rollout_reward_func/std": 0.19660012423992157, "sampling/importance_sampling_ratio/max": 2.257512331008911, "sampling/importance_sampling_ratio/mean": 0.9947605729103088, "sampling/importance_sampling_ratio/min": 0.2314082235097885, "sampling/sampling_logp_difference/max": 0.7822046279907227, "sampling/sampling_logp_difference/mean": 0.04253339767456055, "step": 333, "step_time": 41.125564955997106 }, { "clip_ratio/high_max": 0.006850600708276033, "clip_ratio/high_mean": 0.0034253003541380167, "clip_ratio/low_mean": 0.0032986111473292112, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006723911501467228, "entropy": 0.4193525053560734, "epoch": 0.00668, "grad_norm": 2.091108560562134, "kl": 0.3612441271543503, "learning_rate": 9.99983554979064e-06, "loss": 0.0828, "step": 334, "step_time": 10.233171193001908 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.03125, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 2885.46875, "completions/mean_terminated_length": 2884.61279296875, "completions/min_length": 1654.0, "completions/min_terminated_length": 1654.0, "entropy": 0.4298589825630188, "epoch": 0.0067, "frac_reward_zero_std": 0.0, "grad_norm": 1.805285096168518, "kl": 0.5581814311444759, "learning_rate": 9.999834444254261e-06, "loss": -0.0295, "num_tokens": 17491709.0, "reward": 0.045625001192092896, "reward_std": 0.13078023493289948, "rewards/rollout_reward_func/mean": 0.045625001192092896, "rewards/rollout_reward_func/std": 0.1976301074028015, "sampling/importance_sampling_ratio/max": 2.2158279418945312, "sampling/importance_sampling_ratio/mean": 1.0222558975219727, "sampling/importance_sampling_ratio/min": 0.21492932736873627, "sampling/sampling_logp_difference/max": 1.4235153198242188, "sampling/sampling_logp_difference/mean": 0.043566472828388214, "step": 335, "step_time": 41.51342618201306 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 0.4308301880955696, "epoch": 0.00672, "grad_norm": 1.9020830392837524, "kl": 0.4618966430425644, "learning_rate": 9.999833335014352e-06, "loss": -0.0332, "step": 336, "step_time": 9.007301712008484 }, { "clip_ratio/high_max": 0.0033783784601837397, "clip_ratio/high_mean": 0.0016891892300918698, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016891892300918698, "completions/clipped_ratio": 0.0, "completions/max_length": 3094.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 2867.5, "completions/mean_terminated_length": 2867.5, "completions/min_length": 2390.0, "completions/min_terminated_length": 2390.0, "entropy": 0.36968783289194107, "epoch": 0.00674, "frac_reward_zero_std": 0.0, "grad_norm": 1.1672717332839966, "kl": 0.30326209031045437, "learning_rate": 9.999832222070915e-06, "loss": -0.0387, "num_tokens": 17606576.0, "reward": 0.048750001937150955, "reward_std": 0.10282598435878754, "rewards/rollout_reward_func/mean": 0.048750001937150955, "rewards/rollout_reward_func/std": 0.21335643529891968, "sampling/importance_sampling_ratio/max": 2.6462104320526123, "sampling/importance_sampling_ratio/mean": 0.9777675271034241, "sampling/importance_sampling_ratio/min": 0.2115698605775833, "sampling/sampling_logp_difference/max": 0.7081310749053955, "sampling/sampling_logp_difference/mean": 0.04413309693336487, "step": 337, "step_time": 41.790469111991115 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.01215277798473835, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.01736111135687679, "entropy": 0.3639492504298687, "epoch": 0.00676, "grad_norm": 0.9901954531669617, "kl": 0.32282317988574505, "learning_rate": 9.999831105423947e-06, "loss": -0.0425, "step": 338, "step_time": 9.130401213995356 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0625, "completions/max_length": 3035.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 2874.03125, "completions/mean_terminated_length": 2872.10009765625, "completions/min_length": 2728.0, "completions/min_terminated_length": 2728.0, "entropy": 0.44732575863599777, "epoch": 0.00678, "frac_reward_zero_std": 0.5, "grad_norm": 0.9907981157302856, "kl": 0.34680038318037987, "learning_rate": 9.999829985073454e-06, "loss": -0.0037, "num_tokens": 17722240.0, "reward": 0.019687499850988388, "reward_std": 0.04209454730153084, "rewards/rollout_reward_func/mean": 0.019687499850988388, "rewards/rollout_reward_func/std": 0.06765588372945786, "sampling/importance_sampling_ratio/max": 2.3998336791992188, "sampling/importance_sampling_ratio/mean": 1.0840644836425781, "sampling/importance_sampling_ratio/min": 0.3234512209892273, "sampling/sampling_logp_difference/max": 0.5507974624633789, "sampling/sampling_logp_difference/mean": 0.045105550438165665, "step": 339, "step_time": 43.570160997980565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007708333316259086, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007708333316259086, "entropy": 0.43794915825128555, "epoch": 0.0068, "grad_norm": 0.9109601974487305, "kl": 0.39482543990015984, "learning_rate": 9.999828861019437e-06, "loss": -0.0068, "step": 340, "step_time": 9.491892766018282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3138.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 2891.6875, "completions/mean_terminated_length": 2891.6875, "completions/min_length": 2728.0, "completions/min_terminated_length": 2728.0, "entropy": 0.37105344980955124, "epoch": 0.00682, "frac_reward_zero_std": 0.25, "grad_norm": 1.1111348867416382, "kl": 0.31960206665098667, "learning_rate": 9.999827733261892e-06, "loss": -0.021, "num_tokens": 17838273.0, "reward": 0.015625, "reward_std": 0.04219720885157585, "rewards/rollout_reward_func/mean": 0.015625, "rewards/rollout_reward_func/std": 0.0623692162334919, "sampling/importance_sampling_ratio/max": 2.786299228668213, "sampling/importance_sampling_ratio/mean": 1.0912487506866455, "sampling/importance_sampling_ratio/min": 0.47787514328956604, "sampling/sampling_logp_difference/max": 0.7863068580627441, "sampling/sampling_logp_difference/mean": 0.0443718284368515, "step": 341, "step_time": 43.160379425004066 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.010416666744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012152777868323028, "entropy": 0.36442771926522255, "epoch": 0.00684, "grad_norm": 0.9744181036949158, "kl": 0.35923222079873085, "learning_rate": 9.999826601800824e-06, "loss": -0.0279, "step": 342, "step_time": 9.232285209007387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.03125, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 2847.1875, "completions/mean_terminated_length": 2850.9677734375, "completions/min_length": 1435.0, "completions/min_terminated_length": 1435.0, "entropy": 0.36714230850338936, "epoch": 0.00686, "frac_reward_zero_std": 0.125, "grad_norm": 1.2039743661880493, "kl": 0.48806050047278404, "learning_rate": 9.999825466636233e-06, "loss": 0.0756, "num_tokens": 17953390.0, "reward": -0.02656250260770321, "reward_std": 0.09342575073242188, "rewards/rollout_reward_func/mean": -0.02656250260770321, "rewards/rollout_reward_func/std": 0.19873161613941193, "sampling/importance_sampling_ratio/max": 2.302706718444824, "sampling/importance_sampling_ratio/mean": 1.0160350799560547, "sampling/importance_sampling_ratio/min": 0.3926359713077545, "sampling/sampling_logp_difference/max": 0.729058027267456, "sampling/sampling_logp_difference/mean": 0.04609353467822075, "step": 343, "step_time": 41.09620379800617 }, { "clip_ratio/high_max": 0.005972222192212939, "clip_ratio/high_mean": 0.0029861110961064696, "clip_ratio/low_mean": 0.008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011666666832752526, "entropy": 0.3644689805805683, "epoch": 0.00688, "grad_norm": 1.2154171466827393, "kl": 0.5221099816262722, "learning_rate": 9.999824327768121e-06, "loss": 0.0713, "step": 344, "step_time": 9.09271458301373 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 3139.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 2892.75, "completions/mean_terminated_length": 2892.75, "completions/min_length": 2692.0, "completions/min_terminated_length": 2692.0, "entropy": 0.36128484085202217, "epoch": 0.0069, "frac_reward_zero_std": 0.125, "grad_norm": 1.27363920211792, "kl": 0.35059909522533417, "learning_rate": 9.99982318519649e-06, "loss": 0.0944, "num_tokens": 18069396.0, "reward": 0.023749999701976776, "reward_std": 0.04824655503034592, "rewards/rollout_reward_func/mean": 0.023749999701976776, "rewards/rollout_reward_func/std": 0.05993275344371796, "sampling/importance_sampling_ratio/max": 2.097679615020752, "sampling/importance_sampling_ratio/mean": 1.0275201797485352, "sampling/importance_sampling_ratio/min": 0.33659499883651733, "sampling/sampling_logp_difference/max": 0.8113938570022583, "sampling/sampling_logp_difference/mean": 0.04761318117380142, "step": 345, "step_time": 44.28958571599651 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "entropy": 0.3594363294541836, "epoch": 0.00692, "grad_norm": 1.2742985486984253, "kl": 0.36022347025573254, "learning_rate": 9.999822038921339e-06, "loss": 0.0911, "step": 346, "step_time": 9.229209993995028 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.03125, "completions/max_length": 3096.0, "completions/max_terminated_length": 3096.0, "completions/mean_length": 2891.53125, "completions/mean_terminated_length": 2903.419189453125, "completions/min_length": 2523.0, "completions/min_terminated_length": 2743.0, "entropy": 0.4008607342839241, "epoch": 0.00694, "frac_reward_zero_std": 0.125, "grad_norm": 6.8911871910095215, "kl": 0.4811227209866047, "learning_rate": 9.99982088894267e-06, "loss": 1.0538, "num_tokens": 18185863.0, "reward": 0.0018749996088445187, "reward_std": 0.053620822727680206, "rewards/rollout_reward_func/mean": 0.0018749996088445187, "rewards/rollout_reward_func/std": 0.07095830142498016, "sampling/importance_sampling_ratio/max": 2.899974822998047, "sampling/importance_sampling_ratio/mean": 1.0153355598449707, "sampling/importance_sampling_ratio/min": 0.29657140374183655, "sampling/sampling_logp_difference/max": 1.0978436470031738, "sampling/sampling_logp_difference/mean": 0.05201619863510132, "step": 347, "step_time": 42.26013625401538 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.003906250116415322, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0073784723645076156, "entropy": 0.39887847751379013, "epoch": 0.00696, "grad_norm": 5.855113506317139, "kl": 0.4611879512667656, "learning_rate": 9.999819735260483e-06, "loss": 1.045, "step": 348, "step_time": 9.195489862002432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 3180.0, "completions/max_terminated_length": 3180.0, "completions/mean_length": 2724.71875, "completions/mean_terminated_length": 2724.71875, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "entropy": 0.340648140758276, "epoch": 0.00698, "frac_reward_zero_std": 0.0, "grad_norm": 1.6283471584320068, "kl": 0.41364636458456516, "learning_rate": 9.999818577874782e-06, "loss": -0.0522, "num_tokens": 18296679.0, "reward": -0.017812497913837433, "reward_std": 0.21186214685440063, "rewards/rollout_reward_func/mean": -0.017812497913837433, "rewards/rollout_reward_func/std": 0.3267013728618622, "sampling/importance_sampling_ratio/max": 2.397644519805908, "sampling/importance_sampling_ratio/mean": 1.040598750114441, "sampling/importance_sampling_ratio/min": 0.2771419286727905, "sampling/sampling_logp_difference/max": 0.5928010940551758, "sampling/sampling_logp_difference/mean": 0.043335799127817154, "step": 349, "step_time": 40.2587990250031 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.007068452658131719, "clip_ratio/low_min": 0.004464285913854837, "clip_ratio/region_mean": 0.014012897154316306, "entropy": 0.342149056494236, "epoch": 0.007, "grad_norm": 1.0604374408721924, "kl": 0.4015281666070223, "learning_rate": 9.999817416785565e-06, "loss": -0.0561, "step": 350, "step_time": 9.21041988900106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 3111.0, "completions/max_terminated_length": 3111.0, "completions/mean_length": 2883.34375, "completions/mean_terminated_length": 2883.34375, "completions/min_length": 2304.0, "completions/min_terminated_length": 2304.0, "entropy": 0.3668610565364361, "epoch": 0.00702, "frac_reward_zero_std": 0.0, "grad_norm": 1.3156640529632568, "kl": 0.5218501426279545, "learning_rate": 9.999816251992836e-06, "loss": -0.1535, "num_tokens": 18412354.0, "reward": -0.0028124996460974216, "reward_std": 0.030912719666957855, "rewards/rollout_reward_func/mean": -0.0028124996460974216, "rewards/rollout_reward_func/std": 0.03656495362520218, "sampling/importance_sampling_ratio/max": 1.953458309173584, "sampling/importance_sampling_ratio/mean": 0.966667115688324, "sampling/importance_sampling_ratio/min": 0.17398038506507874, "sampling/sampling_logp_difference/max": 1.056915521621704, "sampling/sampling_logp_difference/mean": 0.044610172510147095, "step": 351, "step_time": 44.04102398098621 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "entropy": 0.36827195063233376, "epoch": 0.00704, "grad_norm": 1.2125022411346436, "kl": 0.5398768447339535, "learning_rate": 9.999815083496593e-06, "loss": -0.1541, "step": 352, "step_time": 9.19050597500609 }, { "clip_ratio/high_max": 0.001700680237263441, "clip_ratio/high_mean": 0.0008503401186317205, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025864513590931892, "completions/clipped_ratio": 0.0625, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 2861.5, "completions/mean_terminated_length": 2857.633544921875, "completions/min_length": 1941.0, "completions/min_terminated_length": 1941.0, "entropy": 0.45415887236595154, "epoch": 0.00706, "frac_reward_zero_std": 0.125, "grad_norm": 2.05373477935791, "kl": 0.484494686126709, "learning_rate": 9.99981391129684e-06, "loss": -0.2377, "num_tokens": 18527528.0, "reward": 0.007187499664723873, "reward_std": 0.04172711446881294, "rewards/rollout_reward_func/mean": 0.007187499664723873, "rewards/rollout_reward_func/std": 0.061760954558849335, "sampling/importance_sampling_ratio/max": 2.0689685344696045, "sampling/importance_sampling_ratio/mean": 1.0267624855041504, "sampling/importance_sampling_ratio/min": 0.4079192876815796, "sampling/sampling_logp_difference/max": 1.1987524032592773, "sampling/sampling_logp_difference/mean": 0.046385399997234344, "step": 353, "step_time": 42.472357441991335 }, { "clip_ratio/high_max": 0.006779738934710622, "clip_ratio/high_mean": 0.003389869467355311, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003389869467355311, "entropy": 0.4550458900630474, "epoch": 0.00708, "grad_norm": 1.9996660947799683, "kl": 0.4796774350106716, "learning_rate": 9.999812735393578e-06, "loss": -0.2464, "step": 354, "step_time": 9.066415241984942 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.03125, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 2909.34375, "completions/mean_terminated_length": 2909.67724609375, "completions/min_length": 2728.0, "completions/min_terminated_length": 2728.0, "entropy": 0.36910005658864975, "epoch": 0.0071, "frac_reward_zero_std": 0.125, "grad_norm": 1.3549381494522095, "kl": 0.41893161833286285, "learning_rate": 9.999811555786805e-06, "loss": -0.1236, "num_tokens": 18644345.0, "reward": 0.009687500074505806, "reward_std": 0.0260639488697052, "rewards/rollout_reward_func/mean": 0.009687500074505806, "rewards/rollout_reward_func/std": 0.03306731954216957, "sampling/importance_sampling_ratio/max": 1.8643220663070679, "sampling/importance_sampling_ratio/mean": 1.0049974918365479, "sampling/importance_sampling_ratio/min": 5.805537139867252e-22, "sampling/sampling_logp_difference/max": 15.75226879119873, "sampling/sampling_logp_difference/mean": 0.11400792747735977, "step": 355, "step_time": 42.56496216600499 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "entropy": 0.367544986307621, "epoch": 0.00712, "grad_norm": 1.193587303161621, "kl": 0.41215749084949493, "learning_rate": 9.999810372476526e-06, "loss": -0.1262, "step": 356, "step_time": 9.78580809700361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0015625000232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015625000232830644, "completions/clipped_ratio": 0.0, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 2820.96875, "completions/mean_terminated_length": 2820.96875, "completions/min_length": 1553.0, "completions/min_terminated_length": 1553.0, "entropy": 0.38109608739614487, "epoch": 0.00714, "frac_reward_zero_std": 0.125, "grad_norm": 1.147459626197815, "kl": 0.3814978711307049, "learning_rate": 9.99980918546274e-06, "loss": 0.0172, "num_tokens": 18758190.0, "reward": -0.019999999552965164, "reward_std": 0.05093258619308472, "rewards/rollout_reward_func/mean": -0.019999999552965164, "rewards/rollout_reward_func/std": 0.07330889254808426, "sampling/importance_sampling_ratio/max": 1.8108247518539429, "sampling/importance_sampling_ratio/mean": 1.094908356666565, "sampling/importance_sampling_ratio/min": 0.24349598586559296, "sampling/sampling_logp_difference/max": 0.6769726276397705, "sampling/sampling_logp_difference/mean": 0.04435478150844574, "step": 357, "step_time": 42.417569707999064 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "entropy": 0.3823031932115555, "epoch": 0.00716, "grad_norm": 1.1464769840240479, "kl": 0.36772672832012177, "learning_rate": 9.999807994745449e-06, "loss": 0.0154, "step": 358, "step_time": 9.048302499992133 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 3050.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 2754.90625, "completions/mean_terminated_length": 2754.90625, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "entropy": 0.3804786093533039, "epoch": 0.00718, "frac_reward_zero_std": 0.0, "grad_norm": 0.8674813508987427, "kl": 0.7694353275001049, "learning_rate": 9.999806800324652e-06, "loss": -0.1264, "num_tokens": 18869464.0, "reward": -0.08562499284744263, "reward_std": 0.26934367418289185, "rewards/rollout_reward_func/mean": -0.08562499284744263, "rewards/rollout_reward_func/std": 0.41496941447257996, "sampling/importance_sampling_ratio/max": 1.7930198907852173, "sampling/importance_sampling_ratio/mean": 0.8733890056610107, "sampling/importance_sampling_ratio/min": 0.1389148235321045, "sampling/sampling_logp_difference/max": 2.197103500366211, "sampling/sampling_logp_difference/mean": 0.057078905403614044, "step": 359, "step_time": 40.268112329998985 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.003574346425011754, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005310457549057901, "entropy": 0.3829173110425472, "epoch": 0.0072, "grad_norm": 1.289261817932129, "kl": 0.8045071884989738, "learning_rate": 9.999805602200355e-06, "loss": -0.1284, "step": 360, "step_time": 9.023047832975863 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 3122.0, "completions/max_terminated_length": 3122.0, "completions/mean_length": 2811.6875, "completions/mean_terminated_length": 2811.6875, "completions/min_length": 2029.0, "completions/min_terminated_length": 2029.0, "entropy": 0.37365251034498215, "epoch": 0.00722, "frac_reward_zero_std": 0.25, "grad_norm": 1.1524171829223633, "kl": 0.3612435422837734, "learning_rate": 9.999804400372553e-06, "loss": -0.0505, "num_tokens": 18983136.0, "reward": -0.009312499314546585, "reward_std": 0.0296157393604517, "rewards/rollout_reward_func/mean": -0.009312499314546585, "rewards/rollout_reward_func/std": 0.05717795714735985, "sampling/importance_sampling_ratio/max": 1.9707398414611816, "sampling/importance_sampling_ratio/mean": 1.0513246059417725, "sampling/importance_sampling_ratio/min": 0.38958939909935, "sampling/sampling_logp_difference/max": 0.6325550079345703, "sampling/sampling_logp_difference/mean": 0.042889274656772614, "step": 361, "step_time": 40.995919900989975 }, { "clip_ratio/high_max": 0.01736111124046147, "clip_ratio/high_mean": 0.008680555620230734, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "entropy": 0.377502653747797, "epoch": 0.00724, "grad_norm": 1.0273391008377075, "kl": 0.34734607115387917, "learning_rate": 9.999803194841253e-06, "loss": -0.0535, "step": 362, "step_time": 10.375120690987387 }, { "clip_ratio/high_max": 0.006850600708276033, "clip_ratio/high_mean": 0.0034253003541380167, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005161411478184164, "completions/clipped_ratio": 0.0, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 2882.75, "completions/mean_terminated_length": 2882.75, "completions/min_length": 2511.0, "completions/min_terminated_length": 2511.0, "entropy": 0.36708877235651016, "epoch": 0.00726, "frac_reward_zero_std": 0.0, "grad_norm": 1.0917654037475586, "kl": 0.35414266400039196, "learning_rate": 9.999801985606451e-06, "loss": -0.0491, "num_tokens": 19098506.0, "reward": 0.02918749861419201, "reward_std": 0.05671289935708046, "rewards/rollout_reward_func/mean": 0.02918749861419201, "rewards/rollout_reward_func/std": 0.06964584439992905, "sampling/importance_sampling_ratio/max": 1.5679417848587036, "sampling/importance_sampling_ratio/mean": 0.9486284255981445, "sampling/importance_sampling_ratio/min": 0.35323700308799744, "sampling/sampling_logp_difference/max": 0.720933198928833, "sampling/sampling_logp_difference/mean": 0.04150357097387314, "step": 363, "step_time": 41.8443327970017 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.36721600964665413, "epoch": 0.00728, "grad_norm": 1.1443796157836914, "kl": 0.3532056175172329, "learning_rate": 9.999800772668154e-06, "loss": -0.0496, "step": 364, "step_time": 8.918639629009704 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 2879.5, "completions/mean_terminated_length": 2879.5, "completions/min_length": 2698.0, "completions/min_terminated_length": 2698.0, "entropy": 0.35799355804920197, "epoch": 0.0073, "frac_reward_zero_std": 0.125, "grad_norm": 1.0182232856750488, "kl": 0.29627991281449795, "learning_rate": 9.999799556026358e-06, "loss": -0.0254, "num_tokens": 19214265.0, "reward": 0.019375000149011612, "reward_std": 0.033470965921878815, "rewards/rollout_reward_func/mean": 0.019375000149011612, "rewards/rollout_reward_func/std": 0.047785647213459015, "sampling/importance_sampling_ratio/max": 1.9531196355819702, "sampling/importance_sampling_ratio/mean": 1.0406684875488281, "sampling/importance_sampling_ratio/min": 0.38062629103660583, "sampling/sampling_logp_difference/max": 0.47331881523132324, "sampling/sampling_logp_difference/mean": 0.03672315180301666, "step": 365, "step_time": 41.54418921201432 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.008680555736646056, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008680555736646056, "entropy": 0.3587690182030201, "epoch": 0.00732, "grad_norm": 0.9805796146392822, "kl": 0.29380563274025917, "learning_rate": 9.999798335681066e-06, "loss": -0.028, "step": 366, "step_time": 9.014416432997677 }, { "clip_ratio/high_max": 0.008472222136333585, "clip_ratio/high_mean": 0.004236111068166792, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0060743463691323996, "completions/clipped_ratio": 0.0, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 2685.65625, "completions/mean_terminated_length": 2685.65625, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "entropy": 0.4594216123223305, "epoch": 0.00734, "frac_reward_zero_std": 0.0, "grad_norm": 1.1722557544708252, "kl": 0.7497690804302692, "learning_rate": 9.99979711163228e-06, "loss": -0.1539, "num_tokens": 19323711.0, "reward": -0.06843750178813934, "reward_std": 0.2887628674507141, "rewards/rollout_reward_func/mean": -0.06843750178813934, "rewards/rollout_reward_func/std": 0.3824210464954376, "sampling/importance_sampling_ratio/max": 2.1770527362823486, "sampling/importance_sampling_ratio/mean": 0.8495855331420898, "sampling/importance_sampling_ratio/min": 0.11624876409769058, "sampling/sampling_logp_difference/max": 1.3837306499481201, "sampling/sampling_logp_difference/mean": 0.05317322537302971, "step": 367, "step_time": 39.4116383000146 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006200397037900984, "entropy": 0.4617273025214672, "epoch": 0.00736, "grad_norm": 1.0443613529205322, "kl": 0.7092177867889404, "learning_rate": 9.999795883880002e-06, "loss": -0.1565, "step": 368, "step_time": 9.343507058991236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 3099.0, "completions/max_terminated_length": 3099.0, "completions/mean_length": 2932.46875, "completions/mean_terminated_length": 2932.46875, "completions/min_length": 2739.0, "completions/min_terminated_length": 2739.0, "entropy": 0.37396084517240524, "epoch": 0.00738, "frac_reward_zero_std": 0.375, "grad_norm": 1.0847474336624146, "kl": 0.45913570187985897, "learning_rate": 9.999794652424228e-06, "loss": 0.0163, "num_tokens": 19441108.0, "reward": 0.0228124987334013, "reward_std": 0.034713372588157654, "rewards/rollout_reward_func/mean": 0.0228124987334013, "rewards/rollout_reward_func/std": 0.047940440475940704, "sampling/importance_sampling_ratio/max": 2.0848076343536377, "sampling/importance_sampling_ratio/mean": 0.9931883811950684, "sampling/importance_sampling_ratio/min": 0.39272943139076233, "sampling/sampling_logp_difference/max": 0.545560359954834, "sampling/sampling_logp_difference/mean": 0.03934935852885246, "step": 369, "step_time": 42.14784694199625 }, { "clip_ratio/high_max": 0.0024999999441206455, "clip_ratio/high_mean": 0.0012499999720603228, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006458333344198763, "entropy": 0.3823331892490387, "epoch": 0.0074, "grad_norm": 1.1212048530578613, "kl": 0.45100564509630203, "learning_rate": 9.999793417264967e-06, "loss": 0.0129, "step": 370, "step_time": 9.197131928005547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3129.0, "completions/max_terminated_length": 3129.0, "completions/mean_length": 2884.8125, "completions/mean_terminated_length": 2884.8125, "completions/min_length": 1562.0, "completions/min_terminated_length": 1562.0, "entropy": 0.39911437407135963, "epoch": 0.00742, "frac_reward_zero_std": 0.25, "grad_norm": 0.948852002620697, "kl": 0.4024066887795925, "learning_rate": 9.999792178402215e-06, "loss": -0.006, "num_tokens": 19557067.0, "reward": 0.052187494933605194, "reward_std": 0.11198446899652481, "rewards/rollout_reward_func/mean": 0.052187494933605194, "rewards/rollout_reward_func/std": 0.23370416462421417, "sampling/importance_sampling_ratio/max": 1.9198561906814575, "sampling/importance_sampling_ratio/mean": 0.9719012975692749, "sampling/importance_sampling_ratio/min": 0.4445561170578003, "sampling/sampling_logp_difference/max": 0.6034307479858398, "sampling/sampling_logp_difference/mean": 0.03672725707292557, "step": 371, "step_time": 41.623454950007726 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 0.40554217994213104, "epoch": 0.00744, "grad_norm": 0.936628520488739, "kl": 0.39416519552469254, "learning_rate": 9.999790935835974e-06, "loss": -0.0082, "step": 372, "step_time": 9.187493012999767 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.03125, "completions/max_length": 3139.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 2842.71875, "completions/mean_terminated_length": 2836.935302734375, "completions/min_length": 1748.0, "completions/min_terminated_length": 1748.0, "entropy": 0.3921775594353676, "epoch": 0.00746, "frac_reward_zero_std": 0.0, "grad_norm": 2.032710313796997, "kl": 0.5857649389654398, "learning_rate": 9.999789689566245e-06, "loss": 0.0834, "num_tokens": 19671541.0, "reward": 0.010624999180436134, "reward_std": 0.06529676914215088, "rewards/rollout_reward_func/mean": 0.010624999180436134, "rewards/rollout_reward_func/std": 0.07615508139133453, "sampling/importance_sampling_ratio/max": 1.830490231513977, "sampling/importance_sampling_ratio/mean": 0.9969909191131592, "sampling/importance_sampling_ratio/min": 0.5150743126869202, "sampling/sampling_logp_difference/max": 0.8365006446838379, "sampling/sampling_logp_difference/mean": 0.03898666799068451, "step": 373, "step_time": 43.149007094005356 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005972222192212939, "entropy": 0.3927090987563133, "epoch": 0.00748, "grad_norm": 1.5853525400161743, "kl": 0.5225659962743521, "learning_rate": 9.999788439593031e-06, "loss": 0.0768, "step": 374, "step_time": 9.183209674003592 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0030381944961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004774305620230734, "completions/clipped_ratio": 0.0, "completions/max_length": 3078.0, "completions/max_terminated_length": 3078.0, "completions/mean_length": 2853.59375, "completions/mean_terminated_length": 2853.59375, "completions/min_length": 2066.0, "completions/min_terminated_length": 2066.0, "entropy": 0.49389762803912163, "epoch": 0.0075, "frac_reward_zero_std": 0.125, "grad_norm": 1.9626293182373047, "kl": 0.34199972450733185, "learning_rate": 9.999787185916332e-06, "loss": 0.0764, "num_tokens": 19786483.0, "reward": -0.025937503203749657, "reward_std": 0.10800082981586456, "rewards/rollout_reward_func/mean": -0.025937503203749657, "rewards/rollout_reward_func/std": 0.20182807743549347, "sampling/importance_sampling_ratio/max": 2.4222664833068848, "sampling/importance_sampling_ratio/mean": 1.0802035331726074, "sampling/importance_sampling_ratio/min": 0.32061922550201416, "sampling/sampling_logp_difference/max": 0.8447649478912354, "sampling/sampling_logp_difference/mean": 0.047866012901067734, "step": 375, "step_time": 43.30249591200118 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007925724727101624, "entropy": 0.4943077266216278, "epoch": 0.00752, "grad_norm": 1.645762324333191, "kl": 0.335026815533638, "learning_rate": 9.999785928536149e-06, "loss": 0.0703, "step": 376, "step_time": 9.22750177400303 }, { "clip_ratio/high_max": 0.009444444440305233, "clip_ratio/high_mean": 0.006458333344198763, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006458333344198763, "completions/clipped_ratio": 0.0, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 2863.90625, "completions/mean_terminated_length": 2863.90625, "completions/min_length": 2730.0, "completions/min_terminated_length": 2730.0, "entropy": 0.4318506419658661, "epoch": 0.00754, "frac_reward_zero_std": 0.0, "grad_norm": 1.5574791431427002, "kl": 0.39671508595347404, "learning_rate": 9.999784667452484e-06, "loss": 0.0282, "num_tokens": 19901626.0, "reward": 0.048124998807907104, "reward_std": 0.09666197746992111, "rewards/rollout_reward_func/mean": 0.048124998807907104, "rewards/rollout_reward_func/std": 0.21012187004089355, "sampling/importance_sampling_ratio/max": 1.8046319484710693, "sampling/importance_sampling_ratio/mean": 0.9822795987129211, "sampling/importance_sampling_ratio/min": 0.3637678623199463, "sampling/sampling_logp_difference/max": 0.6638021469116211, "sampling/sampling_logp_difference/mean": 0.04675694555044174, "step": 377, "step_time": 42.498453273998166 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.43226491659879684, "epoch": 0.00756, "grad_norm": 1.3629974126815796, "kl": 0.389215424656868, "learning_rate": 9.999783402665337e-06, "loss": 0.0279, "step": 378, "step_time": 8.923731430011685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2952.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 2829.75, "completions/mean_terminated_length": 2829.75, "completions/min_length": 2634.0, "completions/min_terminated_length": 2634.0, "entropy": 0.41170306876301765, "epoch": 0.00758, "frac_reward_zero_std": 0.0, "grad_norm": 1.12644362449646, "kl": 0.2883479632437229, "learning_rate": 9.999782134174711e-06, "loss": -0.0153, "num_tokens": 20015346.0, "reward": 0.04062499850988388, "reward_std": 0.11773143708705902, "rewards/rollout_reward_func/mean": 0.04062499850988388, "rewards/rollout_reward_func/std": 0.2285146564245224, "sampling/importance_sampling_ratio/max": 1.940473198890686, "sampling/importance_sampling_ratio/mean": 1.0289239883422852, "sampling/importance_sampling_ratio/min": 0.442949116230011, "sampling/sampling_logp_difference/max": 0.48134374618530273, "sampling/sampling_logp_difference/mean": 0.037492986768484116, "step": 379, "step_time": 43.1588148939918 }, { "clip_ratio/high_max": 0.020833333721384406, "clip_ratio/high_mean": 0.010416666860692203, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01215277798473835, "entropy": 0.41560249775648117, "epoch": 0.0076, "grad_norm": 1.1223279237747192, "kl": 0.2790991757065058, "learning_rate": 9.999780861980606e-06, "loss": -0.0182, "step": 380, "step_time": 8.82811975498771 }, { "clip_ratio/high_max": 0.0008445946150459349, "clip_ratio/high_mean": 0.00042229730752296746, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038945196429267526, "completions/clipped_ratio": 0.03125, "completions/max_length": 3095.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 2888.90625, "completions/mean_terminated_length": 2882.258056640625, "completions/min_length": 2744.0, "completions/min_terminated_length": 2744.0, "entropy": 0.46602268517017365, "epoch": 0.00762, "frac_reward_zero_std": 0.125, "grad_norm": 1.0885858535766602, "kl": 0.31250107660889626, "learning_rate": 9.999779586083026e-06, "loss": -0.0936, "num_tokens": 20131915.0, "reward": 0.043437499552965164, "reward_std": 0.09384177625179291, "rewards/rollout_reward_func/mean": 0.043437499552965164, "rewards/rollout_reward_func/std": 0.21206526458263397, "sampling/importance_sampling_ratio/max": 2.0978126525878906, "sampling/importance_sampling_ratio/mean": 0.9543898105621338, "sampling/importance_sampling_ratio/min": 0.3171147108078003, "sampling/sampling_logp_difference/max": 0.505486249923706, "sampling/sampling_logp_difference/mean": 0.03892179951071739, "step": 381, "step_time": 42.904207296000095 }, { "clip_ratio/high_max": 0.012950450414791703, "clip_ratio/high_mean": 0.006475225207395852, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009947447455488145, "entropy": 0.4669545851647854, "epoch": 0.00764, "grad_norm": 1.0434297323226929, "kl": 0.30925088562071323, "learning_rate": 9.999778306481967e-06, "loss": -0.0977, "step": 382, "step_time": 9.16786321499967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 3044.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 2866.03125, "completions/mean_terminated_length": 2866.03125, "completions/min_length": 2663.0, "completions/min_terminated_length": 2663.0, "entropy": 0.4193827398121357, "epoch": 0.00766, "frac_reward_zero_std": 0.125, "grad_norm": 1.5363037586212158, "kl": 0.28570458851754665, "learning_rate": 9.999777023177434e-06, "loss": -0.0633, "num_tokens": 20247423.0, "reward": 0.012437500059604645, "reward_std": 0.03864006698131561, "rewards/rollout_reward_func/mean": 0.012437500059604645, "rewards/rollout_reward_func/std": 0.04760519042611122, "sampling/importance_sampling_ratio/max": 1.9009040594100952, "sampling/importance_sampling_ratio/mean": 1.0575485229492188, "sampling/importance_sampling_ratio/min": 0.2904524803161621, "sampling/sampling_logp_difference/max": 0.7339715957641602, "sampling/sampling_logp_difference/mean": 0.03420642018318176, "step": 383, "step_time": 41.434931342999334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 0.4187196418642998, "epoch": 0.00768, "grad_norm": 1.4340078830718994, "kl": 0.28724461793899536, "learning_rate": 9.999775736169428e-06, "loss": -0.066, "step": 384, "step_time": 9.791899923991878 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008680555620230734, "completions/clipped_ratio": 0.03125, "completions/max_length": 3010.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 2776.15625, "completions/mean_terminated_length": 2773.12890625, "completions/min_length": 1602.0, "completions/min_terminated_length": 1602.0, "entropy": 0.45826057717204094, "epoch": 0.0077, "frac_reward_zero_std": 0.125, "grad_norm": 1.7409992218017578, "kl": 0.3255313076078892, "learning_rate": 9.99977444545795e-06, "loss": -0.0843, "num_tokens": 20360386.0, "reward": 0.11749999225139618, "reward_std": 0.20005768537521362, "rewards/rollout_reward_func/mean": 0.11749999225139618, "rewards/rollout_reward_func/std": 0.3573288023471832, "sampling/importance_sampling_ratio/max": 1.6205370426177979, "sampling/importance_sampling_ratio/mean": 0.9788841009140015, "sampling/importance_sampling_ratio/min": 0.4105953872203827, "sampling/sampling_logp_difference/max": 0.9278536438941956, "sampling/sampling_logp_difference/mean": 0.03812399506568909, "step": 385, "step_time": 39.60660175299563 }, { "clip_ratio/high_max": 0.004248619778081775, "clip_ratio/high_mean": 0.0021243098890408874, "clip_ratio/low_mean": 0.003574346425011754, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005698656314052641, "entropy": 0.45393380895256996, "epoch": 0.00772, "grad_norm": 1.9177309274673462, "kl": 0.339593093842268, "learning_rate": 9.999773151043e-06, "loss": -0.0865, "step": 386, "step_time": 8.906994643999496 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 3082.0, "completions/max_terminated_length": 3082.0, "completions/mean_length": 2868.125, "completions/mean_terminated_length": 2868.125, "completions/min_length": 2607.0, "completions/min_terminated_length": 2607.0, "entropy": 0.42804471775889397, "epoch": 0.00774, "frac_reward_zero_std": 0.0, "grad_norm": 1.6265130043029785, "kl": 0.39343132451176643, "learning_rate": 9.999771852924581e-06, "loss": -0.1124, "num_tokens": 20475280.0, "reward": 0.009687500074505806, "reward_std": 0.049455564469099045, "rewards/rollout_reward_func/mean": 0.009687500074505806, "rewards/rollout_reward_func/std": 0.05811470001935959, "sampling/importance_sampling_ratio/max": 2.3641629219055176, "sampling/importance_sampling_ratio/mean": 1.0422842502593994, "sampling/importance_sampling_ratio/min": 0.5156167149543762, "sampling/sampling_logp_difference/max": 0.5555129051208496, "sampling/sampling_logp_difference/mean": 0.03403444588184357, "step": 387, "step_time": 41.99420854899654 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0029861112125217915, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006458333460614085, "entropy": 0.42400502040982246, "epoch": 0.00776, "grad_norm": 1.9294583797454834, "kl": 0.4124392867088318, "learning_rate": 9.999770551102692e-06, "loss": -0.1169, "step": 388, "step_time": 9.106344018990058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 3153.0, "completions/max_terminated_length": 3153.0, "completions/mean_length": 2858.8125, "completions/mean_terminated_length": 2858.8125, "completions/min_length": 1947.0, "completions/min_terminated_length": 1947.0, "entropy": 0.4226231873035431, "epoch": 0.00778, "frac_reward_zero_std": 0.125, "grad_norm": 1.3834099769592285, "kl": 0.3721582032740116, "learning_rate": 9.999769245577337e-06, "loss": -0.1146, "num_tokens": 20590096.0, "reward": 0.019687499850988388, "reward_std": 0.05479207634925842, "rewards/rollout_reward_func/mean": 0.019687499850988388, "rewards/rollout_reward_func/std": 0.07186386734247208, "sampling/importance_sampling_ratio/max": 1.890951156616211, "sampling/importance_sampling_ratio/mean": 1.0242218971252441, "sampling/importance_sampling_ratio/min": 0.4614587426185608, "sampling/sampling_logp_difference/max": 0.7055144309997559, "sampling/sampling_logp_difference/mean": 0.04106433689594269, "step": 389, "step_time": 41.78263785800664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 0.4172542728483677, "epoch": 0.0078, "grad_norm": 1.3047667741775513, "kl": 0.3840856868773699, "learning_rate": 9.999767936348516e-06, "loss": -0.1166, "step": 390, "step_time": 10.408334011008264 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004577020299620926, "completions/clipped_ratio": 0.0, "completions/max_length": 3019.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 2887.90625, "completions/mean_terminated_length": 2887.90625, "completions/min_length": 2633.0, "completions/min_terminated_length": 2633.0, "entropy": 0.44282426685094833, "epoch": 0.00782, "frac_reward_zero_std": 0.0, "grad_norm": 1.0893396139144897, "kl": 0.3564727380871773, "learning_rate": 9.999766623416231e-06, "loss": -0.1109, "num_tokens": 20705858.0, "reward": -0.017812497913837433, "reward_std": 0.11623667925596237, "rewards/rollout_reward_func/mean": -0.017812497913837433, "rewards/rollout_reward_func/std": 0.2033388316631317, "sampling/importance_sampling_ratio/max": 1.5181227922439575, "sampling/importance_sampling_ratio/mean": 0.8538496494293213, "sampling/importance_sampling_ratio/min": 0.15931598842144012, "sampling/sampling_logp_difference/max": 1.129029631614685, "sampling/sampling_logp_difference/mean": 0.04136330261826515, "step": 391, "step_time": 41.96269005000795 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "entropy": 0.4361180029809475, "epoch": 0.00784, "grad_norm": 1.076133370399475, "kl": 0.3754804767668247, "learning_rate": 9.999765306780483e-06, "loss": -0.1136, "step": 392, "step_time": 9.008581123009208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3042.0, "completions/max_terminated_length": 3042.0, "completions/mean_length": 2740.0625, "completions/mean_terminated_length": 2740.0625, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.36063743010163307, "epoch": 0.00786, "frac_reward_zero_std": 0.125, "grad_norm": 1.2201619148254395, "kl": 0.3248476982116699, "learning_rate": 9.999763986441271e-06, "loss": -0.0414, "num_tokens": 20816808.0, "reward": -0.019999999552965164, "reward_std": 0.15125074982643127, "rewards/rollout_reward_func/mean": -0.019999999552965164, "rewards/rollout_reward_func/std": 0.30613091588020325, "sampling/importance_sampling_ratio/max": 1.5498796701431274, "sampling/importance_sampling_ratio/mean": 1.0666983127593994, "sampling/importance_sampling_ratio/min": 0.414148211479187, "sampling/sampling_logp_difference/max": 0.6058452129364014, "sampling/sampling_logp_difference/mean": 0.03376508131623268, "step": 393, "step_time": 40.33082154300064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0045770201832056046, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.0045770201832056046, "entropy": 0.3569633737206459, "epoch": 0.00788, "grad_norm": 1.142669677734375, "kl": 0.32624403573572636, "learning_rate": 9.999762662398599e-06, "loss": -0.0454, "step": 394, "step_time": 9.000440201001766 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.03125, "completions/max_length": 3137.0, "completions/max_terminated_length": 3137.0, "completions/mean_length": 2868.34375, "completions/mean_terminated_length": 2860.0, "completions/min_length": 1590.0, "completions/min_terminated_length": 1590.0, "entropy": 0.3322554640471935, "epoch": 0.0079, "frac_reward_zero_std": 0.0, "grad_norm": 1.2661420106887817, "kl": 0.4089338220655918, "learning_rate": 9.999761334652469e-06, "loss": -0.1237, "num_tokens": 20932270.0, "reward": 0.07843749970197678, "reward_std": 0.16960112750530243, "rewards/rollout_reward_func/mean": 0.07843749970197678, "rewards/rollout_reward_func/std": 0.25980275869369507, "sampling/importance_sampling_ratio/max": 1.9775424003601074, "sampling/importance_sampling_ratio/mean": 0.9813051223754883, "sampling/importance_sampling_ratio/min": 0.36013737320899963, "sampling/sampling_logp_difference/max": 0.9966578483581543, "sampling/sampling_logp_difference/mean": 0.03481311723589897, "step": 395, "step_time": 41.39294739000616 }, { "clip_ratio/high_max": 0.004352503921836615, "clip_ratio/high_mean": 0.0021762519609183073, "clip_ratio/low_mean": 0.012500000302679837, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.014676252263598144, "entropy": 0.32350194081664085, "epoch": 0.00792, "grad_norm": 1.1234732866287231, "kl": 0.42060235887765884, "learning_rate": 9.999760003202882e-06, "loss": -0.1291, "step": 396, "step_time": 9.728634478997265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 2912.625, "completions/mean_terminated_length": 2912.625, "completions/min_length": 2377.0, "completions/min_terminated_length": 2377.0, "entropy": 0.3340754322707653, "epoch": 0.00794, "frac_reward_zero_std": 0.0, "grad_norm": 0.8994367122650146, "kl": 0.4744664132595062, "learning_rate": 9.999758668049834e-06, "loss": 0.0768, "num_tokens": 21049220.0, "reward": 0.056312501430511475, "reward_std": 0.11309725791215897, "rewards/rollout_reward_func/mean": 0.056312501430511475, "rewards/rollout_reward_func/std": 0.22624170780181885, "sampling/importance_sampling_ratio/max": 1.8281199932098389, "sampling/importance_sampling_ratio/mean": 0.8943976163864136, "sampling/importance_sampling_ratio/min": 0.25516989827156067, "sampling/sampling_logp_difference/max": 0.9102246761322021, "sampling/sampling_logp_difference/mean": 0.04049454256892204, "step": 397, "step_time": 41.44624646900047 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.008680555736646056, "entropy": 0.32801349833607674, "epoch": 0.00796, "grad_norm": 0.8551763892173767, "kl": 0.4777488671243191, "learning_rate": 9.999757329193334e-06, "loss": 0.0744, "step": 398, "step_time": 9.080709051006124 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0029861110961064696, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0047222222201526165, "completions/clipped_ratio": 0.0, "completions/max_length": 3098.0, "completions/max_terminated_length": 3098.0, "completions/mean_length": 2884.09375, "completions/mean_terminated_length": 2884.09375, "completions/min_length": 2236.0, "completions/min_terminated_length": 2236.0, "entropy": 0.3948609419167042, "epoch": 0.00798, "frac_reward_zero_std": 0.0, "grad_norm": 1.4350974559783936, "kl": 0.5110477805137634, "learning_rate": 9.999755986633378e-06, "loss": 0.0408, "num_tokens": 21165049.0, "reward": 0.00624999962747097, "reward_std": 0.03348710387945175, "rewards/rollout_reward_func/mean": 0.00624999962747097, "rewards/rollout_reward_func/std": 0.04293655976653099, "sampling/importance_sampling_ratio/max": 2.2736153602600098, "sampling/importance_sampling_ratio/mean": 0.9841665029525757, "sampling/importance_sampling_ratio/min": 0.16349110007286072, "sampling/sampling_logp_difference/max": 0.5857527256011963, "sampling/sampling_logp_difference/mean": 0.0461178719997406, "step": 399, "step_time": 41.288154925001436 }, { "clip_ratio/high_max": 0.020833333721384406, "clip_ratio/high_mean": 0.010416666860692203, "clip_ratio/low_mean": 0.009930555592291057, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02034722233656794, "entropy": 0.3919810652732849, "epoch": 0.008, "grad_norm": 1.3463600873947144, "kl": 0.534088172018528, "learning_rate": 9.999754640369969e-06, "loss": 0.035, "step": 400, "step_time": 9.14792783199664 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 3113.0, "completions/max_terminated_length": 3113.0, "completions/mean_length": 2888.625, "completions/mean_terminated_length": 2888.625, "completions/min_length": 2569.0, "completions/min_terminated_length": 2569.0, "entropy": 0.34803007543087006, "epoch": 0.00802, "frac_reward_zero_std": 0.0, "grad_norm": 1.1884585618972778, "kl": 0.4120374508202076, "learning_rate": 9.99975329040311e-06, "loss": -0.0251, "num_tokens": 21280860.0, "reward": -0.020624998956918716, "reward_std": 0.09246805310249329, "rewards/rollout_reward_func/mean": -0.020624998956918716, "rewards/rollout_reward_func/std": 0.1875123679637909, "sampling/importance_sampling_ratio/max": 2.7591519355773926, "sampling/importance_sampling_ratio/mean": 1.0442249774932861, "sampling/importance_sampling_ratio/min": 0.558386504650116, "sampling/sampling_logp_difference/max": 0.5773515701293945, "sampling/sampling_logp_difference/mean": 0.03956456109881401, "step": 401, "step_time": 43.588559673997224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006560457521118224, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006560457521118224, "entropy": 0.34330132976174355, "epoch": 0.00804, "grad_norm": 1.2133526802062988, "kl": 0.42174356430768967, "learning_rate": 9.9997519367328e-06, "loss": -0.0269, "step": 402, "step_time": 9.196838259005744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00046296295477077365, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00046296295477077365, "completions/clipped_ratio": 0.0625, "completions/max_length": 3085.0, "completions/max_terminated_length": 3085.0, "completions/mean_length": 2836.3125, "completions/mean_terminated_length": 2866.43359375, "completions/min_length": 1967.0, "completions/min_terminated_length": 2182.0, "entropy": 0.38612882420420647, "epoch": 0.00806, "frac_reward_zero_std": 0.125, "grad_norm": 1.1789888143539429, "kl": 0.5264580063521862, "learning_rate": 9.999750579359042e-06, "loss": -0.0421, "num_tokens": 21395052.0, "reward": -0.0034374999813735485, "reward_std": 0.13805632293224335, "rewards/rollout_reward_func/mean": -0.0034374999813735485, "rewards/rollout_reward_func/std": 0.2645215690135956, "sampling/importance_sampling_ratio/max": 1.6176923513412476, "sampling/importance_sampling_ratio/mean": 0.8611575365066528, "sampling/importance_sampling_ratio/min": 0.04078269377350807, "sampling/sampling_logp_difference/max": 1.5039923191070557, "sampling/sampling_logp_difference/mean": 0.038096338510513306, "step": 403, "step_time": 41.359143236004456 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.006714665098115802, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010186887346208096, "entropy": 0.38065794110298157, "epoch": 0.00808, "grad_norm": 0.9908080101013184, "kl": 0.5645314268767834, "learning_rate": 9.999749218281836e-06, "loss": -0.0446, "step": 404, "step_time": 9.103962293986115 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 21395052, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }