train_B_gin_rummy / trainer_state.json
Gege24's picture
Upload task output 1
7639078 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00808,
"eval_steps": 500,
"global_step": 404,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2757.0,
"completions/max_terminated_length": 2757.0,
"completions/mean_length": 2474.625,
"completions/mean_terminated_length": 2484.48388671875,
"completions/min_length": 1847.0,
"completions/min_terminated_length": 1847.0,
"entropy": 0.6398179829120636,
"epoch": 2e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9939162731170654,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0017,
"num_tokens": 102428.0,
"reward": -0.06937500089406967,
"reward_std": 0.1426866054534912,
"rewards/rollout_reward_func/mean": -0.06937500089406967,
"rewards/rollout_reward_func/std": 0.22933192551136017,
"sampling/importance_sampling_ratio/max": 1.4089819192886353,
"sampling/importance_sampling_ratio/mean": 0.976771354675293,
"sampling/importance_sampling_ratio/min": 0.49511995911598206,
"sampling/sampling_logp_difference/max": 0.6248791217803955,
"sampling/sampling_logp_difference/mean": 0.02513751946389675,
"step": 1,
"step_time": 40.5301194210042
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.6398179829120636,
"epoch": 4e-05,
"grad_norm": 1.9967504739761353,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": -0.0017,
"step": 2,
"step_time": 8.158031945989933
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2696.0,
"completions/max_terminated_length": 2696.0,
"completions/mean_length": 2449.78125,
"completions/mean_terminated_length": 2449.78125,
"completions/min_length": 1730.0,
"completions/min_terminated_length": 1730.0,
"entropy": 0.5683682635426521,
"epoch": 6e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7206465005874634,
"kl": 0.0009393466789333615,
"learning_rate": 5.714285714285715e-07,
"loss": -0.0564,
"num_tokens": 204432.0,
"reward": -0.07793749868869781,
"reward_std": 0.19137459993362427,
"rewards/rollout_reward_func/mean": -0.07793749868869781,
"rewards/rollout_reward_func/std": 0.27293646335601807,
"sampling/importance_sampling_ratio/max": 1.327203631401062,
"sampling/importance_sampling_ratio/mean": 1.0219132900238037,
"sampling/importance_sampling_ratio/min": 0.7237246036529541,
"sampling/sampling_logp_difference/max": 0.2326061725616455,
"sampling/sampling_logp_difference/mean": 0.02151086926460266,
"step": 3,
"step_time": 37.95992787499563
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0018939394503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0038470644503831863,
"entropy": 0.5685252919793129,
"epoch": 8e-05,
"grad_norm": 1.7116172313690186,
"kl": 0.0009586924861650914,
"learning_rate": 8.571428571428572e-07,
"loss": -0.056,
"step": 4,
"step_time": 8.032366728002671
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2665.0,
"completions/max_terminated_length": 2665.0,
"completions/mean_length": 2506.28125,
"completions/mean_terminated_length": 2506.28125,
"completions/min_length": 2145.0,
"completions/min_terminated_length": 2145.0,
"entropy": 0.5751285180449486,
"epoch": 0.0001,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0214176177978516,
"kl": 0.0009553735217195936,
"learning_rate": 1.142857142857143e-06,
"loss": -0.0098,
"num_tokens": 307992.0,
"reward": -0.0325000025331974,
"reward_std": 0.07031647861003876,
"rewards/rollout_reward_func/mean": -0.0325000025331974,
"rewards/rollout_reward_func/std": 0.07426369935274124,
"sampling/importance_sampling_ratio/max": 1.2946679592132568,
"sampling/importance_sampling_ratio/mean": 0.9828097224235535,
"sampling/importance_sampling_ratio/min": 0.43161657452583313,
"sampling/sampling_logp_difference/max": 0.3309330940246582,
"sampling/sampling_logp_difference/mean": 0.022440873086452484,
"step": 5,
"step_time": 40.78049653198832
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5760079547762871,
"epoch": 0.00012,
"grad_norm": 1.9722453355789185,
"kl": 0.0010062552464660257,
"learning_rate": 1.4285714285714286e-06,
"loss": -0.0114,
"step": 6,
"step_time": 8.023996326002816
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0018939394503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0038470644503831863,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2772.0,
"completions/max_terminated_length": 2772.0,
"completions/mean_length": 2420.15625,
"completions/mean_terminated_length": 2420.15625,
"completions/min_length": 1002.0,
"completions/min_terminated_length": 1002.0,
"entropy": 0.5471096336841583,
"epoch": 0.00014,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4982593059539795,
"kl": 0.001003716250124853,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.0061,
"num_tokens": 409519.0,
"reward": -0.0521249994635582,
"reward_std": 0.2334682047367096,
"rewards/rollout_reward_func/mean": -0.0521249994635582,
"rewards/rollout_reward_func/std": 0.3047172725200653,
"sampling/importance_sampling_ratio/max": 1.9076436758041382,
"sampling/importance_sampling_ratio/mean": 1.0103974342346191,
"sampling/importance_sampling_ratio/min": 0.5369899868965149,
"sampling/sampling_logp_difference/max": 0.30723023414611816,
"sampling/sampling_logp_difference/mean": 0.02194805070757866,
"step": 7,
"step_time": 36.75636156700784
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0039100684225559235,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009769443422555923,
"entropy": 0.5471609607338905,
"epoch": 0.00016,
"grad_norm": 1.3890447616577148,
"kl": 0.0008665668829053175,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0069,
"step": 8,
"step_time": 8.295392295018246
},
{
"clip_ratio/high_max": 0.01119087846018374,
"clip_ratio/high_mean": 0.00559543923009187,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00754856423009187,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2707.0,
"completions/max_terminated_length": 2707.0,
"completions/mean_length": 2485.09375,
"completions/mean_terminated_length": 2485.09375,
"completions/min_length": 1610.0,
"completions/min_terminated_length": 1610.0,
"entropy": 0.5709630325436592,
"epoch": 0.00018,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5354437828063965,
"kl": 0.0010416117875138298,
"learning_rate": 2.285714285714286e-06,
"loss": 0.0544,
"num_tokens": 512600.0,
"reward": 0.010625001043081284,
"reward_std": 0.14592772722244263,
"rewards/rollout_reward_func/mean": 0.010625001043081284,
"rewards/rollout_reward_func/std": 0.2494889795780182,
"sampling/importance_sampling_ratio/max": 1.6029342412948608,
"sampling/importance_sampling_ratio/mean": 0.9924947619438171,
"sampling/importance_sampling_ratio/min": 0.556939959526062,
"sampling/sampling_logp_difference/max": 0.3805968761444092,
"sampling/sampling_logp_difference/mean": 0.024389563128352165,
"step": 9,
"step_time": 39.20236621199729
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.5720346607267857,
"epoch": 0.0002,
"grad_norm": 1.5500391721725464,
"kl": 0.0010869200268643908,
"learning_rate": 2.571428571428571e-06,
"loss": 0.0545,
"step": 10,
"step_time": 8.161946275991795
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0013586956774815917,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007218070677481592,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2716.0,
"completions/max_terminated_length": 2716.0,
"completions/mean_length": 2482.46875,
"completions/mean_terminated_length": 2482.46875,
"completions/min_length": 2252.0,
"completions/min_terminated_length": 2252.0,
"entropy": 0.6314467415213585,
"epoch": 0.00022,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8739888668060303,
"kl": 0.0012773948546964675,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.0387,
"num_tokens": 615646.0,
"reward": -0.031562499701976776,
"reward_std": 0.0844234824180603,
"rewards/rollout_reward_func/mean": -0.031562499701976776,
"rewards/rollout_reward_func/std": 0.09391380101442337,
"sampling/importance_sampling_ratio/max": 1.9019598960876465,
"sampling/importance_sampling_ratio/mean": 0.9923563003540039,
"sampling/importance_sampling_ratio/min": 0.5546591281890869,
"sampling/sampling_logp_difference/max": 0.3334968090057373,
"sampling/sampling_logp_difference/mean": 0.02209584228694439,
"step": 11,
"step_time": 40.81080743800703
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.6323637664318085,
"epoch": 0.00024,
"grad_norm": 1.8624191284179688,
"kl": 0.0011840567749459296,
"learning_rate": 3.142857142857143e-06,
"loss": 0.0397,
"step": 12,
"step_time": 8.157768286997452
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2760.0,
"completions/max_terminated_length": 2760.0,
"completions/mean_length": 2491.75,
"completions/mean_terminated_length": 2491.75,
"completions/min_length": 1867.0,
"completions/min_terminated_length": 1867.0,
"entropy": 0.5670790821313858,
"epoch": 0.00026,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0665578842163086,
"kl": 0.0014767120446776971,
"learning_rate": 3.428571428571429e-06,
"loss": -0.0258,
"num_tokens": 719436.0,
"reward": -0.06818749755620956,
"reward_std": 0.1440477967262268,
"rewards/rollout_reward_func/mean": -0.06818749755620956,
"rewards/rollout_reward_func/std": 0.1960759162902832,
"sampling/importance_sampling_ratio/max": 1.743096113204956,
"sampling/importance_sampling_ratio/mean": 1.0059804916381836,
"sampling/importance_sampling_ratio/min": 0.5690397024154663,
"sampling/sampling_logp_difference/max": 0.42962151765823364,
"sampling/sampling_logp_difference/mean": 0.02228451520204544,
"step": 13,
"step_time": 40.541536634002114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5652061775326729,
"epoch": 0.00028,
"grad_norm": 2.1127512454986572,
"kl": 0.0011144974414492026,
"learning_rate": 3.7142857142857146e-06,
"loss": -0.0274,
"step": 14,
"step_time": 8.263732075007283
},
{
"clip_ratio/high_max": 0.007575757801532745,
"clip_ratio/high_mean": 0.0037878789007663727,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005741003900766373,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2691.0,
"completions/max_terminated_length": 2691.0,
"completions/mean_length": 2484.71875,
"completions/mean_terminated_length": 2484.71875,
"completions/min_length": 2008.0,
"completions/min_terminated_length": 2008.0,
"entropy": 0.597566194832325,
"epoch": 0.0003,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0065197944641113,
"kl": 0.0012788765816367231,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0418,
"num_tokens": 822717.0,
"reward": -0.10331249237060547,
"reward_std": 0.1455155909061432,
"rewards/rollout_reward_func/mean": -0.10331249237060547,
"rewards/rollout_reward_func/std": 0.20404654741287231,
"sampling/importance_sampling_ratio/max": 1.4405966997146606,
"sampling/importance_sampling_ratio/mean": 0.9779974818229675,
"sampling/importance_sampling_ratio/min": 0.42821815609931946,
"sampling/sampling_logp_difference/max": 0.4516181945800781,
"sampling/sampling_logp_difference/mean": 0.023157190531492233,
"step": 15,
"step_time": 37.62107830101013
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.5933906957507133,
"epoch": 0.00032,
"grad_norm": 1.9404207468032837,
"kl": 0.0014216557101462968,
"learning_rate": 4.2857142857142855e-06,
"loss": -0.0459,
"step": 16,
"step_time": 9.040029301002505
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005093443673104048,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005093443673104048,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2700.0,
"completions/max_terminated_length": 2700.0,
"completions/mean_length": 2493.53125,
"completions/mean_terminated_length": 2493.53125,
"completions/min_length": 2279.0,
"completions/min_terminated_length": 2279.0,
"entropy": 0.5841243341565132,
"epoch": 0.00034,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7072598934173584,
"kl": 0.0023420277575496584,
"learning_rate": 4.571428571428572e-06,
"loss": 0.0018,
"num_tokens": 926571.0,
"reward": -0.02068750001490116,
"reward_std": 0.08841773867607117,
"rewards/rollout_reward_func/mean": -0.02068750001490116,
"rewards/rollout_reward_func/std": 0.09211999177932739,
"sampling/importance_sampling_ratio/max": 1.8994961977005005,
"sampling/importance_sampling_ratio/mean": 1.0768578052520752,
"sampling/importance_sampling_ratio/min": 0.44597309827804565,
"sampling/sampling_logp_difference/max": 0.5778663158416748,
"sampling/sampling_logp_difference/mean": 0.02513628825545311,
"step": 17,
"step_time": 39.35067342498951
},
{
"clip_ratio/high_max": 0.007582720601931214,
"clip_ratio/high_mean": 0.003791360300965607,
"clip_ratio/low_mean": 0.005744485300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009535845601931214,
"entropy": 0.5797206610441208,
"epoch": 0.00036,
"grad_norm": 2.360910177230835,
"kl": 0.0030427857127506286,
"learning_rate": 4.857142857142858e-06,
"loss": -0.0002,
"step": 18,
"step_time": 8.148258179004188
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2750.0,
"completions/max_terminated_length": 2750.0,
"completions/mean_length": 2509.125,
"completions/mean_terminated_length": 2509.125,
"completions/min_length": 2229.0,
"completions/min_terminated_length": 2229.0,
"entropy": 0.5699355229735374,
"epoch": 0.00038,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7883220911026,
"kl": 0.0033391022589057684,
"learning_rate": 5.142857142857142e-06,
"loss": -0.024,
"num_tokens": 1030914.0,
"reward": -0.04781249538064003,
"reward_std": 0.11873149871826172,
"rewards/rollout_reward_func/mean": -0.04781249538064003,
"rewards/rollout_reward_func/std": 0.18549835681915283,
"sampling/importance_sampling_ratio/max": 1.7033848762512207,
"sampling/importance_sampling_ratio/mean": 1.0213618278503418,
"sampling/importance_sampling_ratio/min": 0.5055848360061646,
"sampling/sampling_logp_difference/max": 0.3674435615539551,
"sampling/sampling_logp_difference/mean": 0.022144002839922905,
"step": 19,
"step_time": 40.42446762702457
},
{
"clip_ratio/high_max": 0.006623641354963183,
"clip_ratio/high_mean": 0.0033118206774815917,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007218070677481592,
"entropy": 0.5630137547850609,
"epoch": 0.0004,
"grad_norm": 1.659043550491333,
"kl": 0.004340015002526343,
"learning_rate": 5.428571428571429e-06,
"loss": -0.0286,
"step": 20,
"step_time": 8.287577473987767
},
{
"clip_ratio/high_max": 0.016526442486792803,
"clip_ratio/high_mean": 0.009621917037293315,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009621917037293315,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2694.0,
"completions/max_terminated_length": 2694.0,
"completions/mean_length": 2414.625,
"completions/mean_terminated_length": 2414.625,
"completions/min_length": 973.0,
"completions/min_terminated_length": 973.0,
"entropy": 0.5076057054102421,
"epoch": 0.00042,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7286955118179321,
"kl": 0.003812528128037229,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.0864,
"num_tokens": 1131985.0,
"reward": -0.07437499612569809,
"reward_std": 0.15552687644958496,
"rewards/rollout_reward_func/mean": -0.07437499612569809,
"rewards/rollout_reward_func/std": 0.24696791172027588,
"sampling/importance_sampling_ratio/max": 1.4717481136322021,
"sampling/importance_sampling_ratio/mean": 0.9868855476379395,
"sampling/importance_sampling_ratio/min": 0.5080141425132751,
"sampling/sampling_logp_difference/max": 0.33080339431762695,
"sampling/sampling_logp_difference/mean": 0.023507488891482353,
"step": 21,
"step_time": 38.73707472301612
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.49742893874645233,
"epoch": 0.00044,
"grad_norm": 1.7899357080459595,
"kl": 0.005958152993116528,
"learning_rate": 6e-06,
"loss": -0.0874,
"step": 22,
"step_time": 8.990655720990617
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2769.0,
"completions/max_terminated_length": 2769.0,
"completions/mean_length": 2447.125,
"completions/mean_terminated_length": 2447.125,
"completions/min_length": 1353.0,
"completions/min_terminated_length": 1353.0,
"entropy": 0.5160082057118416,
"epoch": 0.00046,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.897569179534912,
"kl": 0.009750543569680303,
"learning_rate": 6.285714285714286e-06,
"loss": -0.0297,
"num_tokens": 1233712.0,
"reward": -0.06949999928474426,
"reward_std": 0.13460204005241394,
"rewards/rollout_reward_func/mean": -0.06949999928474426,
"rewards/rollout_reward_func/std": 0.19123773276805878,
"sampling/importance_sampling_ratio/max": 1.734204649925232,
"sampling/importance_sampling_ratio/mean": 1.0283379554748535,
"sampling/importance_sampling_ratio/min": 0.6807689666748047,
"sampling/sampling_logp_difference/max": 0.32382988929748535,
"sampling/sampling_logp_difference/mean": 0.025856411084532738,
"step": 23,
"step_time": 38.87603813601163
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.013556985300965607,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.015510110184550285,
"entropy": 0.5070139579474926,
"epoch": 0.00048,
"grad_norm": 1.6847742795944214,
"kl": 0.013837641919963062,
"learning_rate": 6.571428571428572e-06,
"loss": -0.0327,
"step": 24,
"step_time": 8.237429195003642
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2677.0,
"completions/max_terminated_length": 2677.0,
"completions/mean_length": 2477.65625,
"completions/mean_terminated_length": 2477.65625,
"completions/min_length": 2004.0,
"completions/min_terminated_length": 2004.0,
"entropy": 0.46570198982954025,
"epoch": 0.0005,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8208125829696655,
"kl": 0.017063510487787426,
"learning_rate": 6.857142857142858e-06,
"loss": -0.0471,
"num_tokens": 1336013.0,
"reward": -0.041374996304512024,
"reward_std": 0.13650161027908325,
"rewards/rollout_reward_func/mean": -0.041374996304512024,
"rewards/rollout_reward_func/std": 0.20301242172718048,
"sampling/importance_sampling_ratio/max": 1.717456579208374,
"sampling/importance_sampling_ratio/mean": 0.9826828241348267,
"sampling/importance_sampling_ratio/min": 0.5129362344741821,
"sampling/sampling_logp_difference/max": 0.6331992149353027,
"sampling/sampling_logp_difference/mean": 0.03637850284576416,
"step": 25,
"step_time": 38.30766316399968
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.453332532197237,
"epoch": 0.00052,
"grad_norm": 1.9915790557861328,
"kl": 0.022920054849237204,
"learning_rate": 7.1428571428571436e-06,
"loss": -0.049,
"step": 26,
"step_time": 8.041683328003273
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2685.0,
"completions/max_terminated_length": 2685.0,
"completions/mean_length": 2544.25,
"completions/mean_terminated_length": 2544.25,
"completions/min_length": 2150.0,
"completions/min_terminated_length": 2150.0,
"entropy": 0.4926302433013916,
"epoch": 0.00054,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7289246320724487,
"kl": 0.02990493644028902,
"learning_rate": 7.428571428571429e-06,
"loss": 0.0072,
"num_tokens": 1441197.0,
"reward": -0.06387500464916229,
"reward_std": 0.10385941714048386,
"rewards/rollout_reward_func/mean": -0.06387500464916229,
"rewards/rollout_reward_func/std": 0.16128110885620117,
"sampling/importance_sampling_ratio/max": 1.4102411270141602,
"sampling/importance_sampling_ratio/mean": 0.8670729398727417,
"sampling/importance_sampling_ratio/min": 0.3404218554496765,
"sampling/sampling_logp_difference/max": 0.7324519157409668,
"sampling/sampling_logp_difference/mean": 0.04336021840572357,
"step": 27,
"step_time": 39.86006593199272
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.009598214295692742,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.01350446417927742,
"entropy": 0.4816759377717972,
"epoch": 0.00056,
"grad_norm": 1.775354266166687,
"kl": 0.044567104894667864,
"learning_rate": 7.714285714285716e-06,
"loss": 0.003,
"step": 28,
"step_time": 8.571332884996082
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2696.0,
"completions/max_terminated_length": 2696.0,
"completions/mean_length": 2481.5625,
"completions/mean_terminated_length": 2481.5625,
"completions/min_length": 1996.0,
"completions/min_terminated_length": 1996.0,
"entropy": 0.42902951687574387,
"epoch": 0.00058,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6338751316070557,
"kl": 0.05208651162683964,
"learning_rate": 8.000000000000001e-06,
"loss": -0.1571,
"num_tokens": 1543722.0,
"reward": -0.007187500596046448,
"reward_std": 0.14660096168518066,
"rewards/rollout_reward_func/mean": -0.007187500596046448,
"rewards/rollout_reward_func/std": 0.22778363525867462,
"sampling/importance_sampling_ratio/max": 2.0380847454071045,
"sampling/importance_sampling_ratio/mean": 0.9687550067901611,
"sampling/importance_sampling_ratio/min": 0.2778550982475281,
"sampling/sampling_logp_difference/max": 1.2108449935913086,
"sampling/sampling_logp_difference/mean": 0.05287490040063858,
"step": 29,
"step_time": 38.6119481420028
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.009895833441987634,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013802083441987634,
"entropy": 0.4154880531132221,
"epoch": 0.0006,
"grad_norm": 1.6772791147232056,
"kl": 0.0775398297701031,
"learning_rate": 8.285714285714287e-06,
"loss": -0.1612,
"step": 30,
"step_time": 8.044421385988244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018382353009656072,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2710.0,
"completions/max_terminated_length": 2710.0,
"completions/mean_length": 2503.375,
"completions/mean_terminated_length": 2503.375,
"completions/min_length": 408.0,
"completions/min_terminated_length": 408.0,
"entropy": 0.40555064752697945,
"epoch": 0.00062,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.574500560760498,
"kl": 0.05575865495484322,
"learning_rate": 8.571428571428571e-06,
"loss": -0.0645,
"num_tokens": 1647517.0,
"reward": -0.04531250149011612,
"reward_std": 0.14535784721374512,
"rewards/rollout_reward_func/mean": -0.04531250149011612,
"rewards/rollout_reward_func/std": 0.2093304991722107,
"sampling/importance_sampling_ratio/max": 1.7538701295852661,
"sampling/importance_sampling_ratio/mean": 0.8808121681213379,
"sampling/importance_sampling_ratio/min": 0.2845723628997803,
"sampling/sampling_logp_difference/max": 0.8374984264373779,
"sampling/sampling_logp_difference/mean": 0.0564584843814373,
"step": 31,
"step_time": 38.74203074599063
},
{
"clip_ratio/high_max": 0.0062500000931322575,
"clip_ratio/high_mean": 0.0031250000465661287,
"clip_ratio/low_mean": 0.003791360300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006916360347531736,
"entropy": 0.3967522084712982,
"epoch": 0.00064,
"grad_norm": 1.582572340965271,
"kl": 0.06716000568121672,
"learning_rate": 8.857142857142858e-06,
"loss": -0.0667,
"step": 32,
"step_time": 8.189764875001856
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005744485300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011603860300965607,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2688.0,
"completions/max_terminated_length": 2688.0,
"completions/mean_length": 2415.21875,
"completions/mean_terminated_length": 2415.21875,
"completions/min_length": 1304.0,
"completions/min_terminated_length": 1304.0,
"entropy": 0.3761625960469246,
"epoch": 0.00066,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.277189016342163,
"kl": 0.06961325742304325,
"learning_rate": 9.142857142857144e-06,
"loss": -0.0156,
"num_tokens": 1748089.0,
"reward": -0.0175624992698431,
"reward_std": 0.22900649905204773,
"rewards/rollout_reward_func/mean": -0.0175624992698431,
"rewards/rollout_reward_func/std": 0.31228137016296387,
"sampling/importance_sampling_ratio/max": 2.0467588901519775,
"sampling/importance_sampling_ratio/mean": 0.9604863524436951,
"sampling/importance_sampling_ratio/min": 0.22416925430297852,
"sampling/sampling_logp_difference/max": 1.0932765007019043,
"sampling/sampling_logp_difference/mean": 0.06231032684445381,
"step": 33,
"step_time": 37.7063103410037
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.013606223976239562,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.02337184874340892,
"entropy": 0.36862019822001457,
"epoch": 0.00068,
"grad_norm": 1.103268027305603,
"kl": 0.08232864388264716,
"learning_rate": 9.42857142857143e-06,
"loss": -0.0152,
"step": 34,
"step_time": 8.04393709400756
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0013586956774815917,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005264945677481592,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2683.0,
"completions/max_terminated_length": 2683.0,
"completions/mean_length": 2458.0,
"completions/mean_terminated_length": 2458.0,
"completions/min_length": 1005.0,
"completions/min_terminated_length": 1005.0,
"entropy": 0.3750050254166126,
"epoch": 0.0007,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6973440647125244,
"kl": 0.06921286834403872,
"learning_rate": 9.714285714285715e-06,
"loss": -0.1278,
"num_tokens": 1850470.0,
"reward": -0.0572500079870224,
"reward_std": 0.3149961531162262,
"rewards/rollout_reward_func/mean": -0.0572500079870224,
"rewards/rollout_reward_func/std": 0.40404173731803894,
"sampling/importance_sampling_ratio/max": 2.6847193241119385,
"sampling/importance_sampling_ratio/mean": 1.0530672073364258,
"sampling/importance_sampling_ratio/min": 0.1825435906648636,
"sampling/sampling_logp_difference/max": 1.4770822525024414,
"sampling/sampling_logp_difference/mean": 0.06285598129034042,
"step": 35,
"step_time": 38.96468219499366
},
{
"clip_ratio/high_max": 0.020833333488553762,
"clip_ratio/high_mean": 0.010416666744276881,
"clip_ratio/low_mean": 0.005205760127864778,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015622426988556981,
"entropy": 0.36646367236971855,
"epoch": 0.00072,
"grad_norm": 1.709372878074646,
"kl": 0.08743807720020413,
"learning_rate": 1e-05,
"loss": -0.129,
"step": 36,
"step_time": 8.023841024005378
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2705.0,
"completions/max_terminated_length": 2705.0,
"completions/mean_length": 2519.6875,
"completions/mean_terminated_length": 2519.6875,
"completions/min_length": 2134.0,
"completions/min_terminated_length": 2134.0,
"entropy": 0.34471601620316505,
"epoch": 0.00074,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.450613021850586,
"kl": 0.33715078979730606,
"learning_rate": 9.999999998148153e-06,
"loss": -0.2702,
"num_tokens": 1954652.0,
"reward": -0.017500001937150955,
"reward_std": 0.07098734378814697,
"rewards/rollout_reward_func/mean": -0.017500001937150955,
"rewards/rollout_reward_func/std": 0.08203067630529404,
"sampling/importance_sampling_ratio/max": 2.792572498321533,
"sampling/importance_sampling_ratio/mean": 0.9947938919067383,
"sampling/importance_sampling_ratio/min": 0.13970009982585907,
"sampling/sampling_logp_difference/max": 2.3089332580566406,
"sampling/sampling_logp_difference/mean": 0.08333279192447662,
"step": 37,
"step_time": 39.65462483598094
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.0234375,
"entropy": 0.3357379361987114,
"epoch": 0.00076,
"grad_norm": 2.3665034770965576,
"kl": 0.45696336030960083,
"learning_rate": 9.999999992592613e-06,
"loss": -0.276,
"step": 38,
"step_time": 8.107524030987406
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0022321429569274187,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006138392956927419,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2630.0,
"completions/max_terminated_length": 2630.0,
"completions/mean_length": 2378.6875,
"completions/mean_terminated_length": 2378.6875,
"completions/min_length": 384.0,
"completions/min_terminated_length": 384.0,
"entropy": 0.3408471681177616,
"epoch": 0.00078,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.7294774055480957,
"kl": 0.16567623522132635,
"learning_rate": 9.999999983333379e-06,
"loss": -0.0684,
"num_tokens": 2054728.0,
"reward": -0.14356249570846558,
"reward_std": 0.20076312124729156,
"rewards/rollout_reward_func/mean": -0.14356249570846558,
"rewards/rollout_reward_func/std": 0.34942853450775146,
"sampling/importance_sampling_ratio/max": 2.2133710384368896,
"sampling/importance_sampling_ratio/mean": 0.9344292879104614,
"sampling/importance_sampling_ratio/min": 0.10769516229629517,
"sampling/sampling_logp_difference/max": 1.4032087326049805,
"sampling/sampling_logp_difference/mean": 0.08816322684288025,
"step": 39,
"step_time": 40.239868925993505
},
{
"clip_ratio/high_max": 0.027901785913854837,
"clip_ratio/high_mean": 0.018136160681024194,
"clip_ratio/low_mean": 0.014892988605424762,
"clip_ratio/low_min": 0.008370535913854837,
"clip_ratio/region_mean": 0.0330291495192796,
"entropy": 0.3337853290140629,
"epoch": 0.0008,
"grad_norm": 3.1899514198303223,
"kl": 0.2171315811574459,
"learning_rate": 9.999999970370451e-06,
"loss": -0.0714,
"step": 40,
"step_time": 8.054502869999851
},
{
"clip_ratio/high_max": 0.020089285913854837,
"clip_ratio/high_mean": 0.011997767956927419,
"clip_ratio/low_mean": 0.0018939394503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013891707407310605,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2690.0,
"completions/max_terminated_length": 2690.0,
"completions/mean_length": 2504.5,
"completions/mean_terminated_length": 2504.5,
"completions/min_length": 1868.0,
"completions/min_terminated_length": 1868.0,
"entropy": 0.3315769322216511,
"epoch": 0.00082,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1958541870117188,
"kl": 0.556195599026978,
"learning_rate": 9.99999995370383e-06,
"loss": 0.1613,
"num_tokens": 2158491.0,
"reward": -0.03606250509619713,
"reward_std": 0.12721404433250427,
"rewards/rollout_reward_func/mean": -0.03606250509619713,
"rewards/rollout_reward_func/std": 0.2096494734287262,
"sampling/importance_sampling_ratio/max": 2.9438412189483643,
"sampling/importance_sampling_ratio/mean": 1.0292738676071167,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.6189584732055664,
"sampling/sampling_logp_difference/mean": 0.09665805101394653,
"step": 41,
"step_time": 40.6466921770043
},
{
"clip_ratio/high_max": 0.03548450651578605,
"clip_ratio/high_mean": 0.017742253257893026,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017742253257893026,
"entropy": 0.33141712099313736,
"epoch": 0.00084,
"grad_norm": 2.158820390701294,
"kl": 0.4570291112177074,
"learning_rate": 9.999999933333514e-06,
"loss": 0.1573,
"step": 42,
"step_time": 8.173876360000577
},
{
"clip_ratio/high_max": 0.019301470601931214,
"clip_ratio/high_mean": 0.009650735300965607,
"clip_ratio/low_mean": 0.0022321429569274187,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011882878257893026,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2684.0,
"completions/max_terminated_length": 2684.0,
"completions/mean_length": 2314.28125,
"completions/mean_terminated_length": 2314.28125,
"completions/min_length": 380.0,
"completions/min_terminated_length": 380.0,
"entropy": 0.3161774128675461,
"epoch": 0.00086,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.4876537322998047,
"kl": 0.5472150244750082,
"learning_rate": 9.999999909259504e-06,
"loss": -0.0026,
"num_tokens": 2256178.0,
"reward": -0.11968749761581421,
"reward_std": 0.24497605860233307,
"rewards/rollout_reward_func/mean": -0.11968749761581421,
"rewards/rollout_reward_func/std": 0.3643242418766022,
"sampling/importance_sampling_ratio/max": 2.8263914585113525,
"sampling/importance_sampling_ratio/mean": 1.0258629322052002,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.7011332511901855,
"sampling/sampling_logp_difference/mean": 0.10345172137022018,
"step": 43,
"step_time": 38.945439303992316
},
{
"clip_ratio/high_max": 0.013494318351149559,
"clip_ratio/high_mean": 0.01065340917557478,
"clip_ratio/low_mean": 0.012890624813735485,
"clip_ratio/low_min": 0.0078125,
"clip_ratio/region_mean": 0.023544033989310265,
"entropy": 0.3204997628927231,
"epoch": 0.00088,
"grad_norm": 2.031017780303955,
"kl": 0.5513593647629023,
"learning_rate": 9.9999998814818e-06,
"loss": -0.0065,
"step": 44,
"step_time": 8.607130141004745
},
{
"clip_ratio/high_max": 0.025781250093132257,
"clip_ratio/high_mean": 0.014843750046566129,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01875000004656613,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2688.0,
"completions/max_terminated_length": 2688.0,
"completions/mean_length": 2464.5625,
"completions/mean_terminated_length": 2464.5625,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"entropy": 0.35709187015891075,
"epoch": 0.0009,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1241366863250732,
"kl": 0.2615460283122957,
"learning_rate": 9.999999850000403e-06,
"loss": -0.071,
"num_tokens": 2358679.0,
"reward": -0.054625004529953,
"reward_std": 0.10086569935083389,
"rewards/rollout_reward_func/mean": -0.054625004529953,
"rewards/rollout_reward_func/std": 0.15416575968265533,
"sampling/importance_sampling_ratio/max": 2.5754051208496094,
"sampling/importance_sampling_ratio/mean": 1.00569486618042,
"sampling/importance_sampling_ratio/min": 0.007187636569142342,
"sampling/sampling_logp_difference/max": 3.3626089096069336,
"sampling/sampling_logp_difference/mean": 0.10147839784622192,
"step": 45,
"step_time": 39.38599755401083
},
{
"clip_ratio/high_max": 0.025781250093132257,
"clip_ratio/high_mean": 0.01679687504656613,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02460937504656613,
"entropy": 0.3584505319595337,
"epoch": 0.00092,
"grad_norm": 1.9520725011825562,
"kl": 0.2969521852210164,
"learning_rate": 9.999999814815314e-06,
"loss": -0.069,
"step": 46,
"step_time": 8.046390703013458
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2686.0,
"completions/max_terminated_length": 2686.0,
"completions/mean_length": 2508.46875,
"completions/mean_terminated_length": 2508.46875,
"completions/min_length": 2281.0,
"completions/min_terminated_length": 2281.0,
"entropy": 0.3446594066917896,
"epoch": 0.00094,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.154435873031616,
"kl": 0.28098534140735865,
"learning_rate": 9.99999977592653e-06,
"loss": -0.0095,
"num_tokens": 2462728.0,
"reward": -0.007187499664723873,
"reward_std": 0.05442311242222786,
"rewards/rollout_reward_func/mean": -0.007187499664723873,
"rewards/rollout_reward_func/std": 0.05979720130562782,
"sampling/importance_sampling_ratio/max": 2.8695878982543945,
"sampling/importance_sampling_ratio/mean": 0.9559117555618286,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.546647071838379,
"sampling/sampling_logp_difference/mean": 0.10615775734186172,
"step": 47,
"step_time": 41.4313356499988
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.34684762358665466,
"epoch": 0.00096,
"grad_norm": 3.2517645359039307,
"kl": 0.24896481167525053,
"learning_rate": 9.999999733334051e-06,
"loss": -0.0127,
"step": 48,
"step_time": 8.04854733600223
},
{
"clip_ratio/high_max": 0.01499417726881802,
"clip_ratio/high_mean": 0.009450213401578367,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015309588401578367,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2774.0,
"completions/max_terminated_length": 2774.0,
"completions/mean_length": 2508.15625,
"completions/mean_terminated_length": 2508.15625,
"completions/min_length": 1637.0,
"completions/min_terminated_length": 1637.0,
"entropy": 0.44613758474588394,
"epoch": 0.00098,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6932541131973267,
"kl": 0.1522258846089244,
"learning_rate": 9.99999968703788e-06,
"loss": -0.0828,
"num_tokens": 2566857.0,
"reward": -0.01862499676644802,
"reward_std": 0.17793361842632294,
"rewards/rollout_reward_func/mean": -0.01862499676644802,
"rewards/rollout_reward_func/std": 0.2609236538410187,
"sampling/importance_sampling_ratio/max": 2.0118746757507324,
"sampling/importance_sampling_ratio/mean": 0.6943110227584839,
"sampling/importance_sampling_ratio/min": 0.10269977152347565,
"sampling/sampling_logp_difference/max": 1.4632587432861328,
"sampling/sampling_logp_difference/mean": 0.08679313212633133,
"step": 49,
"step_time": 38.579419355992286
},
{
"clip_ratio/high_max": 0.0390625,
"clip_ratio/high_mean": 0.020889945793896914,
"clip_ratio/low_mean": 0.01171875,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.032608695793896914,
"entropy": 0.44533828645944595,
"epoch": 0.001,
"grad_norm": 1.3143709897994995,
"kl": 0.13447811640799046,
"learning_rate": 9.999999637038016e-06,
"loss": -0.085,
"step": 50,
"step_time": 9.22721716301021
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2741.0,
"completions/max_terminated_length": 2741.0,
"completions/mean_length": 2454.53125,
"completions/mean_terminated_length": 2454.53125,
"completions/min_length": 327.0,
"completions/min_terminated_length": 327.0,
"entropy": 0.31926462426781654,
"epoch": 0.00102,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.8239920139312744,
"kl": 0.1380001651123166,
"learning_rate": 9.999999583334458e-06,
"loss": 0.0342,
"num_tokens": 2668420.0,
"reward": -0.10174999386072159,
"reward_std": 0.16292209923267365,
"rewards/rollout_reward_func/mean": -0.10174999386072159,
"rewards/rollout_reward_func/std": 0.30493563413619995,
"sampling/importance_sampling_ratio/max": 2.260237216949463,
"sampling/importance_sampling_ratio/mean": 0.9161804914474487,
"sampling/importance_sampling_ratio/min": 0.1295454055070877,
"sampling/sampling_logp_difference/max": 1.4744603633880615,
"sampling/sampling_logp_difference/mean": 0.087027907371521,
"step": 51,
"step_time": 40.09692535500653
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.01953125,
"clip_ratio/low_mean": 0.01171875,
"clip_ratio/low_min": 0.0078125,
"clip_ratio/region_mean": 0.03125,
"entropy": 0.31379691883921623,
"epoch": 0.00104,
"grad_norm": 1.4041469097137451,
"kl": 0.11381018441170454,
"learning_rate": 9.999999525927207e-06,
"loss": 0.0289,
"step": 52,
"step_time": 8.14323568900727
},
{
"clip_ratio/high_max": 0.011600378900766373,
"clip_ratio/high_mean": 0.005800189450383186,
"clip_ratio/low_mean": 0.009895833441987634,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01569602289237082,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2827.0,
"completions/max_terminated_length": 2827.0,
"completions/mean_length": 2525.75,
"completions/mean_terminated_length": 2525.75,
"completions/min_length": 1651.0,
"completions/min_terminated_length": 1651.0,
"entropy": 0.3053863197565079,
"epoch": 0.00106,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0077009201049805,
"kl": 0.22236851323395967,
"learning_rate": 9.999999464816262e-06,
"loss": -0.1174,
"num_tokens": 2772803.0,
"reward": -0.023000001907348633,
"reward_std": 0.2369600236415863,
"rewards/rollout_reward_func/mean": -0.023000001907348633,
"rewards/rollout_reward_func/std": 0.3612251579761505,
"sampling/importance_sampling_ratio/max": 2.7046000957489014,
"sampling/importance_sampling_ratio/mean": 0.9783951640129089,
"sampling/importance_sampling_ratio/min": 0.022106723859906197,
"sampling/sampling_logp_difference/max": 2.115330696105957,
"sampling/sampling_logp_difference/mean": 0.07762297242879868,
"step": 53,
"step_time": 39.43046704398148
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.3010335825383663,
"epoch": 0.00108,
"grad_norm": 2.0416150093078613,
"kl": 0.24721599649637938,
"learning_rate": 9.999999400001624e-06,
"loss": -0.1213,
"step": 54,
"step_time": 8.363815268006874
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.0016447368543595076,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011410361854359508,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2618.0,
"completions/max_terminated_length": 2618.0,
"completions/mean_length": 2366.34375,
"completions/mean_terminated_length": 2366.34375,
"completions/min_length": 372.0,
"completions/min_terminated_length": 372.0,
"entropy": 0.33765238150954247,
"epoch": 0.0011,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.974804162979126,
"kl": 0.18207258731126785,
"learning_rate": 9.999999331483293e-06,
"loss": 0.0599,
"num_tokens": 2871736.0,
"reward": -0.10312500596046448,
"reward_std": 0.28904440999031067,
"rewards/rollout_reward_func/mean": -0.10312500596046448,
"rewards/rollout_reward_func/std": 0.4227745532989502,
"sampling/importance_sampling_ratio/max": 2.3023340702056885,
"sampling/importance_sampling_ratio/mean": 0.8361672163009644,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.0252685546875,
"sampling/sampling_logp_difference/mean": 0.1051572933793068,
"step": 55,
"step_time": 38.05553440601216
},
{
"clip_ratio/high_max": 0.022820723708719015,
"clip_ratio/high_mean": 0.013363486854359508,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.019222861854359508,
"entropy": 0.33843234553933144,
"epoch": 0.00112,
"grad_norm": 2.0378940105438232,
"kl": 0.17897878028452396,
"learning_rate": 9.999999259261269e-06,
"loss": 0.0585,
"step": 56,
"step_time": 7.9421927529911045
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2746.0,
"completions/max_terminated_length": 2746.0,
"completions/mean_length": 2515.8125,
"completions/mean_terminated_length": 2515.8125,
"completions/min_length": 1867.0,
"completions/min_terminated_length": 1867.0,
"entropy": 0.32060882076621056,
"epoch": 0.00114,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0523271560668945,
"kl": 0.23650522576645017,
"learning_rate": 9.999999183335551e-06,
"loss": 0.1251,
"num_tokens": 2976047.0,
"reward": -0.0755000039935112,
"reward_std": 0.1690390408039093,
"rewards/rollout_reward_func/mean": -0.0755000039935112,
"rewards/rollout_reward_func/std": 0.2713806927204132,
"sampling/importance_sampling_ratio/max": 2.124682664871216,
"sampling/importance_sampling_ratio/mean": 0.9381376504898071,
"sampling/importance_sampling_ratio/min": 0.15374599397182465,
"sampling/sampling_logp_difference/max": 1.6065802574157715,
"sampling/sampling_logp_difference/mean": 0.07379056513309479,
"step": 57,
"step_time": 40.01717420799832
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.3272933140397072,
"epoch": 0.00116,
"grad_norm": 1.6893479824066162,
"kl": 0.20167131815105677,
"learning_rate": 9.999999103706142e-06,
"loss": 0.1206,
"step": 58,
"step_time": 8.224727661014185
},
{
"clip_ratio/high_max": 0.016927083488553762,
"clip_ratio/high_mean": 0.008463541744276881,
"clip_ratio/low_mean": 0.003791360300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012254901928827167,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2738.0,
"completions/max_terminated_length": 2738.0,
"completions/mean_length": 2365.625,
"completions/mean_terminated_length": 2365.625,
"completions/min_length": 379.0,
"completions/min_terminated_length": 379.0,
"entropy": 0.328445702791214,
"epoch": 0.00118,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7868247032165527,
"kl": 0.2537320605479181,
"learning_rate": 9.999999020373038e-06,
"loss": -0.0306,
"num_tokens": 3074895.0,
"reward": -0.13575001060962677,
"reward_std": 0.2329138219356537,
"rewards/rollout_reward_func/mean": -0.13575001060962677,
"rewards/rollout_reward_func/std": 0.3464026153087616,
"sampling/importance_sampling_ratio/max": 1.9202526807785034,
"sampling/importance_sampling_ratio/mean": 0.8327348232269287,
"sampling/importance_sampling_ratio/min": 0.11179676651954651,
"sampling/sampling_logp_difference/max": 1.572983741760254,
"sampling/sampling_logp_difference/mean": 0.07091033458709717,
"step": 59,
"step_time": 37.41163979900739
},
{
"clip_ratio/high_max": 0.04427083348855376,
"clip_ratio/high_mean": 0.02408854174427688,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026041666977107525,
"entropy": 0.3374003656208515,
"epoch": 0.0012,
"grad_norm": 1.3850635290145874,
"kl": 0.21838521771132946,
"learning_rate": 9.999998933336242e-06,
"loss": -0.0349,
"step": 60,
"step_time": 8.157472340019012
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2783.0,
"completions/max_terminated_length": 2783.0,
"completions/mean_length": 2484.78125,
"completions/mean_terminated_length": 2484.78125,
"completions/min_length": 1034.0,
"completions/min_terminated_length": 1034.0,
"entropy": 0.3523460924625397,
"epoch": 0.00122,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1732754707336426,
"kl": 0.06509718182496727,
"learning_rate": 9.999998842595754e-06,
"loss": -0.0152,
"num_tokens": 3178521.0,
"reward": -0.08306249976158142,
"reward_std": 0.19620290398597717,
"rewards/rollout_reward_func/mean": -0.08306249976158142,
"rewards/rollout_reward_func/std": 0.258004754781723,
"sampling/importance_sampling_ratio/max": 2.3492777347564697,
"sampling/importance_sampling_ratio/mean": 1.0118275880813599,
"sampling/importance_sampling_ratio/min": 0.2545172870159149,
"sampling/sampling_logp_difference/max": 0.9898586273193359,
"sampling/sampling_logp_difference/mean": 0.05905335396528244,
"step": 61,
"step_time": 41.6817599410133
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.016276041977107525,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.027994791977107525,
"entropy": 0.35907527431845665,
"epoch": 0.00124,
"grad_norm": 1.8089604377746582,
"kl": 0.07727690786123276,
"learning_rate": 9.999998748151573e-06,
"loss": -0.0186,
"step": 62,
"step_time": 8.292622140004823
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2798.0,
"completions/max_terminated_length": 2798.0,
"completions/mean_length": 2538.625,
"completions/mean_terminated_length": 2538.625,
"completions/min_length": 2284.0,
"completions/min_terminated_length": 2284.0,
"entropy": 0.3591052442789078,
"epoch": 0.00126,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.542479157447815,
"kl": 0.18014016561210155,
"learning_rate": 9.999998650003697e-06,
"loss": -0.123,
"num_tokens": 3283395.0,
"reward": 0.009062500670552254,
"reward_std": 0.06245460733771324,
"rewards/rollout_reward_func/mean": 0.009062500670552254,
"rewards/rollout_reward_func/std": 0.06502402573823929,
"sampling/importance_sampling_ratio/max": 2.2381744384765625,
"sampling/importance_sampling_ratio/mean": 0.9013581871986389,
"sampling/importance_sampling_ratio/min": 0.09256737679243088,
"sampling/sampling_logp_difference/max": 1.322446346282959,
"sampling/sampling_logp_difference/mean": 0.07073262333869934,
"step": 63,
"step_time": 41.169668218979496
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.3636559210717678,
"epoch": 0.00128,
"grad_norm": 2.6269946098327637,
"kl": 0.17624082788825035,
"learning_rate": 9.999998548152132e-06,
"loss": -0.125,
"step": 64,
"step_time": 8.296477131996653
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2689.0,
"completions/max_terminated_length": 2689.0,
"completions/mean_length": 2548.90625,
"completions/mean_terminated_length": 2548.90625,
"completions/min_length": 2301.0,
"completions/min_terminated_length": 2301.0,
"entropy": 0.3906066454946995,
"epoch": 0.0013,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.987762212753296,
"kl": 0.13116181548684835,
"learning_rate": 9.999998442596872e-06,
"loss": 0.016,
"num_tokens": 3388504.0,
"reward": -0.021375000476837158,
"reward_std": 0.125992089509964,
"rewards/rollout_reward_func/mean": -0.021375000476837158,
"rewards/rollout_reward_func/std": 0.19460426270961761,
"sampling/importance_sampling_ratio/max": 2.4538660049438477,
"sampling/importance_sampling_ratio/mean": 0.9186806082725525,
"sampling/importance_sampling_ratio/min": 0.14261139929294586,
"sampling/sampling_logp_difference/max": 1.1358554363250732,
"sampling/sampling_logp_difference/mean": 0.07268854975700378,
"step": 65,
"step_time": 40.21691108200321
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.01953125,
"entropy": 0.3885869197547436,
"epoch": 0.00132,
"grad_norm": 2.8780970573425293,
"kl": 0.13104048231616616,
"learning_rate": 9.999998333337923e-06,
"loss": 0.0126,
"step": 66,
"step_time": 8.100458256005368
},
{
"clip_ratio/high_max": 0.020833333488553762,
"clip_ratio/high_mean": 0.01627604174427688,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01627604174427688,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2643.0,
"completions/max_terminated_length": 2643.0,
"completions/mean_length": 2450.53125,
"completions/mean_terminated_length": 2450.53125,
"completions/min_length": 1017.0,
"completions/min_terminated_length": 1017.0,
"entropy": 0.3505115546286106,
"epoch": 0.00134,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.603503704071045,
"kl": 0.17773088440299034,
"learning_rate": 9.99999822037528e-06,
"loss": -0.0312,
"num_tokens": 3490536.0,
"reward": -0.06999999284744263,
"reward_std": 0.17571845650672913,
"rewards/rollout_reward_func/mean": -0.06999999284744263,
"rewards/rollout_reward_func/std": 0.2663976848125458,
"sampling/importance_sampling_ratio/max": 2.7864060401916504,
"sampling/importance_sampling_ratio/mean": 0.9576973915100098,
"sampling/importance_sampling_ratio/min": 0.23718009889125824,
"sampling/sampling_logp_difference/max": 1.3010015487670898,
"sampling/sampling_logp_difference/mean": 0.07168266922235489,
"step": 67,
"step_time": 40.63574154300295
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.3506194017827511,
"epoch": 0.00136,
"grad_norm": 1.6443591117858887,
"kl": 0.1914054024964571,
"learning_rate": 9.999998103708944e-06,
"loss": -0.0329,
"step": 68,
"step_time": 7.992264769010944
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2775.0,
"completions/max_terminated_length": 2775.0,
"completions/mean_length": 2423.375,
"completions/mean_terminated_length": 2423.375,
"completions/min_length": 609.0,
"completions/min_terminated_length": 609.0,
"entropy": 0.35105321556329727,
"epoch": 0.00138,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6509017944335938,
"kl": 0.13090949086472392,
"learning_rate": 9.999997983338918e-06,
"loss": 0.0782,
"num_tokens": 3591770.0,
"reward": -0.05456249788403511,
"reward_std": 0.1812342405319214,
"rewards/rollout_reward_func/mean": -0.05456249788403511,
"rewards/rollout_reward_func/std": 0.2536933422088623,
"sampling/importance_sampling_ratio/max": 2.6188161373138428,
"sampling/importance_sampling_ratio/mean": 0.9236185550689697,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9868068695068359,
"sampling/sampling_logp_difference/mean": 0.07983951270580292,
"step": 69,
"step_time": 40.37906555600057
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.005744485300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015510110184550285,
"entropy": 0.35209859535098076,
"epoch": 0.0014,
"grad_norm": 1.7578048706054688,
"kl": 0.13869365211576223,
"learning_rate": 9.999997859265198e-06,
"loss": 0.0777,
"step": 70,
"step_time": 8.326206992009247
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005468750023283064,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007421875023283064,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2720.0,
"completions/max_terminated_length": 2720.0,
"completions/mean_length": 2578.5,
"completions/mean_terminated_length": 2578.5,
"completions/min_length": 2418.0,
"completions/min_terminated_length": 2418.0,
"entropy": 0.4290778189897537,
"epoch": 0.00142,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7589776515960693,
"kl": 0.14776458498090506,
"learning_rate": 9.999997731487788e-06,
"loss": 0.114,
"num_tokens": 3697547.0,
"reward": 0.012937500141561031,
"reward_std": 0.07223817706108093,
"rewards/rollout_reward_func/mean": 0.012937500141561031,
"rewards/rollout_reward_func/std": 0.07497953623533249,
"sampling/importance_sampling_ratio/max": 2.384115695953369,
"sampling/importance_sampling_ratio/mean": 1.0900431871414185,
"sampling/importance_sampling_ratio/min": 0.3165396749973297,
"sampling/sampling_logp_difference/max": 1.0836691856384277,
"sampling/sampling_logp_difference/mean": 0.07604964077472687,
"step": 71,
"step_time": 41.12706470600824
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.4288586899638176,
"epoch": 0.00144,
"grad_norm": 2.7190988063812256,
"kl": 0.13816983718425035,
"learning_rate": 9.999997600006685e-06,
"loss": 0.1101,
"step": 72,
"step_time": 8.669852263999928
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2775.0,
"completions/max_terminated_length": 2775.0,
"completions/mean_length": 2492.09375,
"completions/mean_terminated_length": 2492.09375,
"completions/min_length": 1328.0,
"completions/min_terminated_length": 1328.0,
"entropy": 0.3525308780372143,
"epoch": 0.00146,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7203540802001953,
"kl": 0.1596079389564693,
"learning_rate": 9.999997464821892e-06,
"loss": -0.0414,
"num_tokens": 3800398.0,
"reward": -0.000937499338760972,
"reward_std": 0.16524627804756165,
"rewards/rollout_reward_func/mean": -0.000937499338760972,
"rewards/rollout_reward_func/std": 0.2613305449485779,
"sampling/importance_sampling_ratio/max": 2.968574047088623,
"sampling/importance_sampling_ratio/mean": 1.0997503995895386,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0971083641052246,
"sampling/sampling_logp_difference/mean": 0.07693592458963394,
"step": 73,
"step_time": 39.97552160199848
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.3504711091518402,
"epoch": 0.00148,
"grad_norm": 2.504063129425049,
"kl": 0.1592545616440475,
"learning_rate": 9.999997325933409e-06,
"loss": -0.0468,
"step": 74,
"step_time": 8.246904959989479
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2716.0,
"completions/max_terminated_length": 2716.0,
"completions/mean_length": 2496.875,
"completions/mean_terminated_length": 2496.875,
"completions/min_length": 1654.0,
"completions/min_terminated_length": 1654.0,
"entropy": 0.35697900131344795,
"epoch": 0.0015,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.647608518600464,
"kl": 0.0990099674090743,
"learning_rate": 9.999997183341233e-06,
"loss": 0.0068,
"num_tokens": 3903870.0,
"reward": -0.09281250834465027,
"reward_std": 0.1840982586145401,
"rewards/rollout_reward_func/mean": -0.09281250834465027,
"rewards/rollout_reward_func/std": 0.3194171190261841,
"sampling/importance_sampling_ratio/max": 2.5169003009796143,
"sampling/importance_sampling_ratio/mean": 1.1982736587524414,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.017937421798706,
"sampling/sampling_logp_difference/mean": 0.05824393406510353,
"step": 75,
"step_time": 39.21068185700278
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.3578021042048931,
"epoch": 0.00152,
"grad_norm": 2.5704877376556396,
"kl": 0.0889095813035965,
"learning_rate": 9.999997037045365e-06,
"loss": -0.0003,
"step": 76,
"step_time": 8.127733456996793
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2657.0,
"completions/max_terminated_length": 2657.0,
"completions/mean_length": 2542.59375,
"completions/mean_terminated_length": 2542.59375,
"completions/min_length": 2408.0,
"completions/min_terminated_length": 2408.0,
"entropy": 0.3645514212548733,
"epoch": 0.00154,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7248449325561523,
"kl": 0.09846360562369227,
"learning_rate": 9.999996887045808e-06,
"loss": -0.027,
"num_tokens": 4008956.0,
"reward": -0.0031249993480741978,
"reward_std": 0.06556794792413712,
"rewards/rollout_reward_func/mean": -0.0031249993480741978,
"rewards/rollout_reward_func/std": 0.0677013099193573,
"sampling/importance_sampling_ratio/max": 2.925798177719116,
"sampling/importance_sampling_ratio/mean": 1.2111601829528809,
"sampling/importance_sampling_ratio/min": 0.17253729701042175,
"sampling/sampling_logp_difference/max": 1.6732475757598877,
"sampling/sampling_logp_difference/mean": 0.06314364075660706,
"step": 77,
"step_time": 41.08125996801391
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.36346615105867386,
"epoch": 0.00156,
"grad_norm": 2.5180888175964355,
"kl": 0.0976226981729269,
"learning_rate": 9.99999673334256e-06,
"loss": -0.0306,
"step": 78,
"step_time": 8.931478123995475
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2696.0,
"completions/max_terminated_length": 2696.0,
"completions/mean_length": 2443.71875,
"completions/mean_terminated_length": 2443.71875,
"completions/min_length": 1264.0,
"completions/min_terminated_length": 1264.0,
"entropy": 0.37226664274930954,
"epoch": 0.00158,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6087464094161987,
"kl": 0.12296244129538536,
"learning_rate": 9.99999657593562e-06,
"loss": -0.0621,
"num_tokens": 4110538.0,
"reward": 0.0046875025145709515,
"reward_std": 0.1708545833826065,
"rewards/rollout_reward_func/mean": 0.0046875025145709515,
"rewards/rollout_reward_func/std": 0.2877609431743622,
"sampling/importance_sampling_ratio/max": 2.034745931625366,
"sampling/importance_sampling_ratio/mean": 0.9870630502700806,
"sampling/importance_sampling_ratio/min": 0.13036122918128967,
"sampling/sampling_logp_difference/max": 1.4829730987548828,
"sampling/sampling_logp_difference/mean": 0.0670691505074501,
"step": 79,
"step_time": 39.7622713850069
},
{
"clip_ratio/high_max": 0.011600378900766373,
"clip_ratio/high_mean": 0.005800189450383186,
"clip_ratio/low_mean": 0.016295553650707006,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.02209574286825955,
"entropy": 0.3671272359788418,
"epoch": 0.0016,
"grad_norm": 1.653118371963501,
"kl": 0.13659503497183323,
"learning_rate": 9.99999641482499e-06,
"loss": -0.0661,
"step": 80,
"step_time": 8.155490196986648
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2723.0,
"completions/max_terminated_length": 2723.0,
"completions/mean_length": 2522.65625,
"completions/mean_terminated_length": 2522.65625,
"completions/min_length": 1634.0,
"completions/min_terminated_length": 1634.0,
"entropy": 0.32682304456830025,
"epoch": 0.00162,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6380876302719116,
"kl": 0.11479341750964522,
"learning_rate": 9.999996250010671e-06,
"loss": -0.0309,
"num_tokens": 4214814.0,
"reward": 0.0560000017285347,
"reward_std": 0.2510544955730438,
"rewards/rollout_reward_func/mean": 0.0560000017285347,
"rewards/rollout_reward_func/std": 0.35488805174827576,
"sampling/importance_sampling_ratio/max": 2.08979868888855,
"sampling/importance_sampling_ratio/mean": 1.054112434387207,
"sampling/importance_sampling_ratio/min": 0.2209986299276352,
"sampling/sampling_logp_difference/max": 1.3254785537719727,
"sampling/sampling_logp_difference/mean": 0.04853574186563492,
"step": 81,
"step_time": 39.71311388498725
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.01171875,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.017578125,
"entropy": 0.3208409249782562,
"epoch": 0.00164,
"grad_norm": 1.6973153352737427,
"kl": 0.11302056722342968,
"learning_rate": 9.999996081492662e-06,
"loss": -0.0305,
"step": 82,
"step_time": 8.233296020996931
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2735.0,
"completions/max_terminated_length": 2735.0,
"completions/mean_length": 2485.5625,
"completions/mean_terminated_length": 2485.5625,
"completions/min_length": 403.0,
"completions/min_terminated_length": 403.0,
"entropy": 0.34682078659534454,
"epoch": 0.00166,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4546597003936768,
"kl": 0.26240994967520237,
"learning_rate": 9.999995909270962e-06,
"loss": 0.0636,
"num_tokens": 4318021.0,
"reward": -0.08749999105930328,
"reward_std": 0.24293166399002075,
"rewards/rollout_reward_func/mean": -0.08749999105930328,
"rewards/rollout_reward_func/std": 0.3317111134529114,
"sampling/importance_sampling_ratio/max": 2.534839391708374,
"sampling/importance_sampling_ratio/mean": 0.9597312211990356,
"sampling/importance_sampling_ratio/min": 0.14286302030086517,
"sampling/sampling_logp_difference/max": 1.5682034492492676,
"sampling/sampling_logp_difference/mean": 0.07494957745075226,
"step": 83,
"step_time": 40.59216376800032
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.3458695523440838,
"epoch": 0.00168,
"grad_norm": 1.3503389358520508,
"kl": 0.2947566229850054,
"learning_rate": 9.999995733345573e-06,
"loss": 0.0628,
"step": 84,
"step_time": 8.696758408012101
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2717.0,
"completions/max_terminated_length": 2717.0,
"completions/mean_length": 2553.5,
"completions/mean_terminated_length": 2553.5,
"completions/min_length": 2408.0,
"completions/min_terminated_length": 2408.0,
"entropy": 0.34082989022135735,
"epoch": 0.0017,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7237926721572876,
"kl": 0.215694485232234,
"learning_rate": 9.999995553716494e-06,
"loss": -0.0164,
"num_tokens": 4423460.0,
"reward": -0.0023124990984797478,
"reward_std": 0.05388234183192253,
"rewards/rollout_reward_func/mean": -0.0023124990984797478,
"rewards/rollout_reward_func/std": 0.07007481157779694,
"sampling/importance_sampling_ratio/max": 2.3099896907806396,
"sampling/importance_sampling_ratio/mean": 0.9558588266372681,
"sampling/importance_sampling_ratio/min": 0.09475265443325043,
"sampling/sampling_logp_difference/max": 1.84321129322052,
"sampling/sampling_logp_difference/mean": 0.06895856559276581,
"step": 85,
"step_time": 41.61511501099449
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.3418670929968357,
"epoch": 0.00172,
"grad_norm": 1.680446743965149,
"kl": 0.21228844951838255,
"learning_rate": 9.999995370383725e-06,
"loss": -0.0208,
"step": 86,
"step_time": 8.200968577024469
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2714.0,
"completions/max_terminated_length": 2714.0,
"completions/mean_length": 2394.21875,
"completions/mean_terminated_length": 2394.21875,
"completions/min_length": 1009.0,
"completions/min_terminated_length": 1009.0,
"entropy": 0.33990855142474174,
"epoch": 0.00174,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7447688579559326,
"kl": 0.22580334031954408,
"learning_rate": 9.999995183347268e-06,
"loss": -0.0144,
"num_tokens": 4523582.0,
"reward": -0.10875000059604645,
"reward_std": 0.17445093393325806,
"rewards/rollout_reward_func/mean": -0.10875000059604645,
"rewards/rollout_reward_func/std": 0.2984124720096588,
"sampling/importance_sampling_ratio/max": 1.6535972356796265,
"sampling/importance_sampling_ratio/mean": 0.8531072735786438,
"sampling/importance_sampling_ratio/min": 0.1560264229774475,
"sampling/sampling_logp_difference/max": 1.3071918487548828,
"sampling/sampling_logp_difference/mean": 0.06737589836120605,
"step": 87,
"step_time": 38.91260410401446
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.34269775450229645,
"epoch": 0.00176,
"grad_norm": 1.6040964126586914,
"kl": 0.21285296906717122,
"learning_rate": 9.999994992607122e-06,
"loss": -0.0147,
"step": 88,
"step_time": 8.108735417001299
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2733.0,
"completions/max_terminated_length": 2733.0,
"completions/mean_length": 2498.4375,
"completions/mean_terminated_length": 2498.4375,
"completions/min_length": 1149.0,
"completions/min_terminated_length": 1149.0,
"entropy": 0.41232921928167343,
"epoch": 0.00178,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.217099666595459,
"kl": 0.15504596289247274,
"learning_rate": 9.999994798163286e-06,
"loss": -0.0326,
"num_tokens": 4627049.0,
"reward": -0.045250002294778824,
"reward_std": 0.1063895896077156,
"rewards/rollout_reward_func/mean": -0.045250002294778824,
"rewards/rollout_reward_func/std": 0.19846490025520325,
"sampling/importance_sampling_ratio/max": 1.697726845741272,
"sampling/importance_sampling_ratio/mean": 0.9777263402938843,
"sampling/importance_sampling_ratio/min": 0.22058866918087006,
"sampling/sampling_logp_difference/max": 1.5214389562606812,
"sampling/sampling_logp_difference/mean": 0.05762651562690735,
"step": 89,
"step_time": 41.47546880201844
},
{
"clip_ratio/high_max": 0.025240384973585606,
"clip_ratio/high_mean": 0.016526442486792803,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026292067486792803,
"entropy": 0.40969282388687134,
"epoch": 0.0018,
"grad_norm": 2.034275770187378,
"kl": 0.1437628036364913,
"learning_rate": 9.999994600015764e-06,
"loss": -0.0348,
"step": 90,
"step_time": 8.164387071010424
},
{
"clip_ratio/high_max": 0.020833333488553762,
"clip_ratio/high_mean": 0.012369791744276881,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014322916744276881,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2689.0,
"completions/max_terminated_length": 2689.0,
"completions/mean_length": 2442.15625,
"completions/mean_terminated_length": 2442.15625,
"completions/min_length": 403.0,
"completions/min_terminated_length": 403.0,
"entropy": 0.3805011622607708,
"epoch": 0.00182,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4305521249771118,
"kl": 0.1604838757775724,
"learning_rate": 9.99999439816455e-06,
"loss": -0.1169,
"num_tokens": 4728945.0,
"reward": -0.07343750447034836,
"reward_std": 0.24067096412181854,
"rewards/rollout_reward_func/mean": -0.07343750447034836,
"rewards/rollout_reward_func/std": 0.3149354159832001,
"sampling/importance_sampling_ratio/max": 2.548600673675537,
"sampling/importance_sampling_ratio/mean": 0.9048938155174255,
"sampling/importance_sampling_ratio/min": 0.09455844014883041,
"sampling/sampling_logp_difference/max": 1.6043697595596313,
"sampling/sampling_logp_difference/mean": 0.0698060691356659,
"step": 91,
"step_time": 40.740327625004284
},
{
"clip_ratio/high_max": 0.024739583488553762,
"clip_ratio/high_mean": 0.014322916744276881,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014322916744276881,
"entropy": 0.3806532882153988,
"epoch": 0.00184,
"grad_norm": 1.4468754529953003,
"kl": 0.16916062962263823,
"learning_rate": 9.999994192609649e-06,
"loss": -0.1191,
"step": 92,
"step_time": 8.171024764007598
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2754.0,
"completions/max_terminated_length": 2754.0,
"completions/mean_length": 2552.75,
"completions/mean_terminated_length": 2552.75,
"completions/min_length": 2287.0,
"completions/min_terminated_length": 2287.0,
"entropy": 0.3542626202106476,
"epoch": 0.00186,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5995057821273804,
"kl": 0.18657434731721878,
"learning_rate": 9.99999398335106e-06,
"loss": -0.1028,
"num_tokens": 4833871.0,
"reward": 0.008937498554587364,
"reward_std": 0.05844488739967346,
"rewards/rollout_reward_func/mean": 0.008937498554587364,
"rewards/rollout_reward_func/std": 0.06495653092861176,
"sampling/importance_sampling_ratio/max": 1.8651371002197266,
"sampling/importance_sampling_ratio/mean": 0.9047431349754333,
"sampling/importance_sampling_ratio/min": 0.2518555521965027,
"sampling/sampling_logp_difference/max": 1.338068962097168,
"sampling/sampling_logp_difference/mean": 0.05908910930156708,
"step": 93,
"step_time": 41.447705181002675
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.3570367135107517,
"epoch": 0.00188,
"grad_norm": 1.3854937553405762,
"kl": 0.17741655930876732,
"learning_rate": 9.999993770388785e-06,
"loss": -0.1059,
"step": 94,
"step_time": 8.301304157990671
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2725.0,
"completions/max_terminated_length": 2725.0,
"completions/mean_length": 2492.8125,
"completions/mean_terminated_length": 2492.8125,
"completions/min_length": 1657.0,
"completions/min_terminated_length": 1657.0,
"entropy": 0.3486361838877201,
"epoch": 0.0019,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6948275566101074,
"kl": 0.11026190128177404,
"learning_rate": 9.99999355372282e-06,
"loss": 0.0319,
"num_tokens": 4937685.0,
"reward": 0.0045624990016222,
"reward_std": 0.16633789241313934,
"rewards/rollout_reward_func/mean": 0.0045624990016222,
"rewards/rollout_reward_func/std": 0.25424712896347046,
"sampling/importance_sampling_ratio/max": 2.6231071949005127,
"sampling/importance_sampling_ratio/mean": 0.9236529469490051,
"sampling/importance_sampling_ratio/min": 0.2035244256258011,
"sampling/sampling_logp_difference/max": 1.0063461065292358,
"sampling/sampling_logp_difference/mean": 0.05296621471643448,
"step": 95,
"step_time": 40.5261353790047
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.010044642956927419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011997767956927419,
"entropy": 0.3515312075614929,
"epoch": 0.00192,
"grad_norm": 1.670708179473877,
"kl": 0.1139525892212987,
"learning_rate": 9.999993333353169e-06,
"loss": 0.0283,
"step": 96,
"step_time": 8.195346737004002
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2674.0,
"completions/max_terminated_length": 2674.0,
"completions/mean_length": 2518.875,
"completions/mean_terminated_length": 2518.875,
"completions/min_length": 2312.0,
"completions/min_terminated_length": 2312.0,
"entropy": 0.3493756130337715,
"epoch": 0.00194,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.074188470840454,
"kl": 0.12551532126963139,
"learning_rate": 9.999993109279829e-06,
"loss": -0.0301,
"num_tokens": 5041376.0,
"reward": -0.060187503695487976,
"reward_std": 0.17671775817871094,
"rewards/rollout_reward_func/mean": -0.060187503695487976,
"rewards/rollout_reward_func/std": 0.270876407623291,
"sampling/importance_sampling_ratio/max": 2.0745737552642822,
"sampling/importance_sampling_ratio/mean": 1.0269553661346436,
"sampling/importance_sampling_ratio/min": 0.3745565414428711,
"sampling/sampling_logp_difference/max": 1.2888622283935547,
"sampling/sampling_logp_difference/mean": 0.057082515209913254,
"step": 97,
"step_time": 40.23683304199949
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.3484783321619034,
"epoch": 0.00196,
"grad_norm": 1.8363243341445923,
"kl": 0.1370929814875126,
"learning_rate": 9.999992881502803e-06,
"loss": -0.035,
"step": 98,
"step_time": 8.031628980010282
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2773.0,
"completions/max_terminated_length": 2773.0,
"completions/mean_length": 2592.90625,
"completions/mean_terminated_length": 2592.90625,
"completions/min_length": 1891.0,
"completions/min_terminated_length": 1891.0,
"entropy": 0.370455052703619,
"epoch": 0.00198,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6820168495178223,
"kl": 0.14302024198696017,
"learning_rate": 9.999992650022092e-06,
"loss": -0.1369,
"num_tokens": 5148461.0,
"reward": -0.013124998658895493,
"reward_std": 0.13135413825511932,
"rewards/rollout_reward_func/mean": -0.013124998658895493,
"rewards/rollout_reward_func/std": 0.21369919180870056,
"sampling/importance_sampling_ratio/max": 2.303262710571289,
"sampling/importance_sampling_ratio/mean": 0.9167919158935547,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0467338562011719,
"sampling/sampling_logp_difference/mean": 0.0629487931728363,
"step": 99,
"step_time": 40.73835782099923
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.3696703165769577,
"epoch": 0.002,
"grad_norm": 1.7006328105926514,
"kl": 0.13471043622121215,
"learning_rate": 9.999992414837692e-06,
"loss": -0.1399,
"step": 100,
"step_time": 9.27685954599292
},
{
"clip_ratio/high_max": 0.007359307492151856,
"clip_ratio/high_mean": 0.003679653746075928,
"clip_ratio/low_mean": 0.007582720601931214,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011262374348007143,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2720.0,
"completions/max_terminated_length": 2720.0,
"completions/mean_length": 2466.6875,
"completions/mean_terminated_length": 2466.6875,
"completions/min_length": 1575.0,
"completions/min_terminated_length": 1575.0,
"entropy": 0.33331993594765663,
"epoch": 0.00202,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.541299819946289,
"kl": 0.07826073374599218,
"learning_rate": 9.999992175949606e-06,
"loss": -0.043,
"num_tokens": 5250984.0,
"reward": 0.07187500596046448,
"reward_std": 0.3101885914802551,
"rewards/rollout_reward_func/mean": 0.07187500596046448,
"rewards/rollout_reward_func/std": 0.41012537479400635,
"sampling/importance_sampling_ratio/max": 1.7896367311477661,
"sampling/importance_sampling_ratio/mean": 0.9954925775527954,
"sampling/importance_sampling_ratio/min": 0.32469430565834045,
"sampling/sampling_logp_difference/max": 1.0306782722473145,
"sampling/sampling_logp_difference/mean": 0.046852171421051025,
"step": 101,
"step_time": 39.81217833299888
},
{
"clip_ratio/high_max": 0.01997091481462121,
"clip_ratio/high_mean": 0.009985457407310605,
"clip_ratio/low_mean": 0.009929753025062382,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.01991521066520363,
"entropy": 0.33568425104022026,
"epoch": 0.00204,
"grad_norm": 1.5253196954727173,
"kl": 0.0802880369592458,
"learning_rate": 9.999991933357835e-06,
"loss": -0.0472,
"step": 102,
"step_time": 8.12889104099304
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003791360300965607,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2744.0,
"completions/max_terminated_length": 2744.0,
"completions/mean_length": 2527.25,
"completions/mean_terminated_length": 2527.25,
"completions/min_length": 1738.0,
"completions/min_terminated_length": 1738.0,
"entropy": 0.34852392226457596,
"epoch": 0.00206,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3484529256820679,
"kl": 0.14044185122475028,
"learning_rate": 9.999991687062379e-06,
"loss": -0.2352,
"num_tokens": 5355804.0,
"reward": -0.03231249749660492,
"reward_std": 0.11947058886289597,
"rewards/rollout_reward_func/mean": -0.03231249749660492,
"rewards/rollout_reward_func/std": 0.195997416973114,
"sampling/importance_sampling_ratio/max": 2.030506134033203,
"sampling/importance_sampling_ratio/mean": 0.9182260632514954,
"sampling/importance_sampling_ratio/min": 0.12404513359069824,
"sampling/sampling_logp_difference/max": 1.0576603412628174,
"sampling/sampling_logp_difference/mean": 0.053935110569000244,
"step": 103,
"step_time": 40.22734483000386
},
{
"clip_ratio/high_max": 0.008370535913854837,
"clip_ratio/high_mean": 0.004185267956927419,
"clip_ratio/low_mean": 0.011488970601931214,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015674238558858633,
"entropy": 0.34627125039696693,
"epoch": 0.00208,
"grad_norm": 1.265508770942688,
"kl": 0.15802726102992892,
"learning_rate": 9.999991437063234e-06,
"loss": -0.2393,
"step": 104,
"step_time": 8.25907339999685
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2740.0,
"completions/max_terminated_length": 2740.0,
"completions/mean_length": 2569.4375,
"completions/mean_terminated_length": 2569.4375,
"completions/min_length": 2413.0,
"completions/min_terminated_length": 2413.0,
"entropy": 0.3400811739265919,
"epoch": 0.0021,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.734525203704834,
"kl": 0.14485831279307604,
"learning_rate": 9.999991183360406e-06,
"loss": -0.0094,
"num_tokens": 5461781.0,
"reward": -0.05031249672174454,
"reward_std": 0.12431386858224869,
"rewards/rollout_reward_func/mean": -0.05031249672174454,
"rewards/rollout_reward_func/std": 0.25011590123176575,
"sampling/importance_sampling_ratio/max": 2.130937337875366,
"sampling/importance_sampling_ratio/mean": 1.0058907270431519,
"sampling/importance_sampling_ratio/min": 0.20956012606620789,
"sampling/sampling_logp_difference/max": 1.1093370914459229,
"sampling/sampling_logp_difference/mean": 0.05707106366753578,
"step": 105,
"step_time": 41.4205684740009
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.33539753779768944,
"epoch": 0.00212,
"grad_norm": 1.7177813053131104,
"kl": 0.14945369446650147,
"learning_rate": 9.999990925953894e-06,
"loss": -0.0119,
"step": 106,
"step_time": 8.722428562010464
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2650.0,
"completions/max_terminated_length": 2650.0,
"completions/mean_length": 2511.96875,
"completions/mean_terminated_length": 2511.96875,
"completions/min_length": 2037.0,
"completions/min_terminated_length": 2037.0,
"entropy": 0.3455476462841034,
"epoch": 0.00214,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5704108476638794,
"kl": 0.14504612796008587,
"learning_rate": 9.999990664843696e-06,
"loss": -0.1358,
"num_tokens": 5565383.0,
"reward": -0.06875000149011612,
"reward_std": 0.15667273104190826,
"rewards/rollout_reward_func/mean": -0.06875000149011612,
"rewards/rollout_reward_func/std": 0.2505837082862854,
"sampling/importance_sampling_ratio/max": 1.963279128074646,
"sampling/importance_sampling_ratio/mean": 0.8803051114082336,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3123910427093506,
"sampling/sampling_logp_difference/mean": 0.06202516704797745,
"step": 107,
"step_time": 39.60044279100839
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017578125,
"entropy": 0.338915441185236,
"epoch": 0.00216,
"grad_norm": 1.1836293935775757,
"kl": 0.18176286108791828,
"learning_rate": 9.999990400029814e-06,
"loss": -0.1392,
"step": 108,
"step_time": 8.004781586998433
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2700.0,
"completions/max_terminated_length": 2700.0,
"completions/mean_length": 2556.59375,
"completions/mean_terminated_length": 2556.59375,
"completions/min_length": 2298.0,
"completions/min_terminated_length": 2298.0,
"entropy": 0.3399314768612385,
"epoch": 0.00218,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8767611980438232,
"kl": 0.19613626692444086,
"learning_rate": 9.999990131512245e-06,
"loss": -0.1203,
"num_tokens": 5671035.0,
"reward": 0.030062498524785042,
"reward_std": 0.06542174518108368,
"rewards/rollout_reward_func/mean": 0.030062498524785042,
"rewards/rollout_reward_func/std": 0.06506668031215668,
"sampling/importance_sampling_ratio/max": 2.1095526218414307,
"sampling/importance_sampling_ratio/mean": 0.9953502416610718,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2929037809371948,
"sampling/sampling_logp_difference/mean": 0.0701940506696701,
"step": 109,
"step_time": 40.07611987899145
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.007697610184550285,
"clip_ratio/low_mean": 0.005744485300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01344209536910057,
"entropy": 0.33449552208185196,
"epoch": 0.0022,
"grad_norm": 1.8064794540405273,
"kl": 0.22378786839544773,
"learning_rate": 9.999989859290995e-06,
"loss": -0.1237,
"step": 110,
"step_time": 8.101552588006598
},
{
"clip_ratio/high_max": 0.022248641354963183,
"clip_ratio/high_mean": 0.013077445677481592,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015030570677481592,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2644.0,
"completions/max_terminated_length": 2644.0,
"completions/mean_length": 2361.90625,
"completions/mean_terminated_length": 2361.90625,
"completions/min_length": 369.0,
"completions/min_terminated_length": 369.0,
"entropy": 0.32380833476781845,
"epoch": 0.00222,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7965515851974487,
"kl": 0.2527701100334525,
"learning_rate": 9.99998958336606e-06,
"loss": -0.067,
"num_tokens": 5770467.0,
"reward": -0.17624998092651367,
"reward_std": 0.35133326053619385,
"rewards/rollout_reward_func/mean": -0.17624998092651367,
"rewards/rollout_reward_func/std": 0.3963848650455475,
"sampling/importance_sampling_ratio/max": 2.2881264686584473,
"sampling/importance_sampling_ratio/mean": 0.8733131289482117,
"sampling/importance_sampling_ratio/min": 0.2075604349374771,
"sampling/sampling_logp_difference/max": 1.3090085983276367,
"sampling/sampling_logp_difference/mean": 0.07093898952007294,
"step": 111,
"step_time": 38.757764058995235
},
{
"clip_ratio/high_max": 0.017968750093132257,
"clip_ratio/high_mean": 0.010937499813735485,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014843749813735485,
"entropy": 0.32395435497164726,
"epoch": 0.00224,
"grad_norm": 1.7358996868133545,
"kl": 0.27141671627759933,
"learning_rate": 9.999989303737442e-06,
"loss": -0.0699,
"step": 112,
"step_time": 8.030561457002477
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2722.0,
"completions/max_terminated_length": 2722.0,
"completions/mean_length": 2539.1875,
"completions/mean_terminated_length": 2539.1875,
"completions/min_length": 2354.0,
"completions/min_terminated_length": 2354.0,
"entropy": 0.3293020986020565,
"epoch": 0.00226,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9367733001708984,
"kl": 0.30573761742562056,
"learning_rate": 9.999989020405141e-06,
"loss": 0.0804,
"num_tokens": 5875527.0,
"reward": -0.009312499314546585,
"reward_std": 0.06325964629650116,
"rewards/rollout_reward_func/mean": -0.009312499314546585,
"rewards/rollout_reward_func/std": 0.07523487508296967,
"sampling/importance_sampling_ratio/max": 2.04264760017395,
"sampling/importance_sampling_ratio/mean": 0.9071935415267944,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2561190128326416,
"sampling/sampling_logp_difference/mean": 0.06177542358636856,
"step": 113,
"step_time": 39.97751610999694
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.33135970309376717,
"epoch": 0.00228,
"grad_norm": 1.8863413333892822,
"kl": 0.2953490880317986,
"learning_rate": 9.999988733369157e-06,
"loss": 0.08,
"step": 114,
"step_time": 8.203803888012771
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2766.0,
"completions/max_terminated_length": 2766.0,
"completions/mean_length": 2594.53125,
"completions/mean_terminated_length": 2594.53125,
"completions/min_length": 2435.0,
"completions/min_terminated_length": 2435.0,
"entropy": 0.312834270298481,
"epoch": 0.0023,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5824787616729736,
"kl": 0.19389449525624514,
"learning_rate": 9.999988442629489e-06,
"loss": 0.0764,
"num_tokens": 5982430.0,
"reward": 0.011749999597668648,
"reward_std": 0.06366527080535889,
"rewards/rollout_reward_func/mean": 0.011749999597668648,
"rewards/rollout_reward_func/std": 0.06497592478990555,
"sampling/importance_sampling_ratio/max": 2.5081026554107666,
"sampling/importance_sampling_ratio/mean": 1.0248709917068481,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7471835613250732,
"sampling/sampling_logp_difference/mean": 0.06724181771278381,
"step": 115,
"step_time": 40.59423167099885
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.01953125,
"entropy": 0.3156875427812338,
"epoch": 0.00232,
"grad_norm": 1.6595027446746826,
"kl": 0.18415676709264517,
"learning_rate": 9.99998814818614e-06,
"loss": 0.074,
"step": 116,
"step_time": 8.306354760017712
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2738.0,
"completions/max_terminated_length": 2738.0,
"completions/mean_length": 2564.59375,
"completions/mean_terminated_length": 2564.59375,
"completions/min_length": 2383.0,
"completions/min_terminated_length": 2383.0,
"entropy": 0.3278175815939903,
"epoch": 0.00234,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.980755090713501,
"kl": 0.09133347170427442,
"learning_rate": 9.999987850039108e-06,
"loss": -0.0312,
"num_tokens": 6088349.0,
"reward": 0.004375000484287739,
"reward_std": 0.041993025690317154,
"rewards/rollout_reward_func/mean": 0.004375000484287739,
"rewards/rollout_reward_func/std": 0.04905148595571518,
"sampling/importance_sampling_ratio/max": 2.285834789276123,
"sampling/importance_sampling_ratio/mean": 1.0151498317718506,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.7270146608352661,
"sampling/sampling_logp_difference/mean": 0.0533914715051651,
"step": 117,
"step_time": 41.6384668170067
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.3312051221728325,
"epoch": 0.00236,
"grad_norm": 2.001918077468872,
"kl": 0.09348489623516798,
"learning_rate": 9.999987548188395e-06,
"loss": -0.034,
"step": 118,
"step_time": 8.222705290994782
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2696.0,
"completions/max_terminated_length": 2696.0,
"completions/mean_length": 2553.4375,
"completions/mean_terminated_length": 2553.4375,
"completions/min_length": 2060.0,
"completions/min_terminated_length": 2060.0,
"entropy": 0.32508161664009094,
"epoch": 0.00238,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7715137004852295,
"kl": 0.14226396940648556,
"learning_rate": 9.999987242634e-06,
"loss": -0.0903,
"num_tokens": 6193617.0,
"reward": -0.0234375,
"reward_std": 0.10268107056617737,
"rewards/rollout_reward_func/mean": -0.0234375,
"rewards/rollout_reward_func/std": 0.17849929630756378,
"sampling/importance_sampling_ratio/max": 2.5179085731506348,
"sampling/importance_sampling_ratio/mean": 1.0467031002044678,
"sampling/importance_sampling_ratio/min": 0.23064902424812317,
"sampling/sampling_logp_difference/max": 1.435530662536621,
"sampling/sampling_logp_difference/mean": 0.052883878350257874,
"step": 119,
"step_time": 40.86630449297809
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.32609760016202927,
"epoch": 0.0024,
"grad_norm": 1.6669055223464966,
"kl": 0.14041694393381476,
"learning_rate": 9.999986933375924e-06,
"loss": -0.0943,
"step": 120,
"step_time": 8.225507424009265
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2762.0,
"completions/max_terminated_length": 2762.0,
"completions/mean_length": 2509.9375,
"completions/mean_terminated_length": 2509.9375,
"completions/min_length": 1604.0,
"completions/min_terminated_length": 1604.0,
"entropy": 0.34493784606456757,
"epoch": 0.00242,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7307958602905273,
"kl": 0.20689602987840772,
"learning_rate": 9.999986620414169e-06,
"loss": -0.028,
"num_tokens": 6297378.0,
"reward": 0.004999998956918716,
"reward_std": 0.1766255795955658,
"rewards/rollout_reward_func/mean": 0.004999998956918716,
"rewards/rollout_reward_func/std": 0.28026482462882996,
"sampling/importance_sampling_ratio/max": 2.4770820140838623,
"sampling/importance_sampling_ratio/mean": 1.0248966217041016,
"sampling/importance_sampling_ratio/min": 0.1811486929655075,
"sampling/sampling_logp_difference/max": 1.7409013509750366,
"sampling/sampling_logp_difference/mean": 0.06154333055019379,
"step": 121,
"step_time": 39.13895379099267
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.008091517956927419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011997767724096775,
"entropy": 0.34469663724303246,
"epoch": 0.00244,
"grad_norm": 1.7192891836166382,
"kl": 0.21292951330542564,
"learning_rate": 9.999986303748731e-06,
"loss": -0.0298,
"step": 122,
"step_time": 9.244061531986517
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2753.0,
"completions/max_terminated_length": 2753.0,
"completions/mean_length": 2531.1875,
"completions/mean_terminated_length": 2531.1875,
"completions/min_length": 1462.0,
"completions/min_terminated_length": 1462.0,
"entropy": 0.33442598581314087,
"epoch": 0.00246,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6036715507507324,
"kl": 0.15767440851777792,
"learning_rate": 9.999985983379614e-06,
"loss": -0.0708,
"num_tokens": 6402259.0,
"reward": -0.02025000751018524,
"reward_std": 0.12010502070188522,
"rewards/rollout_reward_func/mean": -0.02025000751018524,
"rewards/rollout_reward_func/std": 0.2211160957813263,
"sampling/importance_sampling_ratio/max": 2.1161069869995117,
"sampling/importance_sampling_ratio/mean": 0.9873582720756531,
"sampling/importance_sampling_ratio/min": 0.13119953870773315,
"sampling/sampling_logp_difference/max": 1.1497358083724976,
"sampling/sampling_logp_difference/mean": 0.06182745844125748,
"step": 123,
"step_time": 40.470048752991715
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.33676064386963844,
"epoch": 0.00248,
"grad_norm": 1.6138527393341064,
"kl": 0.1523943403735757,
"learning_rate": 9.999985659306817e-06,
"loss": -0.0723,
"step": 124,
"step_time": 8.273418343997037
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0022321429569274187,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0022321429569274187,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2735.0,
"completions/max_terminated_length": 2735.0,
"completions/mean_length": 2546.28125,
"completions/mean_terminated_length": 2546.28125,
"completions/min_length": 1617.0,
"completions/min_terminated_length": 1617.0,
"entropy": 0.3453407287597656,
"epoch": 0.0025,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0110528469085693,
"kl": 0.170327290892601,
"learning_rate": 9.999985331530339e-06,
"loss": -0.1445,
"num_tokens": 6507286.0,
"reward": 0.050312504172325134,
"reward_std": 0.11686505377292633,
"rewards/rollout_reward_func/mean": 0.050312504172325134,
"rewards/rollout_reward_func/std": 0.19619746506214142,
"sampling/importance_sampling_ratio/max": 2.439424991607666,
"sampling/importance_sampling_ratio/mean": 0.8885781764984131,
"sampling/importance_sampling_ratio/min": 0.27067622542381287,
"sampling/sampling_logp_difference/max": 1.2129557132720947,
"sampling/sampling_logp_difference/mean": 0.059535570442676544,
"step": 125,
"step_time": 40.06854912801646
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.3458435758948326,
"epoch": 0.00252,
"grad_norm": 2.0043210983276367,
"kl": 0.19776208233088255,
"learning_rate": 9.999985000050181e-06,
"loss": -0.1484,
"step": 126,
"step_time": 8.200360839000496
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2685.0,
"completions/max_terminated_length": 2685.0,
"completions/mean_length": 2440.0625,
"completions/mean_terminated_length": 2440.0625,
"completions/min_length": 663.0,
"completions/min_terminated_length": 663.0,
"entropy": 0.3563276380300522,
"epoch": 0.00254,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.6714493036270142,
"kl": 0.10458008944988251,
"learning_rate": 9.999984664866347e-06,
"loss": -0.0615,
"num_tokens": 6608666.0,
"reward": -0.027812499552965164,
"reward_std": 0.24362991750240326,
"rewards/rollout_reward_func/mean": -0.027812499552965164,
"rewards/rollout_reward_func/std": 0.3529391288757324,
"sampling/importance_sampling_ratio/max": 2.106961727142334,
"sampling/importance_sampling_ratio/mean": 1.0129467248916626,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.7839922904968262,
"sampling/sampling_logp_difference/mean": 0.05322550982236862,
"step": 127,
"step_time": 38.439512436008954
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.35617511719465256,
"epoch": 0.00256,
"grad_norm": 1.5934431552886963,
"kl": 0.10316136293113232,
"learning_rate": 9.999984325978833e-06,
"loss": -0.0651,
"step": 128,
"step_time": 8.525172622001264
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2727.0,
"completions/max_terminated_length": 2727.0,
"completions/mean_length": 2538.875,
"completions/mean_terminated_length": 2538.875,
"completions/min_length": 2183.0,
"completions/min_terminated_length": 2183.0,
"entropy": 0.3401091955602169,
"epoch": 0.00258,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5359450578689575,
"kl": 0.23094982374459505,
"learning_rate": 9.99998398338764e-06,
"loss": 0.0811,
"num_tokens": 6713463.0,
"reward": 0.006437500007450581,
"reward_std": 0.045757949352264404,
"rewards/rollout_reward_func/mean": 0.006437500007450581,
"rewards/rollout_reward_func/std": 0.06285206973552704,
"sampling/importance_sampling_ratio/max": 1.7960189580917358,
"sampling/importance_sampling_ratio/mean": 0.829826831817627,
"sampling/importance_sampling_ratio/min": 0.19573816657066345,
"sampling/sampling_logp_difference/max": 1.1058683395385742,
"sampling/sampling_logp_difference/mean": 0.06072615832090378,
"step": 129,
"step_time": 39.87553574399499
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.01171875,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.021484375,
"entropy": 0.3394704796373844,
"epoch": 0.0026,
"grad_norm": 1.2305470705032349,
"kl": 0.20992374047636986,
"learning_rate": 9.99998363709277e-06,
"loss": 0.0796,
"step": 130,
"step_time": 8.169040031993063
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2687.0,
"completions/max_terminated_length": 2687.0,
"completions/mean_length": 2560.53125,
"completions/mean_terminated_length": 2560.53125,
"completions/min_length": 2391.0,
"completions/min_terminated_length": 2391.0,
"entropy": 0.3369971923530102,
"epoch": 0.00262,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8376662731170654,
"kl": 0.6742809629067779,
"learning_rate": 9.999983287094222e-06,
"loss": -0.1498,
"num_tokens": 6818761.0,
"reward": -0.012437498196959496,
"reward_std": 0.10733288526535034,
"rewards/rollout_reward_func/mean": -0.012437498196959496,
"rewards/rollout_reward_func/std": 0.18053805828094482,
"sampling/importance_sampling_ratio/max": 2.5355255603790283,
"sampling/importance_sampling_ratio/mean": 0.9010251760482788,
"sampling/importance_sampling_ratio/min": 0.15409857034683228,
"sampling/sampling_logp_difference/max": 1.7746939659118652,
"sampling/sampling_logp_difference/mean": 0.06549885869026184,
"step": 131,
"step_time": 40.05877101999795
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017578125,
"entropy": 0.3364224396646023,
"epoch": 0.00264,
"grad_norm": 1.5281078815460205,
"kl": 0.5020047454163432,
"learning_rate": 9.999982933391998e-06,
"loss": -0.1526,
"step": 132,
"step_time": 8.129041669002618
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2657.0,
"completions/max_terminated_length": 2657.0,
"completions/mean_length": 2431.9375,
"completions/mean_terminated_length": 2431.9375,
"completions/min_length": 657.0,
"completions/min_terminated_length": 657.0,
"entropy": 0.3256659470498562,
"epoch": 0.00266,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5615955591201782,
"kl": 0.11903299344703555,
"learning_rate": 9.999982575986095e-06,
"loss": -0.1114,
"num_tokens": 6919971.0,
"reward": 0.049687501043081284,
"reward_std": 0.26549428701400757,
"rewards/rollout_reward_func/mean": 0.049687501043081284,
"rewards/rollout_reward_func/std": 0.3750826418399811,
"sampling/importance_sampling_ratio/max": 2.790414571762085,
"sampling/importance_sampling_ratio/mean": 1.039184808731079,
"sampling/importance_sampling_ratio/min": 0.362961083650589,
"sampling/sampling_logp_difference/max": 0.7517553567886353,
"sampling/sampling_logp_difference/mean": 0.05017857998609543,
"step": 133,
"step_time": 40.47903727200901
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.32507436349987984,
"epoch": 0.00268,
"grad_norm": 1.4362146854400635,
"kl": 0.11510731559246778,
"learning_rate": 9.999982214876516e-06,
"loss": -0.1129,
"step": 134,
"step_time": 8.01312270500057
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2670.0,
"completions/max_terminated_length": 2670.0,
"completions/mean_length": 2546.34375,
"completions/mean_terminated_length": 2546.34375,
"completions/min_length": 2309.0,
"completions/min_terminated_length": 2309.0,
"entropy": 0.340265478938818,
"epoch": 0.0027,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0948877334594727,
"kl": 0.26178126875311136,
"learning_rate": 9.999981850063262e-06,
"loss": 0.0991,
"num_tokens": 7025087.0,
"reward": 0.0013749999925494194,
"reward_std": 0.06908594816923141,
"rewards/rollout_reward_func/mean": 0.0013749999925494194,
"rewards/rollout_reward_func/std": 0.07447785884141922,
"sampling/importance_sampling_ratio/max": 2.6498475074768066,
"sampling/importance_sampling_ratio/mean": 1.0591254234313965,
"sampling/importance_sampling_ratio/min": 0.1885811686515808,
"sampling/sampling_logp_difference/max": 1.3395788669586182,
"sampling/sampling_logp_difference/mean": 0.05880427360534668,
"step": 135,
"step_time": 41.25988252200477
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01953125,
"entropy": 0.3363325670361519,
"epoch": 0.00272,
"grad_norm": 1.810211181640625,
"kl": 0.27428784500807524,
"learning_rate": 9.99998148154633e-06,
"loss": 0.0962,
"step": 136,
"step_time": 8.050127039008657
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2729.0,
"completions/max_terminated_length": 2729.0,
"completions/mean_length": 2506.28125,
"completions/mean_terminated_length": 2506.28125,
"completions/min_length": 2034.0,
"completions/min_terminated_length": 2034.0,
"entropy": 0.3250386714935303,
"epoch": 0.00274,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5487583875656128,
"kl": 0.2483032587915659,
"learning_rate": 9.999981109325725e-06,
"loss": -0.0189,
"num_tokens": 7129006.0,
"reward": -0.02031249925494194,
"reward_std": 0.10975626111030579,
"rewards/rollout_reward_func/mean": -0.02031249925494194,
"rewards/rollout_reward_func/std": 0.1898808479309082,
"sampling/importance_sampling_ratio/max": 1.8540207147598267,
"sampling/importance_sampling_ratio/mean": 0.9523411989212036,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.999842643737793,
"sampling/sampling_logp_difference/mean": 0.048174045979976654,
"step": 137,
"step_time": 40.41093556799751
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.32194047793745995,
"epoch": 0.00276,
"grad_norm": 1.530614972114563,
"kl": 0.25904428493231535,
"learning_rate": 9.999980733401442e-06,
"loss": -0.0239,
"step": 138,
"step_time": 8.193952220994106
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2705.0,
"completions/max_terminated_length": 2705.0,
"completions/mean_length": 2533.875,
"completions/mean_terminated_length": 2533.875,
"completions/min_length": 2022.0,
"completions/min_terminated_length": 2022.0,
"entropy": 0.34514220058918,
"epoch": 0.00278,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8153398036956787,
"kl": 0.20840087812393904,
"learning_rate": 9.999980353773486e-06,
"loss": -0.0015,
"num_tokens": 7233542.0,
"reward": -0.022499997168779373,
"reward_std": 0.11739690601825714,
"rewards/rollout_reward_func/mean": -0.022499997168779373,
"rewards/rollout_reward_func/std": 0.20528501272201538,
"sampling/importance_sampling_ratio/max": 2.392735242843628,
"sampling/importance_sampling_ratio/mean": 0.9636785984039307,
"sampling/importance_sampling_ratio/min": 0.17225754261016846,
"sampling/sampling_logp_difference/max": 1.1478333473205566,
"sampling/sampling_logp_difference/mean": 0.05775437504053116,
"step": 139,
"step_time": 41.039759424988006
},
{
"clip_ratio/high_max": 0.023697916883975267,
"clip_ratio/high_mean": 0.01575520820915699,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01770833320915699,
"entropy": 0.34427405893802643,
"epoch": 0.0028,
"grad_norm": 1.8111648559570312,
"kl": 0.20970841869711876,
"learning_rate": 9.999979970441856e-06,
"loss": -0.0059,
"step": 140,
"step_time": 8.234236395001062
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004036458441987634,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2671.0,
"completions/max_terminated_length": 2671.0,
"completions/mean_length": 2401.71875,
"completions/mean_terminated_length": 2401.71875,
"completions/min_length": 602.0,
"completions/min_terminated_length": 602.0,
"entropy": 0.30447014793753624,
"epoch": 0.00282,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.1894261837005615,
"kl": 0.3195039564743638,
"learning_rate": 9.999979583406551e-06,
"loss": 0.0382,
"num_tokens": 7333627.0,
"reward": 0.06937500089406967,
"reward_std": 0.3690631687641144,
"rewards/rollout_reward_func/mean": 0.06937500089406967,
"rewards/rollout_reward_func/std": 0.46480610966682434,
"sampling/importance_sampling_ratio/max": 2.1372768878936768,
"sampling/importance_sampling_ratio/mean": 0.9657130837440491,
"sampling/importance_sampling_ratio/min": 0.05373276770114899,
"sampling/sampling_logp_difference/max": 1.0698232650756836,
"sampling/sampling_logp_difference/mean": 0.05248340219259262,
"step": 141,
"step_time": 37.874881055002334
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.3037009723484516,
"epoch": 0.00284,
"grad_norm": 1.1916241645812988,
"kl": 0.2727260245010257,
"learning_rate": 9.999979192667574e-06,
"loss": 0.0377,
"step": 142,
"step_time": 7.981406920000154
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2700.0,
"completions/max_terminated_length": 2700.0,
"completions/mean_length": 2406.1875,
"completions/mean_terminated_length": 2406.1875,
"completions/min_length": 395.0,
"completions/min_terminated_length": 395.0,
"entropy": 0.2937978897243738,
"epoch": 0.00286,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4525103569030762,
"kl": 0.12783690728247166,
"learning_rate": 9.999978798224922e-06,
"loss": -0.1101,
"num_tokens": 7433724.0,
"reward": -0.04500000178813934,
"reward_std": 0.307174950838089,
"rewards/rollout_reward_func/mean": -0.04500000178813934,
"rewards/rollout_reward_func/std": 0.41188785433769226,
"sampling/importance_sampling_ratio/max": 2.6429364681243896,
"sampling/importance_sampling_ratio/mean": 1.0415544509887695,
"sampling/importance_sampling_ratio/min": 0.36408287286758423,
"sampling/sampling_logp_difference/max": 1.1776275634765625,
"sampling/sampling_logp_difference/mean": 0.0455252081155777,
"step": 143,
"step_time": 39.24893150100979
},
{
"clip_ratio/high_max": 0.012276785913854837,
"clip_ratio/high_mean": 0.006138392956927419,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008091517956927419,
"entropy": 0.2940533272922039,
"epoch": 0.00288,
"grad_norm": 1.2668437957763672,
"kl": 0.1362289022654295,
"learning_rate": 9.999978400078598e-06,
"loss": -0.113,
"step": 144,
"step_time": 8.001272381996387
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2717.0,
"completions/max_terminated_length": 2717.0,
"completions/mean_length": 2491.625,
"completions/mean_terminated_length": 2491.625,
"completions/min_length": 1870.0,
"completions/min_terminated_length": 1870.0,
"entropy": 0.32774606347084045,
"epoch": 0.0029,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4663074016571045,
"kl": 0.2525828080251813,
"learning_rate": 9.9999779982286e-06,
"loss": -0.1612,
"num_tokens": 7537148.0,
"reward": -0.03437500074505806,
"reward_std": 0.12437704205513,
"rewards/rollout_reward_func/mean": -0.03437500074505806,
"rewards/rollout_reward_func/std": 0.20053055882453918,
"sampling/importance_sampling_ratio/max": 2.752513885498047,
"sampling/importance_sampling_ratio/mean": 0.8781605958938599,
"sampling/importance_sampling_ratio/min": 0.1707814782857895,
"sampling/sampling_logp_difference/max": 0.9282255172729492,
"sampling/sampling_logp_difference/mean": 0.05459430813789368,
"step": 145,
"step_time": 41.338485607011535
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.3271145783364773,
"epoch": 0.00292,
"grad_norm": 1.5428147315979004,
"kl": 0.2190783964470029,
"learning_rate": 9.999977592674933e-06,
"loss": -0.162,
"step": 146,
"step_time": 8.144251859994256
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2694.0,
"completions/max_terminated_length": 2694.0,
"completions/mean_length": 2471.40625,
"completions/mean_terminated_length": 2471.40625,
"completions/min_length": 1665.0,
"completions/min_terminated_length": 1665.0,
"entropy": 0.2840565647929907,
"epoch": 0.00294,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5519449710845947,
"kl": 0.3378815334290266,
"learning_rate": 9.999977183417593e-06,
"loss": -0.0958,
"num_tokens": 7640114.0,
"reward": 0.03281249850988388,
"reward_std": 0.15429173409938812,
"rewards/rollout_reward_func/mean": 0.03281249850988388,
"rewards/rollout_reward_func/std": 0.2244165688753128,
"sampling/importance_sampling_ratio/max": 2.7828152179718018,
"sampling/importance_sampling_ratio/mean": 1.010582685470581,
"sampling/importance_sampling_ratio/min": 0.2255764752626419,
"sampling/sampling_logp_difference/max": 1.4769656658172607,
"sampling/sampling_logp_difference/mean": 0.05251479893922806,
"step": 147,
"step_time": 39.98198530799709
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.28489716723561287,
"epoch": 0.00296,
"grad_norm": 1.4790749549865723,
"kl": 0.3447922170162201,
"learning_rate": 9.999976770456581e-06,
"loss": -0.0975,
"step": 148,
"step_time": 8.059031150987721
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2717.0,
"completions/max_terminated_length": 2717.0,
"completions/mean_length": 2457.5625,
"completions/mean_terminated_length": 2457.5625,
"completions/min_length": 1340.0,
"completions/min_terminated_length": 1340.0,
"entropy": 0.330892838537693,
"epoch": 0.00298,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9201385974884033,
"kl": 0.13803628273308277,
"learning_rate": 9.999976353791898e-06,
"loss": 0.0257,
"num_tokens": 7742168.0,
"reward": -0.06718750298023224,
"reward_std": 0.21596568822860718,
"rewards/rollout_reward_func/mean": -0.06718750298023224,
"rewards/rollout_reward_func/std": 0.3021240234375,
"sampling/importance_sampling_ratio/max": 2.435490369796753,
"sampling/importance_sampling_ratio/mean": 1.1018002033233643,
"sampling/importance_sampling_ratio/min": 0.3883662223815918,
"sampling/sampling_logp_difference/max": 0.6033191680908203,
"sampling/sampling_logp_difference/mean": 0.05041055008769035,
"step": 149,
"step_time": 39.8079579510013
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.3302891068160534,
"epoch": 0.003,
"grad_norm": 1.9833990335464478,
"kl": 0.14259828627109528,
"learning_rate": 9.999975933423546e-06,
"loss": 0.022,
"step": 150,
"step_time": 8.779339686996536
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2647.0,
"completions/max_terminated_length": 2647.0,
"completions/mean_length": 2507.3125,
"completions/mean_terminated_length": 2507.3125,
"completions/min_length": 1950.0,
"completions/min_terminated_length": 1950.0,
"entropy": 0.3101547583937645,
"epoch": 0.00302,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.518326759338379,
"kl": 0.200548829510808,
"learning_rate": 9.999975509351522e-06,
"loss": -0.0199,
"num_tokens": 7846003.0,
"reward": 0.09437499940395355,
"reward_std": 0.1686856895685196,
"rewards/rollout_reward_func/mean": 0.09437499940395355,
"rewards/rollout_reward_func/std": 0.26759305596351624,
"sampling/importance_sampling_ratio/max": 2.2425479888916016,
"sampling/importance_sampling_ratio/mean": 0.9671791791915894,
"sampling/importance_sampling_ratio/min": 0.1983872652053833,
"sampling/sampling_logp_difference/max": 0.8811240196228027,
"sampling/sampling_logp_difference/mean": 0.056956760585308075,
"step": 151,
"step_time": 39.684381083003245
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0037913601845502853,
"clip_ratio/low_min": 0.0036764706019312143,
"clip_ratio/region_mean": 0.005744485184550285,
"entropy": 0.3110111430287361,
"epoch": 0.00304,
"grad_norm": 1.555120587348938,
"kl": 0.21855803951621056,
"learning_rate": 9.99997508157583e-06,
"loss": -0.0236,
"step": 152,
"step_time": 8.083289796006284
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2671.0,
"completions/max_terminated_length": 2671.0,
"completions/mean_length": 2501.78125,
"completions/mean_terminated_length": 2501.78125,
"completions/min_length": 649.0,
"completions/min_terminated_length": 649.0,
"entropy": 0.30790841951966286,
"epoch": 0.00306,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.939740777015686,
"kl": 0.13009591028094292,
"learning_rate": 9.999974650096467e-06,
"loss": -0.0409,
"num_tokens": 7949583.0,
"reward": -0.06224999576807022,
"reward_std": 0.18093439936637878,
"rewards/rollout_reward_func/mean": -0.06224999576807022,
"rewards/rollout_reward_func/std": 0.2806392312049866,
"sampling/importance_sampling_ratio/max": 2.5082690715789795,
"sampling/importance_sampling_ratio/mean": 1.0172232389450073,
"sampling/importance_sampling_ratio/min": 0.11460259556770325,
"sampling/sampling_logp_difference/max": 1.0105078220367432,
"sampling/sampling_logp_difference/mean": 0.05251069366931915,
"step": 153,
"step_time": 39.95415290000528
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0071614584885537624,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.009114583488553762,
"entropy": 0.30542241409420967,
"epoch": 0.00308,
"grad_norm": 1.721543312072754,
"kl": 0.1375604411587119,
"learning_rate": 9.999974214913438e-06,
"loss": -0.0413,
"step": 154,
"step_time": 8.1919258509879
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2685.0,
"completions/max_terminated_length": 2685.0,
"completions/mean_length": 2505.8125,
"completions/mean_terminated_length": 2505.8125,
"completions/min_length": 2028.0,
"completions/min_terminated_length": 2028.0,
"entropy": 0.31059348583221436,
"epoch": 0.0031,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.3106508255004883,
"kl": 0.2599359452724457,
"learning_rate": 9.99997377602674e-06,
"loss": -0.0716,
"num_tokens": 8053769.0,
"reward": 0.03125,
"reward_std": 0.1753545105457306,
"rewards/rollout_reward_func/mean": 0.03125,
"rewards/rollout_reward_func/std": 0.2829064428806305,
"sampling/importance_sampling_ratio/max": 2.14701509475708,
"sampling/importance_sampling_ratio/mean": 0.9985713362693787,
"sampling/importance_sampling_ratio/min": 0.2423524558544159,
"sampling/sampling_logp_difference/max": 0.7375087738037109,
"sampling/sampling_logp_difference/mean": 0.05255991220474243,
"step": 155,
"step_time": 39.86996693698893
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.31180923245847225,
"epoch": 0.00312,
"grad_norm": 2.528000593185425,
"kl": 0.2466150252148509,
"learning_rate": 9.999973333436373e-06,
"loss": -0.0732,
"step": 156,
"step_time": 8.722005063995312
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2651.0,
"completions/max_terminated_length": 2651.0,
"completions/mean_length": 2445.84375,
"completions/mean_terminated_length": 2445.84375,
"completions/min_length": 1298.0,
"completions/min_terminated_length": 1298.0,
"entropy": 0.3062910810112953,
"epoch": 0.00314,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.596366047859192,
"kl": 0.2805585265159607,
"learning_rate": 9.999972887142338e-06,
"loss": 0.0232,
"num_tokens": 8155656.0,
"reward": -0.050624996423721313,
"reward_std": 0.18446293473243713,
"rewards/rollout_reward_func/mean": -0.050624996423721313,
"rewards/rollout_reward_func/std": 0.28230544924736023,
"sampling/importance_sampling_ratio/max": 1.973318099975586,
"sampling/importance_sampling_ratio/mean": 0.9094000458717346,
"sampling/importance_sampling_ratio/min": 0.2735903561115265,
"sampling/sampling_logp_difference/max": 1.0791234970092773,
"sampling/sampling_logp_difference/mean": 0.059729140251874924,
"step": 157,
"step_time": 38.55100798999047
},
{
"clip_ratio/high_max": 0.028245192486792803,
"clip_ratio/high_mean": 0.0219350962433964,
"clip_ratio/low_mean": 0.008263221010565758,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.030198317021131516,
"entropy": 0.30654633045196533,
"epoch": 0.00316,
"grad_norm": 1.384443759918213,
"kl": 0.27098785899579525,
"learning_rate": 9.999972437144638e-06,
"loss": 0.0171,
"step": 158,
"step_time": 8.019240472996898
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2706.0,
"completions/max_terminated_length": 2706.0,
"completions/mean_length": 2569.59375,
"completions/mean_terminated_length": 2569.59375,
"completions/min_length": 2401.0,
"completions/min_terminated_length": 2401.0,
"entropy": 0.3133653476834297,
"epoch": 0.00318,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.126905918121338,
"kl": 0.4180504083633423,
"learning_rate": 9.99997198344327e-06,
"loss": -0.0072,
"num_tokens": 8261326.0,
"reward": 0.022437501698732376,
"reward_std": 0.04600679874420166,
"rewards/rollout_reward_func/mean": 0.022437501698732376,
"rewards/rollout_reward_func/std": 0.06265468150377274,
"sampling/importance_sampling_ratio/max": 1.8128973245620728,
"sampling/importance_sampling_ratio/mean": 0.8245859146118164,
"sampling/importance_sampling_ratio/min": 0.15137887001037598,
"sampling/sampling_logp_difference/max": 1.5356993675231934,
"sampling/sampling_logp_difference/mean": 0.06314520537853241,
"step": 159,
"step_time": 40.667295111990825
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.3113729450851679,
"epoch": 0.0032,
"grad_norm": 1.070225715637207,
"kl": 0.386994413100183,
"learning_rate": 9.999971526038236e-06,
"loss": -0.0099,
"step": 160,
"step_time": 8.18899186798808
},
{
"clip_ratio/high_max": 0.014436141354963183,
"clip_ratio/high_mean": 0.007218070677481592,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009171195677481592,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2775.0,
"completions/max_terminated_length": 2775.0,
"completions/mean_length": 2458.0,
"completions/mean_terminated_length": 2458.0,
"completions/min_length": 394.0,
"completions/min_terminated_length": 394.0,
"entropy": 0.28068962320685387,
"epoch": 0.00322,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5223960876464844,
"kl": 0.1943189101293683,
"learning_rate": 9.999971064929537e-06,
"loss": 0.0709,
"num_tokens": 8363537.0,
"reward": -0.04000000283122063,
"reward_std": 0.17396603524684906,
"rewards/rollout_reward_func/mean": -0.04000000283122063,
"rewards/rollout_reward_func/std": 0.27076953649520874,
"sampling/importance_sampling_ratio/max": 2.4551634788513184,
"sampling/importance_sampling_ratio/mean": 0.9484375715255737,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0188813209533691,
"sampling/sampling_logp_difference/mean": 0.058578573167324066,
"step": 161,
"step_time": 41.353516772003786
},
{
"clip_ratio/high_max": 0.010529891354963183,
"clip_ratio/high_mean": 0.005264945677481592,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007218070677481592,
"entropy": 0.27924087457358837,
"epoch": 0.00324,
"grad_norm": 1.2550345659255981,
"kl": 0.21171990223228931,
"learning_rate": 9.999970600117172e-06,
"loss": 0.0684,
"step": 162,
"step_time": 8.227772283993545
},
{
"clip_ratio/high_max": 0.017968750093132257,
"clip_ratio/high_mean": 0.008984375046566129,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010937500046566129,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2698.0,
"completions/max_terminated_length": 2698.0,
"completions/mean_length": 2467.15625,
"completions/mean_terminated_length": 2467.15625,
"completions/min_length": 394.0,
"completions/min_terminated_length": 394.0,
"entropy": 0.29255248606204987,
"epoch": 0.00326,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8392517566680908,
"kl": 0.22120904922485352,
"learning_rate": 9.999970131601143e-06,
"loss": -0.0554,
"num_tokens": 8466834.0,
"reward": 0.02281249687075615,
"reward_std": 0.17198839783668518,
"rewards/rollout_reward_func/mean": 0.02281249687075615,
"rewards/rollout_reward_func/std": 0.27481648325920105,
"sampling/importance_sampling_ratio/max": 2.9966037273406982,
"sampling/importance_sampling_ratio/mean": 1.031958818435669,
"sampling/importance_sampling_ratio/min": 0.21707558631896973,
"sampling/sampling_logp_difference/max": 1.1190013885498047,
"sampling/sampling_logp_difference/mean": 0.0631747841835022,
"step": 163,
"step_time": 40.34314790100325
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.008091517724096775,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008091517724096775,
"entropy": 0.29565080255270004,
"epoch": 0.00328,
"grad_norm": 1.9357725381851196,
"kl": 0.2302815355360508,
"learning_rate": 9.99996965938145e-06,
"loss": -0.0556,
"step": 164,
"step_time": 8.167869026008702
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2738.0,
"completions/max_terminated_length": 2738.0,
"completions/mean_length": 2552.15625,
"completions/mean_terminated_length": 2552.15625,
"completions/min_length": 1015.0,
"completions/min_terminated_length": 1015.0,
"entropy": 0.30549056455492973,
"epoch": 0.0033,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.3971192836761475,
"kl": 0.3436704585328698,
"learning_rate": 9.999969183458093e-06,
"loss": -0.0144,
"num_tokens": 8572678.0,
"reward": 0.005937501788139343,
"reward_std": 0.11331214010715485,
"rewards/rollout_reward_func/mean": 0.005937501788139343,
"rewards/rollout_reward_func/std": 0.19420406222343445,
"sampling/importance_sampling_ratio/max": 2.7245781421661377,
"sampling/importance_sampling_ratio/mean": 0.9707354307174683,
"sampling/importance_sampling_ratio/min": 0.354126900434494,
"sampling/sampling_logp_difference/max": 1.1669249534606934,
"sampling/sampling_logp_difference/mean": 0.06437948346138,
"step": 165,
"step_time": 40.59159922699473
},
{
"clip_ratio/high_max": 0.026154891354963183,
"clip_ratio/high_mean": 0.013077445677481592,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02088994567748159,
"entropy": 0.30958276242017746,
"epoch": 0.00332,
"grad_norm": 2.078913450241089,
"kl": 0.30393845308572054,
"learning_rate": 9.999968703831072e-06,
"loss": -0.0176,
"step": 166,
"step_time": 8.28843492600572
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.013671875,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2732.0,
"completions/max_terminated_length": 2732.0,
"completions/mean_length": 2549.9375,
"completions/mean_terminated_length": 2549.9375,
"completions/min_length": 2373.0,
"completions/min_terminated_length": 2373.0,
"entropy": 0.3152090422809124,
"epoch": 0.00334,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8376330137252808,
"kl": 0.5024630185216665,
"learning_rate": 9.999968220500388e-06,
"loss": -0.0287,
"num_tokens": 8678291.0,
"reward": 0.012812500819563866,
"reward_std": 0.03377830609679222,
"rewards/rollout_reward_func/mean": 0.012812500819563866,
"rewards/rollout_reward_func/std": 0.05075141042470932,
"sampling/importance_sampling_ratio/max": 1.9678910970687866,
"sampling/importance_sampling_ratio/mean": 0.9133018255233765,
"sampling/importance_sampling_ratio/min": 0.3517675995826721,
"sampling/sampling_logp_difference/max": 1.355534553527832,
"sampling/sampling_logp_difference/mean": 0.06513936817646027,
"step": 167,
"step_time": 41.46052245599276
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.013671875,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021484375,
"entropy": 0.31694196164608,
"epoch": 0.00336,
"grad_norm": 1.6731157302856445,
"kl": 0.4716432988643646,
"learning_rate": 9.99996773346604e-06,
"loss": -0.0335,
"step": 168,
"step_time": 8.273968965004315
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2680.0,
"completions/max_terminated_length": 2680.0,
"completions/mean_length": 2482.625,
"completions/mean_terminated_length": 2482.625,
"completions/min_length": 1437.0,
"completions/min_terminated_length": 1437.0,
"entropy": 0.3059420734643936,
"epoch": 0.00338,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5976994037628174,
"kl": 0.37691117636859417,
"learning_rate": 9.999967242728034e-06,
"loss": -0.0395,
"num_tokens": 8781391.0,
"reward": 0.04149999842047691,
"reward_std": 0.15404483675956726,
"rewards/rollout_reward_func/mean": 0.04149999842047691,
"rewards/rollout_reward_func/std": 0.25512754917144775,
"sampling/importance_sampling_ratio/max": 2.3652420043945312,
"sampling/importance_sampling_ratio/mean": 0.9670310020446777,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1599547863006592,
"sampling/sampling_logp_difference/mean": 0.06705337762832642,
"step": 169,
"step_time": 39.37707168101042
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01953125,
"entropy": 0.3081233687698841,
"epoch": 0.0034,
"grad_norm": 1.4047455787658691,
"kl": 0.37214960530400276,
"learning_rate": 9.999966748286364e-06,
"loss": -0.0409,
"step": 170,
"step_time": 8.105316379995202
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2780.0,
"completions/max_terminated_length": 2780.0,
"completions/mean_length": 2585.71875,
"completions/mean_terminated_length": 2585.71875,
"completions/min_length": 2429.0,
"completions/min_terminated_length": 2429.0,
"entropy": 0.3306998014450073,
"epoch": 0.00342,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.908154845237732,
"kl": 0.2468932829797268,
"learning_rate": 9.999966250141033e-06,
"loss": -0.0225,
"num_tokens": 8887896.0,
"reward": 0.024937499314546585,
"reward_std": 0.04533165693283081,
"rewards/rollout_reward_func/mean": 0.024937499314546585,
"rewards/rollout_reward_func/std": 0.05147622898221016,
"sampling/importance_sampling_ratio/max": 2.204131841659546,
"sampling/importance_sampling_ratio/mean": 0.9951881170272827,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.8882501125335693,
"sampling/sampling_logp_difference/mean": 0.06724405288696289,
"step": 171,
"step_time": 41.23671616201318
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.33214887976646423,
"epoch": 0.00344,
"grad_norm": 1.7956488132476807,
"kl": 0.26035095751285553,
"learning_rate": 9.999965748292042e-06,
"loss": -0.022,
"step": 172,
"step_time": 8.822865209003794
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.007134885177947581,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01299426017794758,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2734.0,
"completions/max_terminated_length": 2734.0,
"completions/mean_length": 2594.1875,
"completions/mean_terminated_length": 2594.1875,
"completions/min_length": 2444.0,
"completions/min_terminated_length": 2444.0,
"entropy": 0.3237563855946064,
"epoch": 0.00346,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5233908891677856,
"kl": 0.5627510249614716,
"learning_rate": 9.999965242739394e-06,
"loss": -0.0095,
"num_tokens": 8994761.0,
"reward": 0.01656249910593033,
"reward_std": 0.06224355101585388,
"rewards/rollout_reward_func/mean": 0.01656249910593033,
"rewards/rollout_reward_func/std": 0.06737517565488815,
"sampling/importance_sampling_ratio/max": 2.673325777053833,
"sampling/importance_sampling_ratio/mean": 0.9175304770469666,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 14.002936363220215,
"sampling/sampling_logp_difference/mean": 0.09262394905090332,
"step": 173,
"step_time": 41.60413134700502
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005181760177947581,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00908801017794758,
"entropy": 0.32536034286022186,
"epoch": 0.00348,
"grad_norm": 1.4691952466964722,
"kl": 0.5308522153645754,
"learning_rate": 9.999964733483082e-06,
"loss": -0.0117,
"step": 174,
"step_time": 8.211871264997171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.004557291744276881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004557291744276881,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2716.0,
"completions/max_terminated_length": 2716.0,
"completions/mean_length": 2492.5,
"completions/mean_terminated_length": 2492.5,
"completions/min_length": 1020.0,
"completions/min_terminated_length": 1020.0,
"entropy": 0.3340105786919594,
"epoch": 0.0035,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8298368453979492,
"kl": 0.3679938018321991,
"learning_rate": 9.999964220523113e-06,
"loss": -0.0572,
"num_tokens": 9098126.0,
"reward": -0.07824999839067459,
"reward_std": 0.1797890067100525,
"rewards/rollout_reward_func/mean": -0.07824999839067459,
"rewards/rollout_reward_func/std": 0.31904685497283936,
"sampling/importance_sampling_ratio/max": 1.841526985168457,
"sampling/importance_sampling_ratio/mean": 0.9063608050346375,
"sampling/importance_sampling_ratio/min": 0.19973398745059967,
"sampling/sampling_logp_difference/max": 1.0407519340515137,
"sampling/sampling_logp_difference/mean": 0.06615821272134781,
"step": 175,
"step_time": 39.59234012100205
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0072180707938969135,
"clip_ratio/low_mean": 0.004557291744276881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011775362538173795,
"entropy": 0.33797262609004974,
"epoch": 0.00352,
"grad_norm": 1.5984530448913574,
"kl": 0.36107587814331055,
"learning_rate": 9.999963703859486e-06,
"loss": -0.0569,
"step": 176,
"step_time": 8.214162019983632
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2752.0,
"completions/max_terminated_length": 2752.0,
"completions/mean_length": 2572.03125,
"completions/mean_terminated_length": 2572.03125,
"completions/min_length": 2314.0,
"completions/min_terminated_length": 2314.0,
"entropy": 0.3543061949312687,
"epoch": 0.00354,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5375295877456665,
"kl": 0.2578230034559965,
"learning_rate": 9.999963183492201e-06,
"loss": -0.1343,
"num_tokens": 9203871.0,
"reward": -0.027187500149011612,
"reward_std": 0.11736124753952026,
"rewards/rollout_reward_func/mean": -0.027187500149011612,
"rewards/rollout_reward_func/std": 0.2549792528152466,
"sampling/importance_sampling_ratio/max": 2.8757715225219727,
"sampling/importance_sampling_ratio/mean": 0.8799307346343994,
"sampling/importance_sampling_ratio/min": 0.1571628898382187,
"sampling/sampling_logp_difference/max": 1.2581915855407715,
"sampling/sampling_logp_difference/mean": 0.07375901937484741,
"step": 177,
"step_time": 41.12113772200246
},
{
"clip_ratio/high_max": 0.015506628900766373,
"clip_ratio/high_mean": 0.007753314450383186,
"clip_ratio/low_mean": 0.0038470644503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011600378900766373,
"entropy": 0.3563588745892048,
"epoch": 0.00356,
"grad_norm": 1.5801756381988525,
"kl": 0.24894549511373043,
"learning_rate": 9.999962659421257e-06,
"loss": -0.1372,
"step": 178,
"step_time": 9.163750717998482
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2703.0,
"completions/max_terminated_length": 2703.0,
"completions/mean_length": 2544.84375,
"completions/mean_terminated_length": 2544.0,
"completions/min_length": 2408.0,
"completions/min_terminated_length": 2408.0,
"entropy": 0.35032501444220543,
"epoch": 0.00358,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.473603367805481,
"kl": 0.3438793905079365,
"learning_rate": 9.999962131646657e-06,
"loss": -0.0244,
"num_tokens": 9308232.0,
"reward": 0.026249999180436134,
"reward_std": 0.0644674301147461,
"rewards/rollout_reward_func/mean": 0.026249999180436134,
"rewards/rollout_reward_func/std": 0.06804884225130081,
"sampling/importance_sampling_ratio/max": 2.7217800617218018,
"sampling/importance_sampling_ratio/mean": 1.1029736995697021,
"sampling/importance_sampling_ratio/min": 0.15165719389915466,
"sampling/sampling_logp_difference/max": 1.3789944648742676,
"sampling/sampling_logp_difference/mean": 0.06541239470243454,
"step": 179,
"step_time": 40.59504781400028
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.35000326856970787,
"epoch": 0.0036,
"grad_norm": 1.4767123460769653,
"kl": 0.3850640542805195,
"learning_rate": 9.999961600168402e-06,
"loss": -0.0251,
"step": 180,
"step_time": 8.215517988006468
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2703.0,
"completions/max_terminated_length": 2703.0,
"completions/mean_length": 2507.625,
"completions/mean_terminated_length": 2507.625,
"completions/min_length": 1636.0,
"completions/min_terminated_length": 1636.0,
"entropy": 0.3719157911837101,
"epoch": 0.00362,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.799222469329834,
"kl": 0.2718762047588825,
"learning_rate": 9.99996106498649e-06,
"loss": -0.0178,
"num_tokens": 9412169.0,
"reward": 0.062187496572732925,
"reward_std": 0.12166280299425125,
"rewards/rollout_reward_func/mean": 0.062187496572732925,
"rewards/rollout_reward_func/std": 0.23215307295322418,
"sampling/importance_sampling_ratio/max": 2.1873276233673096,
"sampling/importance_sampling_ratio/mean": 1.0135772228240967,
"sampling/importance_sampling_ratio/min": 0.2740500867366791,
"sampling/sampling_logp_difference/max": 0.9779013991355896,
"sampling/sampling_logp_difference/mean": 0.061763741075992584,
"step": 181,
"step_time": 41.3249060379967
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.013671875,
"clip_ratio/low_mean": 0.00569196417927742,
"clip_ratio/low_min": 0.0035714285913854837,
"clip_ratio/region_mean": 0.01936383917927742,
"entropy": 0.3769003711640835,
"epoch": 0.00364,
"grad_norm": 1.6409553289413452,
"kl": 0.26755072735249996,
"learning_rate": 9.999960526100922e-06,
"loss": -0.0218,
"step": 182,
"step_time": 8.139846026009764
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2638.0,
"completions/max_terminated_length": 2638.0,
"completions/mean_length": 2519.4375,
"completions/mean_terminated_length": 2519.4375,
"completions/min_length": 2406.0,
"completions/min_terminated_length": 2406.0,
"entropy": 0.3503524139523506,
"epoch": 0.00366,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1418262720108032,
"kl": 0.27561422996222973,
"learning_rate": 9.9999599835117e-06,
"loss": -0.0738,
"num_tokens": 9516483.0,
"reward": 0.021562498062849045,
"reward_std": 0.03244706243276596,
"rewards/rollout_reward_func/mean": 0.021562498062849045,
"rewards/rollout_reward_func/std": 0.04205291345715523,
"sampling/importance_sampling_ratio/max": 1.8527733087539673,
"sampling/importance_sampling_ratio/mean": 0.8473471403121948,
"sampling/importance_sampling_ratio/min": 0.31423965096473694,
"sampling/sampling_logp_difference/max": 0.8821654319763184,
"sampling/sampling_logp_difference/mean": 0.0577574148774147,
"step": 183,
"step_time": 40.83836308600439
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.3548332415521145,
"epoch": 0.00368,
"grad_norm": 1.1759247779846191,
"kl": 0.2614702060818672,
"learning_rate": 9.999959437218823e-06,
"loss": -0.0759,
"step": 184,
"step_time": 8.55310592699243
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00463598920032382,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00463598920032382,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2719.0,
"completions/max_terminated_length": 2719.0,
"completions/mean_length": 2476.6875,
"completions/mean_terminated_length": 2476.6875,
"completions/min_length": 1306.0,
"completions/min_terminated_length": 1306.0,
"entropy": 0.3436601832509041,
"epoch": 0.0037,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3706753253936768,
"kl": 0.30720300413668156,
"learning_rate": 9.999958887222293e-06,
"loss": 0.0834,
"num_tokens": 9619759.0,
"reward": 0.03656249865889549,
"reward_std": 0.24551644921302795,
"rewards/rollout_reward_func/mean": 0.03656249865889549,
"rewards/rollout_reward_func/std": 0.34222203493118286,
"sampling/importance_sampling_ratio/max": 2.1588592529296875,
"sampling/importance_sampling_ratio/mean": 0.9483487606048584,
"sampling/importance_sampling_ratio/min": 0.1842896193265915,
"sampling/sampling_logp_difference/max": 0.9215145111083984,
"sampling/sampling_logp_difference/mean": 0.05946381390094757,
"step": 185,
"step_time": 38.045946442005516
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.013671875,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015510110300965607,
"entropy": 0.35040333122015,
"epoch": 0.00372,
"grad_norm": 1.1458492279052734,
"kl": 0.2852174621075392,
"learning_rate": 9.999958333522109e-06,
"loss": 0.0825,
"step": 186,
"step_time": 8.192382211011136
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2683.0,
"completions/max_terminated_length": 2683.0,
"completions/mean_length": 2569.78125,
"completions/mean_terminated_length": 2569.78125,
"completions/min_length": 2393.0,
"completions/min_terminated_length": 2393.0,
"entropy": 0.3721434064209461,
"epoch": 0.00374,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8181395530700684,
"kl": 0.2232329212129116,
"learning_rate": 9.999957776118273e-06,
"loss": -0.0295,
"num_tokens": 9726042.0,
"reward": 0.03468749672174454,
"reward_std": 0.06791973114013672,
"rewards/rollout_reward_func/mean": 0.03468749672174454,
"rewards/rollout_reward_func/std": 0.07025227695703506,
"sampling/importance_sampling_ratio/max": 1.8577862977981567,
"sampling/importance_sampling_ratio/mean": 1.1377999782562256,
"sampling/importance_sampling_ratio/min": 0.3496271073818207,
"sampling/sampling_logp_difference/max": 0.7724575996398926,
"sampling/sampling_logp_difference/mean": 0.05710921436548233,
"step": 187,
"step_time": 41.54417477098468
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.37350033223629,
"epoch": 0.00376,
"grad_norm": 1.7502919435501099,
"kl": 0.22246569395065308,
"learning_rate": 9.999957215010786e-06,
"loss": -0.0304,
"step": 188,
"step_time": 8.093648672998825
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2706.0,
"completions/max_terminated_length": 2706.0,
"completions/mean_length": 2539.53125,
"completions/mean_terminated_length": 2536.806396484375,
"completions/min_length": 1820.0,
"completions/min_terminated_length": 1820.0,
"entropy": 0.36657416820526123,
"epoch": 0.00378,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3249930143356323,
"kl": 0.2342336755245924,
"learning_rate": 9.999956650199647e-06,
"loss": -0.1187,
"num_tokens": 9830471.0,
"reward": -0.054999999701976776,
"reward_std": 0.1698419749736786,
"rewards/rollout_reward_func/mean": -0.054999999701976776,
"rewards/rollout_reward_func/std": 0.27874141931533813,
"sampling/importance_sampling_ratio/max": 1.967926025390625,
"sampling/importance_sampling_ratio/mean": 0.9288737773895264,
"sampling/importance_sampling_ratio/min": 0.30164018273353577,
"sampling/sampling_logp_difference/max": 0.8526759147644043,
"sampling/sampling_logp_difference/mean": 0.05662436783313751,
"step": 189,
"step_time": 39.74040631599928
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.36383185535669327,
"epoch": 0.0038,
"grad_norm": 1.2447773218154907,
"kl": 0.2557512894272804,
"learning_rate": 9.999956081684854e-06,
"loss": -0.1228,
"step": 190,
"step_time": 8.122944991009717
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2738.0,
"completions/max_terminated_length": 2738.0,
"completions/mean_length": 2498.125,
"completions/mean_terminated_length": 2498.125,
"completions/min_length": 1619.0,
"completions/min_terminated_length": 1619.0,
"entropy": 0.35211504995822906,
"epoch": 0.00382,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3515205383300781,
"kl": 0.22487179283052683,
"learning_rate": 9.999955509466414e-06,
"loss": -0.0227,
"num_tokens": 9933907.0,
"reward": 0.04243750125169754,
"reward_std": 0.2674861550331116,
"rewards/rollout_reward_func/mean": 0.04243750125169754,
"rewards/rollout_reward_func/std": 0.3605915307998657,
"sampling/importance_sampling_ratio/max": 2.468160629272461,
"sampling/importance_sampling_ratio/mean": 0.975672721862793,
"sampling/importance_sampling_ratio/min": 0.19411441683769226,
"sampling/sampling_logp_difference/max": 0.8760786056518555,
"sampling/sampling_logp_difference/mean": 0.05464397370815277,
"step": 191,
"step_time": 39.224735490002786
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.3466275744140148,
"epoch": 0.00384,
"grad_norm": 1.3361579179763794,
"kl": 0.23413463309407234,
"learning_rate": 9.999954933544324e-06,
"loss": -0.025,
"step": 192,
"step_time": 8.201387897010136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2755.0,
"completions/max_terminated_length": 2755.0,
"completions/mean_length": 2555.09375,
"completions/mean_terminated_length": 2555.09375,
"completions/min_length": 2369.0,
"completions/min_terminated_length": 2369.0,
"entropy": 0.35569561645388603,
"epoch": 0.00386,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7639473676681519,
"kl": 0.23214801028370857,
"learning_rate": 9.999954353918583e-06,
"loss": 0.005,
"num_tokens": 10039509.0,
"reward": 0.02437499910593033,
"reward_std": 0.04079621285200119,
"rewards/rollout_reward_func/mean": 0.02437499910593033,
"rewards/rollout_reward_func/std": 0.04737887531518936,
"sampling/importance_sampling_ratio/max": 2.841304063796997,
"sampling/importance_sampling_ratio/mean": 1.0419715642929077,
"sampling/importance_sampling_ratio/min": 0.37415075302124023,
"sampling/sampling_logp_difference/max": 0.682642936706543,
"sampling/sampling_logp_difference/mean": 0.055786363780498505,
"step": 193,
"step_time": 41.76466753900604
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.3506021201610565,
"epoch": 0.00388,
"grad_norm": 1.7473597526550293,
"kl": 0.23999202623963356,
"learning_rate": 9.999953770589195e-06,
"loss": 0.0053,
"step": 194,
"step_time": 8.263513790996512
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2735.0,
"completions/max_terminated_length": 2735.0,
"completions/mean_length": 2538.0625,
"completions/mean_terminated_length": 2538.0625,
"completions/min_length": 1788.0,
"completions/min_terminated_length": 1788.0,
"entropy": 0.37049463018774986,
"epoch": 0.0039,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8461344242095947,
"kl": 0.36803081817924976,
"learning_rate": 9.999953183556157e-06,
"loss": -0.0015,
"num_tokens": 10144288.0,
"reward": -0.03406250476837158,
"reward_std": 0.18202264606952667,
"rewards/rollout_reward_func/mean": -0.03406250476837158,
"rewards/rollout_reward_func/std": 0.2601347267627716,
"sampling/importance_sampling_ratio/max": 2.4161949157714844,
"sampling/importance_sampling_ratio/mean": 1.04921293258667,
"sampling/importance_sampling_ratio/min": 0.33067557215690613,
"sampling/sampling_logp_difference/max": 0.6486377716064453,
"sampling/sampling_logp_difference/mean": 0.059844404458999634,
"step": 195,
"step_time": 41.45346045300539
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.36830903589725494,
"epoch": 0.00392,
"grad_norm": 1.9099000692367554,
"kl": 0.3604978770017624,
"learning_rate": 9.999952592819472e-06,
"loss": -0.0066,
"step": 196,
"step_time": 8.190971479001746
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2717.0,
"completions/max_terminated_length": 2717.0,
"completions/mean_length": 2420.9375,
"completions/mean_terminated_length": 2420.9375,
"completions/min_length": 389.0,
"completions/min_terminated_length": 389.0,
"entropy": 0.35077469423413277,
"epoch": 0.00394,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.002080202102661,
"kl": 0.3194318525493145,
"learning_rate": 9.999951998379141e-06,
"loss": -0.2282,
"num_tokens": 10245383.0,
"reward": -0.0068125054240226746,
"reward_std": 0.19143234193325043,
"rewards/rollout_reward_func/mean": -0.0068125054240226746,
"rewards/rollout_reward_func/std": 0.3449188470840454,
"sampling/importance_sampling_ratio/max": 2.887979507446289,
"sampling/importance_sampling_ratio/mean": 1.1211514472961426,
"sampling/importance_sampling_ratio/min": 0.1755070835351944,
"sampling/sampling_logp_difference/max": 1.1164432764053345,
"sampling/sampling_logp_difference/mean": 0.06820495426654816,
"step": 197,
"step_time": 38.74268743399443
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.3482266962528229,
"epoch": 0.00396,
"grad_norm": 1.8061574697494507,
"kl": 0.3401306103914976,
"learning_rate": 9.999951400235163e-06,
"loss": -0.2299,
"step": 198,
"step_time": 8.126102788002754
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2668.0,
"completions/max_terminated_length": 2668.0,
"completions/mean_length": 2516.75,
"completions/mean_terminated_length": 2516.75,
"completions/min_length": 2034.0,
"completions/min_terminated_length": 2034.0,
"entropy": 0.342928946018219,
"epoch": 0.00398,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2324050664901733,
"kl": 0.3182620648294687,
"learning_rate": 9.999950798387541e-06,
"loss": -0.0755,
"num_tokens": 10349358.0,
"reward": 0.11312500387430191,
"reward_std": 0.31299662590026855,
"rewards/rollout_reward_func/mean": 0.11312500387430191,
"rewards/rollout_reward_func/std": 0.39529845118522644,
"sampling/importance_sampling_ratio/max": 2.2911791801452637,
"sampling/importance_sampling_ratio/mean": 0.9121253490447998,
"sampling/importance_sampling_ratio/min": 0.1949577033519745,
"sampling/sampling_logp_difference/max": 1.2254157066345215,
"sampling/sampling_logp_difference/mean": 0.05940123647451401,
"step": 199,
"step_time": 38.75293362401135
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.33788008987903595,
"epoch": 0.004,
"grad_norm": 1.184295415878296,
"kl": 0.3464464507997036,
"learning_rate": 9.999950192836272e-06,
"loss": -0.0787,
"step": 200,
"step_time": 8.536211900005583
},
{
"clip_ratio/high_max": 0.015395220601931214,
"clip_ratio/high_mean": 0.007697610300965607,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011603860300965607,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2759.0,
"completions/max_terminated_length": 2759.0,
"completions/mean_length": 2531.75,
"completions/mean_terminated_length": 2531.75,
"completions/min_length": 2288.0,
"completions/min_terminated_length": 2288.0,
"entropy": 0.3628573678433895,
"epoch": 0.00402,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.393696665763855,
"kl": 0.4442015439271927,
"learning_rate": 9.999949583581358e-06,
"loss": -0.1296,
"num_tokens": 10453571.0,
"reward": 0.03125,
"reward_std": 0.04873351752758026,
"rewards/rollout_reward_func/mean": 0.03125,
"rewards/rollout_reward_func/std": 0.060147665441036224,
"sampling/importance_sampling_ratio/max": 2.2512755393981934,
"sampling/importance_sampling_ratio/mean": 0.9395947456359863,
"sampling/importance_sampling_ratio/min": 0.24555900692939758,
"sampling/sampling_logp_difference/max": 0.942570686340332,
"sampling/sampling_logp_difference/mean": 0.07334975898265839,
"step": 201,
"step_time": 42.58854235501349
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00746193912345916,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01332131412345916,
"entropy": 0.3568928651511669,
"epoch": 0.00404,
"grad_norm": 1.1539993286132812,
"kl": 0.446804903447628,
"learning_rate": 9.999948970622801e-06,
"loss": -0.1318,
"step": 202,
"step_time": 8.253893647997756
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2730.0,
"completions/max_terminated_length": 2730.0,
"completions/mean_length": 2574.71875,
"completions/mean_terminated_length": 2574.71875,
"completions/min_length": 2381.0,
"completions/min_terminated_length": 2381.0,
"entropy": 0.3213490657508373,
"epoch": 0.00406,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6174315214157104,
"kl": 0.386203370988369,
"learning_rate": 9.9999483539606e-06,
"loss": -0.0016,
"num_tokens": 10559322.0,
"reward": 0.046687498688697815,
"reward_std": 0.05125562474131584,
"rewards/rollout_reward_func/mean": 0.046687498688697815,
"rewards/rollout_reward_func/std": 0.062241170555353165,
"sampling/importance_sampling_ratio/max": 2.171849012374878,
"sampling/importance_sampling_ratio/mean": 0.9831439852714539,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.944068431854248,
"sampling/sampling_logp_difference/mean": 0.06549233198165894,
"step": 203,
"step_time": 40.6149429290017
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.3156820274889469,
"epoch": 0.00408,
"grad_norm": 1.6070283651351929,
"kl": 0.4013451524078846,
"learning_rate": 9.999947733594757e-06,
"loss": -0.0065,
"step": 204,
"step_time": 8.203901119995862
},
{
"clip_ratio/high_max": 0.007694128900766373,
"clip_ratio/high_mean": 0.0038470644503831863,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009706439450383186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2716.0,
"completions/max_terminated_length": 2716.0,
"completions/mean_length": 2486.21875,
"completions/mean_terminated_length": 2486.21875,
"completions/min_length": 1017.0,
"completions/min_terminated_length": 1017.0,
"entropy": 0.3160020150244236,
"epoch": 0.0041,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2670780420303345,
"kl": 0.41101067140698433,
"learning_rate": 9.99994710952527e-06,
"loss": -0.1007,
"num_tokens": 10662392.0,
"reward": -0.006562499329447746,
"reward_std": 0.11399046331644058,
"rewards/rollout_reward_func/mean": -0.006562499329447746,
"rewards/rollout_reward_func/std": 0.19189396500587463,
"sampling/importance_sampling_ratio/max": 2.0264596939086914,
"sampling/importance_sampling_ratio/mean": 0.8527935147285461,
"sampling/importance_sampling_ratio/min": 0.07731558382511139,
"sampling/sampling_logp_difference/max": 1.2514522075653076,
"sampling/sampling_logp_difference/mean": 0.07297109067440033,
"step": 205,
"step_time": 40.33754772198881
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.31296201795339584,
"epoch": 0.00412,
"grad_norm": 1.2826589345932007,
"kl": 0.4051324315369129,
"learning_rate": 9.999946481752143e-06,
"loss": -0.1028,
"step": 206,
"step_time": 8.621249777992489
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.006138392956927419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008091517956927419,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2759.0,
"completions/max_terminated_length": 2759.0,
"completions/mean_length": 2496.40625,
"completions/mean_terminated_length": 2496.386962890625,
"completions/min_length": 1286.0,
"completions/min_terminated_length": 1286.0,
"entropy": 0.30267368629574776,
"epoch": 0.00414,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4600504636764526,
"kl": 0.24276616983115673,
"learning_rate": 9.999945850275376e-06,
"loss": 0.0199,
"num_tokens": 10765406.0,
"reward": 0.02281249687075615,
"reward_std": 0.2987138628959656,
"rewards/rollout_reward_func/mean": 0.02281249687075615,
"rewards/rollout_reward_func/std": 0.40383031964302063,
"sampling/importance_sampling_ratio/max": 2.8048996925354004,
"sampling/importance_sampling_ratio/mean": 1.0050692558288574,
"sampling/importance_sampling_ratio/min": 0.34381619095802307,
"sampling/sampling_logp_difference/max": 0.9117152690887451,
"sampling/sampling_logp_difference/mean": 0.0614398792386055,
"step": 207,
"step_time": 39.424753070990846
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.010044642956927419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01981026795692742,
"entropy": 0.3010764829814434,
"epoch": 0.00416,
"grad_norm": 1.2748208045959473,
"kl": 0.2418710347265005,
"learning_rate": 9.999945215094968e-06,
"loss": 0.016,
"step": 208,
"step_time": 8.264535922004143
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005989583441987634,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011848958441987634,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2715.0,
"completions/max_terminated_length": 2715.0,
"completions/mean_length": 2515.71875,
"completions/mean_terminated_length": 2515.71875,
"completions/min_length": 1839.0,
"completions/min_terminated_length": 1839.0,
"entropy": 0.29903779923915863,
"epoch": 0.00418,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4785264730453491,
"kl": 0.3483623880892992,
"learning_rate": 9.99994457621092e-06,
"loss": -0.0908,
"num_tokens": 10869297.0,
"reward": 0.005625000223517418,
"reward_std": 0.10760631412267685,
"rewards/rollout_reward_func/mean": 0.005625000223517418,
"rewards/rollout_reward_func/std": 0.19536462426185608,
"sampling/importance_sampling_ratio/max": 2.326657772064209,
"sampling/importance_sampling_ratio/mean": 1.0822689533233643,
"sampling/importance_sampling_ratio/min": 0.0710102990269661,
"sampling/sampling_logp_difference/max": 1.217003583908081,
"sampling/sampling_logp_difference/mean": 0.0704931765794754,
"step": 209,
"step_time": 41.126067208992026
},
{
"clip_ratio/high_max": 0.01907169120386243,
"clip_ratio/high_mean": 0.011488970601931214,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015395220601931214,
"entropy": 0.295239444822073,
"epoch": 0.0042,
"grad_norm": 2.135641574859619,
"kl": 0.35493278689682484,
"learning_rate": 9.999943933623233e-06,
"loss": -0.0943,
"step": 210,
"step_time": 8.173974077988532
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.006138392956927419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010044642956927419,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2724.0,
"completions/max_terminated_length": 2724.0,
"completions/mean_length": 2539.84375,
"completions/mean_terminated_length": 2539.84375,
"completions/min_length": 1788.0,
"completions/min_terminated_length": 1788.0,
"entropy": 0.2810978293418884,
"epoch": 0.00422,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.446002721786499,
"kl": 0.5180637389421463,
"learning_rate": 9.999943287331909e-06,
"loss": -0.0502,
"num_tokens": 10973915.0,
"reward": -0.04500000178813934,
"reward_std": 0.17551785707473755,
"rewards/rollout_reward_func/mean": -0.04500000178813934,
"rewards/rollout_reward_func/std": 0.26568230986595154,
"sampling/importance_sampling_ratio/max": 1.8341037034988403,
"sampling/importance_sampling_ratio/mean": 0.8888924717903137,
"sampling/importance_sampling_ratio/min": 0.3128281831741333,
"sampling/sampling_logp_difference/max": 1.6654303073883057,
"sampling/sampling_logp_difference/mean": 0.07020728290081024,
"step": 211,
"step_time": 39.432713571011845
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.014229910913854837,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018136160913854837,
"entropy": 0.2733327057212591,
"epoch": 0.00424,
"grad_norm": 1.2415473461151123,
"kl": 0.5663974024355412,
"learning_rate": 9.999942637336943e-06,
"loss": -0.0513,
"step": 212,
"step_time": 8.662085884992848
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0031250000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005078124813735485,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2721.0,
"completions/max_terminated_length": 2721.0,
"completions/mean_length": 2471.5625,
"completions/mean_terminated_length": 2471.5625,
"completions/min_length": 403.0,
"completions/min_terminated_length": 403.0,
"entropy": 0.25025843642652035,
"epoch": 0.00426,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2709338665008545,
"kl": 0.821235241368413,
"learning_rate": 9.999941983638343e-06,
"loss": -0.0337,
"num_tokens": 11077168.0,
"reward": 0.01850000023841858,
"reward_std": 0.2203158140182495,
"rewards/rollout_reward_func/mean": 0.01850000023841858,
"rewards/rollout_reward_func/std": 0.31408196687698364,
"sampling/importance_sampling_ratio/max": 2.0936169624328613,
"sampling/importance_sampling_ratio/mean": 0.9659475088119507,
"sampling/importance_sampling_ratio/min": 0.06496711075305939,
"sampling/sampling_logp_difference/max": 2.544590473175049,
"sampling/sampling_logp_difference/mean": 0.059280212968587875,
"step": 213,
"step_time": 38.46859342800599
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.24745017662644386,
"epoch": 0.00428,
"grad_norm": 1.4903650283813477,
"kl": 0.577656701207161,
"learning_rate": 9.999941326236106e-06,
"loss": -0.037,
"step": 214,
"step_time": 8.155131259009067
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2718.0,
"completions/max_terminated_length": 2718.0,
"completions/mean_length": 2575.0625,
"completions/mean_terminated_length": 2575.0625,
"completions/min_length": 2392.0,
"completions/min_terminated_length": 2392.0,
"entropy": 0.2778208311647177,
"epoch": 0.0043,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7159686088562012,
"kl": 0.7443384360522032,
"learning_rate": 9.999940665130233e-06,
"loss": 0.0089,
"num_tokens": 11182822.0,
"reward": -0.024375002831220627,
"reward_std": 0.10185873508453369,
"rewards/rollout_reward_func/mean": -0.024375002831220627,
"rewards/rollout_reward_func/std": 0.18804490566253662,
"sampling/importance_sampling_ratio/max": 2.0679190158843994,
"sampling/importance_sampling_ratio/mean": 0.9192431569099426,
"sampling/importance_sampling_ratio/min": 0.14288972318172455,
"sampling/sampling_logp_difference/max": 1.4988486766815186,
"sampling/sampling_logp_difference/mean": 0.08602313697338104,
"step": 215,
"step_time": 40.16992479900364
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.28077778965234756,
"epoch": 0.00432,
"grad_norm": 1.4250431060791016,
"kl": 0.6058767847716808,
"learning_rate": 9.999940000320726e-06,
"loss": 0.006,
"step": 216,
"step_time": 8.206848283996806
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2706.0,
"completions/max_terminated_length": 2706.0,
"completions/mean_length": 2536.84375,
"completions/mean_terminated_length": 2536.84375,
"completions/min_length": 2381.0,
"completions/min_terminated_length": 2381.0,
"entropy": 0.28082339093089104,
"epoch": 0.00434,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.5246869325637817,
"kl": 0.5135170202702284,
"learning_rate": 9.999939331807582e-06,
"loss": -0.1763,
"num_tokens": 11287279.0,
"reward": 0.017500000074505806,
"reward_std": 0.04132015258073807,
"rewards/rollout_reward_func/mean": 0.017500000074505806,
"rewards/rollout_reward_func/std": 0.052915021777153015,
"sampling/importance_sampling_ratio/max": 2.8221874237060547,
"sampling/importance_sampling_ratio/mean": 0.8796664476394653,
"sampling/importance_sampling_ratio/min": 0.07916179299354553,
"sampling/sampling_logp_difference/max": 1.6334364414215088,
"sampling/sampling_logp_difference/mean": 0.0864379033446312,
"step": 217,
"step_time": 41.50737186798506
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.2852325811982155,
"epoch": 0.00436,
"grad_norm": 1.5693947076797485,
"kl": 0.46704200468957424,
"learning_rate": 9.999938659590807e-06,
"loss": -0.1787,
"step": 218,
"step_time": 8.133669030990859
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2785.0,
"completions/max_terminated_length": 2785.0,
"completions/mean_length": 2510.1875,
"completions/mean_terminated_length": 2510.1875,
"completions/min_length": 1017.0,
"completions/min_terminated_length": 1017.0,
"entropy": 0.28106593526899815,
"epoch": 0.00438,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.543866753578186,
"kl": 0.582334304228425,
"learning_rate": 9.999937983670399e-06,
"loss": -0.004,
"num_tokens": 11391012.0,
"reward": 0.00937499850988388,
"reward_std": 0.11015652120113373,
"rewards/rollout_reward_func/mean": 0.00937499850988388,
"rewards/rollout_reward_func/std": 0.19029581546783447,
"sampling/importance_sampling_ratio/max": 2.0876622200012207,
"sampling/importance_sampling_ratio/mean": 0.9376651048660278,
"sampling/importance_sampling_ratio/min": 0.13859038054943085,
"sampling/sampling_logp_difference/max": 1.4258623123168945,
"sampling/sampling_logp_difference/mean": 0.07284407317638397,
"step": 219,
"step_time": 40.03303195400076
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.013671875,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01953125,
"entropy": 0.282447412610054,
"epoch": 0.0044,
"grad_norm": 1.4197849035263062,
"kl": 0.6128968577831984,
"learning_rate": 9.999937304046356e-06,
"loss": -0.0071,
"step": 220,
"step_time": 8.333434138992743
},
{
"clip_ratio/high_max": 0.008072916883975267,
"clip_ratio/high_mean": 0.005989583441987634,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005989583441987634,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2701.0,
"completions/max_terminated_length": 2701.0,
"completions/mean_length": 2418.09375,
"completions/mean_terminated_length": 2418.09375,
"completions/min_length": 412.0,
"completions/min_terminated_length": 412.0,
"entropy": 0.2881895024329424,
"epoch": 0.00442,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5842020511627197,
"kl": 0.5021013058722019,
"learning_rate": 9.99993662071868e-06,
"loss": 0.0788,
"num_tokens": 11492112.0,
"reward": 0.0020000003278255463,
"reward_std": 0.23710773885250092,
"rewards/rollout_reward_func/mean": 0.0020000003278255463,
"rewards/rollout_reward_func/std": 0.3416566848754883,
"sampling/importance_sampling_ratio/max": 2.047086715698242,
"sampling/importance_sampling_ratio/mean": 0.8008227944374084,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.5975122451782227,
"sampling/sampling_logp_difference/mean": 0.07265321910381317,
"step": 221,
"step_time": 39.7871799089844
},
{
"clip_ratio/high_max": 0.0360576924867928,
"clip_ratio/high_mean": 0.025841346010565758,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025841346010565758,
"entropy": 0.2943236604332924,
"epoch": 0.00444,
"grad_norm": 1.1485141515731812,
"kl": 0.37606994807720184,
"learning_rate": 9.999935933687375e-06,
"loss": 0.0736,
"step": 222,
"step_time": 8.64790611598437
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2734.0,
"completions/max_terminated_length": 2734.0,
"completions/mean_length": 2529.90625,
"completions/mean_terminated_length": 2529.90625,
"completions/min_length": 1315.0,
"completions/min_terminated_length": 1315.0,
"entropy": 0.3002755120396614,
"epoch": 0.00446,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6034713983535767,
"kl": 0.38438911363482475,
"learning_rate": 9.99993524295244e-06,
"loss": 0.0235,
"num_tokens": 11596899.0,
"reward": -0.016437498852610588,
"reward_std": 0.10589368641376495,
"rewards/rollout_reward_func/mean": -0.016437498852610588,
"rewards/rollout_reward_func/std": 0.19297447800636292,
"sampling/importance_sampling_ratio/max": 2.1702961921691895,
"sampling/importance_sampling_ratio/mean": 0.8773471117019653,
"sampling/importance_sampling_ratio/min": 0.1552795171737671,
"sampling/sampling_logp_difference/max": 1.5045547485351562,
"sampling/sampling_logp_difference/mean": 0.07015751302242279,
"step": 223,
"step_time": 41.07358841999667
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.013671875,
"clip_ratio/low_min": 0.0078125,
"clip_ratio/region_mean": 0.025390625,
"entropy": 0.30388905853033066,
"epoch": 0.00448,
"grad_norm": 1.6556518077850342,
"kl": 0.3493770435452461,
"learning_rate": 9.999934548513875e-06,
"loss": 0.0229,
"step": 224,
"step_time": 8.234722809000232
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2728.0,
"completions/max_terminated_length": 2728.0,
"completions/mean_length": 2476.4375,
"completions/mean_terminated_length": 2477.51611328125,
"completions/min_length": 1948.0,
"completions/min_terminated_length": 1948.0,
"entropy": 0.28977199271321297,
"epoch": 0.0045,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9037975072860718,
"kl": 0.3731118068099022,
"learning_rate": 9.999933850371681e-06,
"loss": -0.0358,
"num_tokens": 11699859.0,
"reward": 0.030812501907348633,
"reward_std": 0.24771924316883087,
"rewards/rollout_reward_func/mean": 0.030812501907348633,
"rewards/rollout_reward_func/std": 0.3897912800312042,
"sampling/importance_sampling_ratio/max": 2.3635151386260986,
"sampling/importance_sampling_ratio/mean": 0.9108861684799194,
"sampling/importance_sampling_ratio/min": 0.016115080565214157,
"sampling/sampling_logp_difference/max": 1.8015650510787964,
"sampling/sampling_logp_difference/mean": 0.06312668323516846,
"step": 225,
"step_time": 39.49097987600544
},
{
"clip_ratio/high_max": 0.039583333767950535,
"clip_ratio/high_mean": 0.021875000093132257,
"clip_ratio/low_mean": 0.01171875,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.033593749860301614,
"entropy": 0.28937546350061893,
"epoch": 0.00452,
"grad_norm": 1.3335111141204834,
"kl": 0.38635273836553097,
"learning_rate": 9.999933148525858e-06,
"loss": -0.0415,
"step": 226,
"step_time": 8.190281943003356
},
{
"clip_ratio/high_max": 0.018342391354963183,
"clip_ratio/high_mean": 0.009171195677481592,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009171195677481592,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2713.0,
"completions/max_terminated_length": 2713.0,
"completions/mean_length": 2423.1875,
"completions/mean_terminated_length": 2423.1875,
"completions/min_length": 392.0,
"completions/min_terminated_length": 392.0,
"entropy": 0.32319737412035465,
"epoch": 0.00454,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7308000326156616,
"kl": 0.4490476939827204,
"learning_rate": 9.999932442976408e-06,
"loss": -0.046,
"num_tokens": 11801299.0,
"reward": -0.07231250405311584,
"reward_std": 0.22803862392902374,
"rewards/rollout_reward_func/mean": -0.07231250405311584,
"rewards/rollout_reward_func/std": 0.3053916096687317,
"sampling/importance_sampling_ratio/max": 2.4876017570495605,
"sampling/importance_sampling_ratio/mean": 0.8915628790855408,
"sampling/importance_sampling_ratio/min": 0.22262075543403625,
"sampling/sampling_logp_difference/max": 1.2082533836364746,
"sampling/sampling_logp_difference/mean": 0.05457788705825806,
"step": 227,
"step_time": 39.01180349101196
},
{
"clip_ratio/high_max": 0.026154891354963183,
"clip_ratio/high_mean": 0.015030570677481592,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.020889945793896914,
"entropy": 0.322303120046854,
"epoch": 0.00456,
"grad_norm": 1.5136457681655884,
"kl": 0.46418991312384605,
"learning_rate": 9.99993173372333e-06,
"loss": -0.0493,
"step": 228,
"step_time": 9.05183739597851
},
{
"clip_ratio/high_max": 0.009114583488553762,
"clip_ratio/high_mean": 0.006510416744276881,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006510416744276881,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2727.0,
"completions/max_terminated_length": 2727.0,
"completions/mean_length": 2484.28125,
"completions/mean_terminated_length": 2484.28125,
"completions/min_length": 1000.0,
"completions/min_terminated_length": 1000.0,
"entropy": 0.28009452298283577,
"epoch": 0.00458,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4699045419692993,
"kl": 0.3860022109001875,
"learning_rate": 9.999931020766626e-06,
"loss": 0.0266,
"num_tokens": 11904330.0,
"reward": 0.022187501192092896,
"reward_std": 0.1748497486114502,
"rewards/rollout_reward_func/mean": 0.022187501192092896,
"rewards/rollout_reward_func/std": 0.28647732734680176,
"sampling/importance_sampling_ratio/max": 2.1559696197509766,
"sampling/importance_sampling_ratio/mean": 0.9133785963058472,
"sampling/importance_sampling_ratio/min": 0.10093791782855988,
"sampling/sampling_logp_difference/max": 1.0922698974609375,
"sampling/sampling_logp_difference/mean": 0.05431393161416054,
"step": 229,
"step_time": 39.775225694000255
},
{
"clip_ratio/high_max": 0.028645833488553762,
"clip_ratio/high_mean": 0.014322916744276881,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01627604174427688,
"entropy": 0.2788592744618654,
"epoch": 0.0046,
"grad_norm": 1.1860452890396118,
"kl": 0.34408247936517,
"learning_rate": 9.999930304106296e-06,
"loss": 0.0194,
"step": 230,
"step_time": 8.248418390990992
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2766.0,
"completions/max_terminated_length": 2766.0,
"completions/mean_length": 2515.53125,
"completions/mean_terminated_length": 2515.53125,
"completions/min_length": 1013.0,
"completions/min_terminated_length": 1013.0,
"entropy": 0.27757587283849716,
"epoch": 0.00462,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5308306217193604,
"kl": 0.2734272815287113,
"learning_rate": 9.99992958374234e-06,
"loss": -0.1329,
"num_tokens": 12008684.0,
"reward": -0.022812500596046448,
"reward_std": 0.12868362665176392,
"rewards/rollout_reward_func/mean": -0.022812500596046448,
"rewards/rollout_reward_func/std": 0.18984195590019226,
"sampling/importance_sampling_ratio/max": 1.9966732263565063,
"sampling/importance_sampling_ratio/mean": 0.9174094796180725,
"sampling/importance_sampling_ratio/min": 0.1465490758419037,
"sampling/sampling_logp_difference/max": 1.04158353805542,
"sampling/sampling_logp_difference/mean": 0.05047091096639633,
"step": 231,
"step_time": 41.81639621800423
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.0038470644503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013612689450383186,
"entropy": 0.27455075457692146,
"epoch": 0.00464,
"grad_norm": 1.48916494846344,
"kl": 0.2888593841344118,
"learning_rate": 9.999928859674762e-06,
"loss": -0.135,
"step": 232,
"step_time": 8.280193610000424
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2694.0,
"completions/max_terminated_length": 2694.0,
"completions/mean_length": 2548.03125,
"completions/mean_terminated_length": 2548.03125,
"completions/min_length": 2429.0,
"completions/min_terminated_length": 2429.0,
"entropy": 0.29389649629592896,
"epoch": 0.00466,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9648756980895996,
"kl": 0.4565126970410347,
"learning_rate": 9.999928131903557e-06,
"loss": -0.0874,
"num_tokens": 12114168.0,
"reward": 0.008375001139938831,
"reward_std": 0.0767504870891571,
"rewards/rollout_reward_func/mean": 0.008375001139938831,
"rewards/rollout_reward_func/std": 0.08380843698978424,
"sampling/importance_sampling_ratio/max": 1.532104253768921,
"sampling/importance_sampling_ratio/mean": 0.8359918594360352,
"sampling/importance_sampling_ratio/min": 0.06460016965866089,
"sampling/sampling_logp_difference/max": 1.3589129447937012,
"sampling/sampling_logp_difference/mean": 0.05609213560819626,
"step": 233,
"step_time": 42.033232877001865
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.013556985184550285,
"clip_ratio/low_mean": 0.01953125,
"clip_ratio/low_min": 0.0078125,
"clip_ratio/region_mean": 0.033088235184550285,
"entropy": 0.2880493104457855,
"epoch": 0.00468,
"grad_norm": 1.3010988235473633,
"kl": 0.5239596497267485,
"learning_rate": 9.999927400428733e-06,
"loss": -0.0924,
"step": 234,
"step_time": 8.519997568997496
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2655.0,
"completions/max_terminated_length": 2655.0,
"completions/mean_length": 2469.875,
"completions/mean_terminated_length": 2469.875,
"completions/min_length": 1325.0,
"completions/min_terminated_length": 1325.0,
"entropy": 0.25798671692609787,
"epoch": 0.0047,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7982852458953857,
"kl": 0.40759785287082195,
"learning_rate": 9.999926665250287e-06,
"loss": -0.0376,
"num_tokens": 12216592.0,
"reward": 0.03531249985098839,
"reward_std": 0.16736529767513275,
"rewards/rollout_reward_func/mean": 0.03531249985098839,
"rewards/rollout_reward_func/std": 0.2541143298149109,
"sampling/importance_sampling_ratio/max": 2.9104549884796143,
"sampling/importance_sampling_ratio/mean": 0.8951402902603149,
"sampling/importance_sampling_ratio/min": 0.101445771753788,
"sampling/sampling_logp_difference/max": 1.7230520248413086,
"sampling/sampling_logp_difference/mean": 0.0640086904168129,
"step": 235,
"step_time": 40.341744027995446
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.016075721010565758,
"clip_ratio/low_mean": 0.01953125,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.03560697101056576,
"entropy": 0.25308277271687984,
"epoch": 0.00472,
"grad_norm": 1.467527985572815,
"kl": 0.4257641229778528,
"learning_rate": 9.999925926368217e-06,
"loss": -0.042,
"step": 236,
"step_time": 7.9956631579989335
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2712.0,
"completions/max_terminated_length": 2712.0,
"completions/mean_length": 2452.53125,
"completions/mean_terminated_length": 2452.53125,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"entropy": 0.2688504420220852,
"epoch": 0.00474,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3559819459915161,
"kl": 0.18561278842389584,
"learning_rate": 9.999925183782528e-06,
"loss": -0.0083,
"num_tokens": 12318451.0,
"reward": -0.0006874985992908478,
"reward_std": 0.11364820599555969,
"rewards/rollout_reward_func/mean": -0.0006874985992908478,
"rewards/rollout_reward_func/std": 0.19639894366264343,
"sampling/importance_sampling_ratio/max": 1.7619540691375732,
"sampling/importance_sampling_ratio/mean": 1.017667293548584,
"sampling/importance_sampling_ratio/min": 0.13964326679706573,
"sampling/sampling_logp_difference/max": 0.9589061737060547,
"sampling/sampling_logp_difference/mean": 0.045998990535736084,
"step": 237,
"step_time": 40.39693818700471
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.26928373239934444,
"epoch": 0.00476,
"grad_norm": 1.3246057033538818,
"kl": 0.19937893375754356,
"learning_rate": 9.99992443749322e-06,
"loss": -0.0084,
"step": 238,
"step_time": 8.14953701399645
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2671.0,
"completions/max_terminated_length": 2671.0,
"completions/mean_length": 2486.78125,
"completions/mean_terminated_length": 2486.78125,
"completions/min_length": 1348.0,
"completions/min_terminated_length": 1348.0,
"entropy": 0.2892954871058464,
"epoch": 0.00478,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.781704306602478,
"kl": 0.2630540318787098,
"learning_rate": 9.99992368750029e-06,
"loss": -0.0765,
"num_tokens": 12421757.0,
"reward": 0.01875000074505806,
"reward_std": 0.18447396159172058,
"rewards/rollout_reward_func/mean": 0.01875000074505806,
"rewards/rollout_reward_func/std": 0.2730207145214081,
"sampling/importance_sampling_ratio/max": 1.9122931957244873,
"sampling/importance_sampling_ratio/mean": 1.0855708122253418,
"sampling/importance_sampling_ratio/min": 0.31768321990966797,
"sampling/sampling_logp_difference/max": 1.4190378189086914,
"sampling/sampling_logp_difference/mean": 0.05036480724811554,
"step": 239,
"step_time": 41.20987309400516
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.010216346243396401,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.014122596243396401,
"entropy": 0.28818782418966293,
"epoch": 0.0048,
"grad_norm": 1.4414920806884766,
"kl": 0.2751742023974657,
"learning_rate": 9.999922933803743e-06,
"loss": -0.0806,
"step": 240,
"step_time": 8.008296779989905
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2711.0,
"completions/max_terminated_length": 2711.0,
"completions/mean_length": 2487.875,
"completions/mean_terminated_length": 2487.875,
"completions/min_length": 396.0,
"completions/min_terminated_length": 396.0,
"entropy": 0.24784068576991558,
"epoch": 0.00482,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8702000379562378,
"kl": 0.5258154030889273,
"learning_rate": 9.999922176403579e-06,
"loss": -0.098,
"num_tokens": 12524685.0,
"reward": 0.015625,
"reward_std": 0.08874667435884476,
"rewards/rollout_reward_func/mean": 0.015625,
"rewards/rollout_reward_func/std": 0.15146461129188538,
"sampling/importance_sampling_ratio/max": 2.4689791202545166,
"sampling/importance_sampling_ratio/mean": 1.0352541208267212,
"sampling/importance_sampling_ratio/min": 0.2149888277053833,
"sampling/sampling_logp_difference/max": 1.6170353889465332,
"sampling/sampling_logp_difference/mean": 0.051200881600379944,
"step": 241,
"step_time": 40.59959981197608
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.24527825601398945,
"epoch": 0.00484,
"grad_norm": 1.7831230163574219,
"kl": 0.5612409617751837,
"learning_rate": 9.999921415299796e-06,
"loss": -0.1007,
"step": 242,
"step_time": 8.120403088985768
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2669.0,
"completions/max_terminated_length": 2669.0,
"completions/mean_length": 2423.9375,
"completions/mean_terminated_length": 2422.54833984375,
"completions/min_length": 1038.0,
"completions/min_terminated_length": 1038.0,
"entropy": 0.27891671285033226,
"epoch": 0.00486,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3307502269744873,
"kl": 0.5300094112753868,
"learning_rate": 9.9999206504924e-06,
"loss": -0.1379,
"num_tokens": 12626151.0,
"reward": -0.0572500005364418,
"reward_std": 0.22322788834571838,
"rewards/rollout_reward_func/mean": -0.0572500005364418,
"rewards/rollout_reward_func/std": 0.306132048368454,
"sampling/importance_sampling_ratio/max": 2.0466506481170654,
"sampling/importance_sampling_ratio/mean": 0.8789876699447632,
"sampling/importance_sampling_ratio/min": 0.191788911819458,
"sampling/sampling_logp_difference/max": 0.933627724647522,
"sampling/sampling_logp_difference/mean": 0.05595602095127106,
"step": 243,
"step_time": 39.27026320499135
},
{
"clip_ratio/high_max": 0.019644474843516946,
"clip_ratio/high_mean": 0.009822237421758473,
"clip_ratio/low_mean": 0.010416666744276881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.020238904166035354,
"entropy": 0.27759014815092087,
"epoch": 0.00488,
"grad_norm": 1.225545883178711,
"kl": 0.5295771025121212,
"learning_rate": 9.999919881981385e-06,
"loss": -0.1401,
"step": 244,
"step_time": 8.500708511011908
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2716.0,
"completions/max_terminated_length": 2716.0,
"completions/mean_length": 2497.65625,
"completions/mean_terminated_length": 2497.65625,
"completions/min_length": 381.0,
"completions/min_terminated_length": 381.0,
"entropy": 0.250592777505517,
"epoch": 0.0049,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.694082498550415,
"kl": 0.4345118338242173,
"learning_rate": 9.99991910976676e-06,
"loss": -0.082,
"num_tokens": 12729532.0,
"reward": 0.0031249974854290485,
"reward_std": 0.1167527362704277,
"rewards/rollout_reward_func/mean": 0.0031249974854290485,
"rewards/rollout_reward_func/std": 0.20330238342285156,
"sampling/importance_sampling_ratio/max": 2.3724825382232666,
"sampling/importance_sampling_ratio/mean": 0.9662559032440186,
"sampling/importance_sampling_ratio/min": 0.043532487004995346,
"sampling/sampling_logp_difference/max": 1.2320823669433594,
"sampling/sampling_logp_difference/mean": 0.06254242360591888,
"step": 245,
"step_time": 41.82365344599384
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.013671875,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.021484375,
"entropy": 0.24536587297916412,
"epoch": 0.00492,
"grad_norm": 1.4003206491470337,
"kl": 0.4776889346539974,
"learning_rate": 9.999918333848517e-06,
"loss": -0.0839,
"step": 246,
"step_time": 8.143517907003115
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2674.0,
"completions/max_terminated_length": 2674.0,
"completions/mean_length": 2504.3125,
"completions/mean_terminated_length": 2504.3125,
"completions/min_length": 1564.0,
"completions/min_terminated_length": 1564.0,
"entropy": 0.23114917986094952,
"epoch": 0.00494,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.211030125617981,
"kl": 0.3891689907759428,
"learning_rate": 9.999917554226663e-06,
"loss": -0.1096,
"num_tokens": 12833664.0,
"reward": 0.07406249642372131,
"reward_std": 0.11376680433750153,
"rewards/rollout_reward_func/mean": 0.07406249642372131,
"rewards/rollout_reward_func/std": 0.2270122915506363,
"sampling/importance_sampling_ratio/max": 2.0455029010772705,
"sampling/importance_sampling_ratio/mean": 0.9785677194595337,
"sampling/importance_sampling_ratio/min": 0.29773226380348206,
"sampling/sampling_logp_difference/max": 1.1002979278564453,
"sampling/sampling_logp_difference/mean": 0.04772745072841644,
"step": 247,
"step_time": 41.15195580800355
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.22527994960546494,
"epoch": 0.00496,
"grad_norm": 1.1739858388900757,
"kl": 0.4161613564938307,
"learning_rate": 9.999916770901197e-06,
"loss": -0.1122,
"step": 248,
"step_time": 8.022777628990298
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2729.0,
"completions/max_terminated_length": 2729.0,
"completions/mean_length": 2477.5,
"completions/mean_terminated_length": 2477.5,
"completions/min_length": 365.0,
"completions/min_terminated_length": 365.0,
"entropy": 0.24676493927836418,
"epoch": 0.00498,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6627272367477417,
"kl": 0.24644476640969515,
"learning_rate": 9.999915983872118e-06,
"loss": -0.0664,
"num_tokens": 12936851.0,
"reward": 0.002499997615814209,
"reward_std": 0.11950606107711792,
"rewards/rollout_reward_func/mean": 0.002499997615814209,
"rewards/rollout_reward_func/std": 0.20879067480564117,
"sampling/importance_sampling_ratio/max": 2.1652746200561523,
"sampling/importance_sampling_ratio/mean": 1.0998907089233398,
"sampling/importance_sampling_ratio/min": 0.40092340111732483,
"sampling/sampling_logp_difference/max": 0.9734196662902832,
"sampling/sampling_logp_difference/mean": 0.04127663001418114,
"step": 249,
"step_time": 41.39586626898381
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.24513536132872105,
"epoch": 0.005,
"grad_norm": 1.5830745697021484,
"kl": 0.23460615891963243,
"learning_rate": 9.99991519313943e-06,
"loss": -0.0699,
"step": 250,
"step_time": 8.650949458977266
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2699.0,
"completions/max_terminated_length": 2699.0,
"completions/mean_length": 2560.125,
"completions/mean_terminated_length": 2560.125,
"completions/min_length": 2440.0,
"completions/min_terminated_length": 2440.0,
"entropy": 0.24511336907744408,
"epoch": 0.00502,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6060280799865723,
"kl": 0.24437487684190273,
"learning_rate": 9.999914398703129e-06,
"loss": -0.0899,
"num_tokens": 13042591.0,
"reward": 0.03218749910593033,
"reward_std": 0.07532116770744324,
"rewards/rollout_reward_func/mean": 0.03218749910593033,
"rewards/rollout_reward_func/std": 0.0803048387169838,
"sampling/importance_sampling_ratio/max": 2.953709363937378,
"sampling/importance_sampling_ratio/mean": 1.006150245666504,
"sampling/importance_sampling_ratio/min": 0.2181449681520462,
"sampling/sampling_logp_difference/max": 1.0847293138504028,
"sampling/sampling_logp_difference/mean": 0.05197053402662277,
"step": 251,
"step_time": 41.052220668010705
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.01171875,
"clip_ratio/low_min": 0.0078125,
"clip_ratio/region_mean": 0.01953125,
"entropy": 0.24010136537253857,
"epoch": 0.00504,
"grad_norm": 1.5573564767837524,
"kl": 0.24994135182350874,
"learning_rate": 9.99991360056322e-06,
"loss": -0.0979,
"step": 252,
"step_time": 8.086292344996764
},
{
"clip_ratio/high_max": 0.013247282709926367,
"clip_ratio/high_mean": 0.006623641354963183,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010529891354963183,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2742.0,
"completions/max_terminated_length": 2742.0,
"completions/mean_length": 2552.59375,
"completions/mean_terminated_length": 2551.61279296875,
"completions/min_length": 2362.0,
"completions/min_terminated_length": 2362.0,
"entropy": 0.26262959092855453,
"epoch": 0.00506,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5103635787963867,
"kl": 0.4165810886770487,
"learning_rate": 9.999912798719703e-06,
"loss": -0.0666,
"num_tokens": 13147490.0,
"reward": 0.036249998956918716,
"reward_std": 0.07718061655759811,
"rewards/rollout_reward_func/mean": 0.036249998956918716,
"rewards/rollout_reward_func/std": 0.07884284108877182,
"sampling/importance_sampling_ratio/max": 2.046628952026367,
"sampling/importance_sampling_ratio/mean": 1.0505952835083008,
"sampling/importance_sampling_ratio/min": 0.12941874563694,
"sampling/sampling_logp_difference/max": 1.3702301979064941,
"sampling/sampling_logp_difference/mean": 0.05030977725982666,
"step": 253,
"step_time": 40.4351113330049
},
{
"clip_ratio/high_max": 0.006623641354963183,
"clip_ratio/high_mean": 0.0033118206774815917,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005264945677481592,
"entropy": 0.2602510582655668,
"epoch": 0.00508,
"grad_norm": 1.5389606952667236,
"kl": 0.46528979297727346,
"learning_rate": 9.999911993172577e-06,
"loss": -0.0669,
"step": 254,
"step_time": 8.200509453003178
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2738.0,
"completions/max_terminated_length": 2738.0,
"completions/mean_length": 2551.5,
"completions/mean_terminated_length": 2547.0322265625,
"completions/min_length": 2381.0,
"completions/min_terminated_length": 2381.0,
"entropy": 0.26974271424114704,
"epoch": 0.0051,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5606977939605713,
"kl": 0.6086874194443226,
"learning_rate": 9.999911183921846e-06,
"loss": -0.053,
"num_tokens": 13253080.0,
"reward": 0.027187500149011612,
"reward_std": 0.05690932646393776,
"rewards/rollout_reward_func/mean": 0.027187500149011612,
"rewards/rollout_reward_func/std": 0.07035551965236664,
"sampling/importance_sampling_ratio/max": 1.5175119638442993,
"sampling/importance_sampling_ratio/mean": 0.8066157698631287,
"sampling/importance_sampling_ratio/min": 0.0783872902393341,
"sampling/sampling_logp_difference/max": 1.5892443656921387,
"sampling/sampling_logp_difference/mean": 0.057283081114292145,
"step": 255,
"step_time": 40.19062403999851
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.011124320677481592,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.01893682067748159,
"entropy": 0.26865614764392376,
"epoch": 0.00512,
"grad_norm": 1.0609331130981445,
"kl": 0.5949009079486132,
"learning_rate": 9.999910370967508e-06,
"loss": -0.0575,
"step": 256,
"step_time": 9.087063239996496
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2723.0,
"completions/max_terminated_length": 2723.0,
"completions/mean_length": 2566.0,
"completions/mean_terminated_length": 2566.0,
"completions/min_length": 2383.0,
"completions/min_terminated_length": 2383.0,
"entropy": 0.26759802363812923,
"epoch": 0.00514,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5336532592773438,
"kl": 0.2863121032714844,
"learning_rate": 9.999909554309565e-06,
"loss": 0.0539,
"num_tokens": 13358742.0,
"reward": 0.027187500149011612,
"reward_std": 0.05224156379699707,
"rewards/rollout_reward_func/mean": 0.027187500149011612,
"rewards/rollout_reward_func/std": 0.052254609763622284,
"sampling/importance_sampling_ratio/max": 2.2579076290130615,
"sampling/importance_sampling_ratio/mean": 1.145774006843567,
"sampling/importance_sampling_ratio/min": 0.29213541746139526,
"sampling/sampling_logp_difference/max": 1.19516921043396,
"sampling/sampling_logp_difference/mean": 0.04962974041700363,
"step": 257,
"step_time": 40.889848869999696
},
{
"clip_ratio/high_max": 0.018342391354963183,
"clip_ratio/high_mean": 0.009171195677481592,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.01698369567748159,
"entropy": 0.27176199294626713,
"epoch": 0.00516,
"grad_norm": 1.3151822090148926,
"kl": 0.2789830360561609,
"learning_rate": 9.999908733948019e-06,
"loss": 0.0496,
"step": 258,
"step_time": 8.165247033000924
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2657.0,
"completions/max_terminated_length": 2657.0,
"completions/mean_length": 2521.90625,
"completions/mean_terminated_length": 2521.90625,
"completions/min_length": 2107.0,
"completions/min_terminated_length": 2107.0,
"entropy": 0.23622582480311394,
"epoch": 0.00518,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5044795274734497,
"kl": 0.44032028317451477,
"learning_rate": 9.999907909882866e-06,
"loss": 0.0241,
"num_tokens": 13463276.0,
"reward": -0.002250001300126314,
"reward_std": 0.08689141273498535,
"rewards/rollout_reward_func/mean": -0.002250001300126314,
"rewards/rollout_reward_func/std": 0.15675972402095795,
"sampling/importance_sampling_ratio/max": 2.597571611404419,
"sampling/importance_sampling_ratio/mean": 0.8917824029922485,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9829277992248535,
"sampling/sampling_logp_difference/mean": 0.048439737409353256,
"step": 259,
"step_time": 40.84053165099613
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.23765388131141663,
"epoch": 0.0052,
"grad_norm": 1.459328293800354,
"kl": 0.42800300754606724,
"learning_rate": 9.999907082114113e-06,
"loss": 0.023,
"step": 260,
"step_time": 8.04916648499784
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2638.0,
"completions/max_terminated_length": 2638.0,
"completions/mean_length": 2504.9375,
"completions/mean_terminated_length": 2504.9375,
"completions/min_length": 2209.0,
"completions/min_terminated_length": 2209.0,
"entropy": 0.2611371409147978,
"epoch": 0.00522,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9348623752593994,
"kl": 0.8414626121520996,
"learning_rate": 9.999906250641757e-06,
"loss": 0.0554,
"num_tokens": 13566928.0,
"reward": 0.03687499463558197,
"reward_std": 0.17472779750823975,
"rewards/rollout_reward_func/mean": 0.03687499463558197,
"rewards/rollout_reward_func/std": 0.25037309527397156,
"sampling/importance_sampling_ratio/max": 2.3028323650360107,
"sampling/importance_sampling_ratio/mean": 0.8207236528396606,
"sampling/importance_sampling_ratio/min": 0.11133048683404922,
"sampling/sampling_logp_difference/max": 1.8047242164611816,
"sampling/sampling_logp_difference/mean": 0.06321839988231659,
"step": 261,
"step_time": 40.17522978800116
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.017578125,
"entropy": 0.266588669270277,
"epoch": 0.00524,
"grad_norm": 1.7961900234222412,
"kl": 0.6727431304752827,
"learning_rate": 9.9999054154658e-06,
"loss": 0.0517,
"step": 262,
"step_time": 8.483183490003285
},
{
"clip_ratio/high_max": 0.010529891354963183,
"clip_ratio/high_mean": 0.005264945677481592,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007218070677481592,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2704.0,
"completions/max_terminated_length": 2704.0,
"completions/mean_length": 2549.53125,
"completions/mean_terminated_length": 2544.54833984375,
"completions/min_length": 2336.0,
"completions/min_terminated_length": 2336.0,
"entropy": 0.2517882902175188,
"epoch": 0.00526,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2822296619415283,
"kl": 0.27465381287038326,
"learning_rate": 9.999904576586242e-06,
"loss": -0.2304,
"num_tokens": 13672548.0,
"reward": 0.017374996095895767,
"reward_std": 0.15025976300239563,
"rewards/rollout_reward_func/mean": 0.017374996095895767,
"rewards/rollout_reward_func/std": 0.2624521553516388,
"sampling/importance_sampling_ratio/max": 2.4090371131896973,
"sampling/importance_sampling_ratio/mean": 1.0326308012008667,
"sampling/importance_sampling_ratio/min": 0.15842844545841217,
"sampling/sampling_logp_difference/max": 1.7556763887405396,
"sampling/sampling_logp_difference/mean": 0.037125516682863235,
"step": 263,
"step_time": 40.47858677999466
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.013077445793896914,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013077445793896914,
"entropy": 0.2592135164886713,
"epoch": 0.00528,
"grad_norm": 1.7404944896697998,
"kl": 0.2305660918354988,
"learning_rate": 9.999903734003084e-06,
"loss": -0.2328,
"step": 264,
"step_time": 8.174356406998413
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2761.0,
"completions/max_terminated_length": 2761.0,
"completions/mean_length": 2463.1875,
"completions/mean_terminated_length": 2463.1875,
"completions/min_length": 1167.0,
"completions/min_terminated_length": 1167.0,
"entropy": 0.2684826646000147,
"epoch": 0.0053,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.1070209741592407,
"kl": 0.22240010928362608,
"learning_rate": 9.999902887716329e-06,
"loss": -0.0723,
"num_tokens": 13774961.0,
"reward": 0.0507499985396862,
"reward_std": 0.1717434823513031,
"rewards/rollout_reward_func/mean": 0.0507499985396862,
"rewards/rollout_reward_func/std": 0.29026561975479126,
"sampling/importance_sampling_ratio/max": 2.1808736324310303,
"sampling/importance_sampling_ratio/mean": 1.1364681720733643,
"sampling/importance_sampling_ratio/min": 0.30863240361213684,
"sampling/sampling_logp_difference/max": 0.7152066230773926,
"sampling/sampling_logp_difference/mean": 0.039849814027547836,
"step": 265,
"step_time": 40.26420498000516
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.2720201928168535,
"epoch": 0.00532,
"grad_norm": 0.9975273609161377,
"kl": 0.2078150687739253,
"learning_rate": 9.999902037725978e-06,
"loss": -0.0747,
"step": 266,
"step_time": 8.240107650999562
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2718.0,
"completions/max_terminated_length": 2718.0,
"completions/mean_length": 2549.78125,
"completions/mean_terminated_length": 2549.78125,
"completions/min_length": 2220.0,
"completions/min_terminated_length": 2220.0,
"entropy": 0.2624861355870962,
"epoch": 0.00534,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.9669392108917236,
"kl": 0.37778835743665695,
"learning_rate": 9.999901184032026e-06,
"loss": 0.0082,
"num_tokens": 13880362.0,
"reward": 0.037312500178813934,
"reward_std": 0.043149061501026154,
"rewards/rollout_reward_func/mean": 0.037312500178813934,
"rewards/rollout_reward_func/std": 0.0625389888882637,
"sampling/importance_sampling_ratio/max": 1.9471007585525513,
"sampling/importance_sampling_ratio/mean": 0.9066208004951477,
"sampling/importance_sampling_ratio/min": 0.1761004775762558,
"sampling/sampling_logp_difference/max": 1.6079485416412354,
"sampling/sampling_logp_difference/mean": 0.04763280600309372,
"step": 267,
"step_time": 42.28432625199639
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0033118206774815917,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0033118206774815917,
"entropy": 0.2652330882847309,
"epoch": 0.00536,
"grad_norm": 1.0828042030334473,
"kl": 0.37003960087895393,
"learning_rate": 9.999900326634479e-06,
"loss": 0.0064,
"step": 268,
"step_time": 8.174676374001137
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0033118206774815917,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005264945677481592,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2634.0,
"completions/max_terminated_length": 2634.0,
"completions/mean_length": 2555.71875,
"completions/mean_terminated_length": 2556.64501953125,
"completions/min_length": 2424.0,
"completions/min_terminated_length": 2424.0,
"entropy": 0.3060521185398102,
"epoch": 0.00538,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.796020269393921,
"kl": 0.24016336910426617,
"learning_rate": 9.999899465533338e-06,
"loss": -0.1678,
"num_tokens": 13985568.0,
"reward": 0.03531249985098839,
"reward_std": 0.060174889862537384,
"rewards/rollout_reward_func/mean": 0.03531249985098839,
"rewards/rollout_reward_func/std": 0.06242333725094795,
"sampling/importance_sampling_ratio/max": 2.0808072090148926,
"sampling/importance_sampling_ratio/mean": 0.9158138632774353,
"sampling/importance_sampling_ratio/min": 0.30511394143104553,
"sampling/sampling_logp_difference/max": 1.0109217166900635,
"sampling/sampling_logp_difference/mean": 0.03978275507688522,
"step": 269,
"step_time": 41.190066035007476
},
{
"clip_ratio/high_max": 0.009341032709926367,
"clip_ratio/high_mean": 0.004670516354963183,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010529891587793827,
"entropy": 0.30568893253803253,
"epoch": 0.0054,
"grad_norm": 1.3609435558319092,
"kl": 0.2305306103080511,
"learning_rate": 9.999898600728599e-06,
"loss": -0.1692,
"step": 270,
"step_time": 8.029273982981977
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2813.0,
"completions/max_terminated_length": 2813.0,
"completions/mean_length": 2546.625,
"completions/mean_terminated_length": 2546.625,
"completions/min_length": 2405.0,
"completions/min_terminated_length": 2405.0,
"entropy": 0.2608827296644449,
"epoch": 0.00542,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5197490453720093,
"kl": 0.23819277435541153,
"learning_rate": 9.99989773222027e-06,
"loss": -0.0306,
"num_tokens": 14090773.0,
"reward": 0.02968749962747097,
"reward_std": 0.03806859254837036,
"rewards/rollout_reward_func/mean": 0.02968749962747097,
"rewards/rollout_reward_func/std": 0.04582465440034866,
"sampling/importance_sampling_ratio/max": 1.835666298866272,
"sampling/importance_sampling_ratio/mean": 1.0339916944503784,
"sampling/importance_sampling_ratio/min": 0.5572738647460938,
"sampling/sampling_logp_difference/max": 0.5970277786254883,
"sampling/sampling_logp_difference/mean": 0.03422776237130165,
"step": 271,
"step_time": 41.47874712299381
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.25844705663621426,
"epoch": 0.00544,
"grad_norm": 1.385913372039795,
"kl": 0.2487698830664158,
"learning_rate": 9.999896860008346e-06,
"loss": -0.0337,
"step": 272,
"step_time": 9.234016997012077
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2687.0,
"completions/max_terminated_length": 2687.0,
"completions/mean_length": 2528.03125,
"completions/mean_terminated_length": 2528.03125,
"completions/min_length": 1631.0,
"completions/min_terminated_length": 1631.0,
"entropy": 0.2662625387310982,
"epoch": 0.00546,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4977853298187256,
"kl": 0.3419014122337103,
"learning_rate": 9.999895984092831e-06,
"loss": 0.0226,
"num_tokens": 14195896.0,
"reward": 0.055937498807907104,
"reward_std": 0.11488151550292969,
"rewards/rollout_reward_func/mean": 0.055937498807907104,
"rewards/rollout_reward_func/std": 0.19372175633907318,
"sampling/importance_sampling_ratio/max": 2.1736485958099365,
"sampling/importance_sampling_ratio/mean": 0.8548949956893921,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2168569564819336,
"sampling/sampling_logp_difference/mean": 0.05159320309758186,
"step": 273,
"step_time": 40.65139246901526
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.26714046485722065,
"epoch": 0.00548,
"grad_norm": 1.2127920389175415,
"kl": 0.3403823059052229,
"learning_rate": 9.999895104473725e-06,
"loss": 0.0179,
"step": 274,
"step_time": 8.083952489992953
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2748.0,
"completions/max_terminated_length": 2748.0,
"completions/mean_length": 2558.0,
"completions/mean_terminated_length": 2558.0,
"completions/min_length": 2395.0,
"completions/min_terminated_length": 2395.0,
"entropy": 0.25967661291360855,
"epoch": 0.0055,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3464138507843018,
"kl": 0.2503334507346153,
"learning_rate": 9.99989422115103e-06,
"loss": -0.0259,
"num_tokens": 14300978.0,
"reward": 0.0003124997019767761,
"reward_std": 0.05055317282676697,
"rewards/rollout_reward_func/mean": 0.0003124997019767761,
"rewards/rollout_reward_func/std": 0.052883580327034,
"sampling/importance_sampling_ratio/max": 1.650503158569336,
"sampling/importance_sampling_ratio/mean": 1.0017483234405518,
"sampling/importance_sampling_ratio/min": 0.39458227157592773,
"sampling/sampling_logp_difference/max": 0.954500675201416,
"sampling/sampling_logp_difference/mean": 0.03852980583906174,
"step": 275,
"step_time": 40.93569000100979
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.259623683989048,
"epoch": 0.00552,
"grad_norm": 1.112290859222412,
"kl": 0.2553240805864334,
"learning_rate": 9.999893334124745e-06,
"loss": -0.0344,
"step": 276,
"step_time": 8.276769702002639
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2681.0,
"completions/max_terminated_length": 2681.0,
"completions/mean_length": 2450.15625,
"completions/mean_terminated_length": 2450.15625,
"completions/min_length": 1182.0,
"completions/min_terminated_length": 1182.0,
"entropy": 0.2840597964823246,
"epoch": 0.00554,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.009488105773926,
"kl": 0.8314682450145483,
"learning_rate": 9.99989244339487e-06,
"loss": -0.0156,
"num_tokens": 14402590.0,
"reward": -0.04656250402331352,
"reward_std": 0.17522019147872925,
"rewards/rollout_reward_func/mean": -0.04656250402331352,
"rewards/rollout_reward_func/std": 0.2510700821876526,
"sampling/importance_sampling_ratio/max": 2.424591302871704,
"sampling/importance_sampling_ratio/mean": 1.0289294719696045,
"sampling/importance_sampling_ratio/min": 0.3011532127857208,
"sampling/sampling_logp_difference/max": 0.9531664848327637,
"sampling/sampling_logp_difference/mean": 0.04903682321310043,
"step": 277,
"step_time": 38.54316767599812
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.289629552513361,
"epoch": 0.00556,
"grad_norm": 1.7072014808654785,
"kl": 0.6213064789772034,
"learning_rate": 9.999891548961409e-06,
"loss": -0.0216,
"step": 278,
"step_time": 8.533860239993373
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.013671875,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017578125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2745.0,
"completions/max_terminated_length": 2745.0,
"completions/mean_length": 2451.25,
"completions/mean_terminated_length": 2451.25,
"completions/min_length": 474.0,
"completions/min_terminated_length": 474.0,
"entropy": 0.27815776132047176,
"epoch": 0.00558,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5860378742218018,
"kl": 0.4317518621683121,
"learning_rate": 9.999890650824362e-06,
"loss": -0.1737,
"num_tokens": 14504584.0,
"reward": -0.03437500074505806,
"reward_std": 0.18133552372455597,
"rewards/rollout_reward_func/mean": -0.03437500074505806,
"rewards/rollout_reward_func/std": 0.2963099777698517,
"sampling/importance_sampling_ratio/max": 2.2448441982269287,
"sampling/importance_sampling_ratio/mean": 0.9443175792694092,
"sampling/importance_sampling_ratio/min": 0.33127740025520325,
"sampling/sampling_logp_difference/max": 1.2472968101501465,
"sampling/sampling_logp_difference/mean": 0.04774777963757515,
"step": 279,
"step_time": 40.06899703200179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0047940341755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0047940341755747795,
"entropy": 0.28123702481389046,
"epoch": 0.0056,
"grad_norm": 1.623889684677124,
"kl": 0.3846174022182822,
"learning_rate": 9.999889748983727e-06,
"loss": -0.1747,
"step": 280,
"step_time": 8.234413726997445
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2735.0,
"completions/max_terminated_length": 2735.0,
"completions/mean_length": 2562.5,
"completions/mean_terminated_length": 2562.5,
"completions/min_length": 2294.0,
"completions/min_terminated_length": 2294.0,
"entropy": 0.28916312009096146,
"epoch": 0.00562,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.611228108406067,
"kl": 0.36890734918415546,
"learning_rate": 9.999888843439508e-06,
"loss": -0.0793,
"num_tokens": 14610050.0,
"reward": 0.07218749821186066,
"reward_std": 0.12699973583221436,
"rewards/rollout_reward_func/mean": 0.07218749821186066,
"rewards/rollout_reward_func/std": 0.22096173465251923,
"sampling/importance_sampling_ratio/max": 2.3326196670532227,
"sampling/importance_sampling_ratio/mean": 1.0808019638061523,
"sampling/importance_sampling_ratio/min": 0.31041672825813293,
"sampling/sampling_logp_difference/max": 0.9732460975646973,
"sampling/sampling_logp_difference/mean": 0.04236404597759247,
"step": 281,
"step_time": 39.99579230199015
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.288146510720253,
"epoch": 0.00564,
"grad_norm": 1.490877628326416,
"kl": 0.3540782444179058,
"learning_rate": 9.999887934191706e-06,
"loss": -0.0836,
"step": 282,
"step_time": 8.19389802900696
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2688.0,
"completions/max_terminated_length": 2688.0,
"completions/mean_length": 2480.78125,
"completions/mean_terminated_length": 2480.78125,
"completions/min_length": 1477.0,
"completions/min_terminated_length": 1477.0,
"entropy": 0.285388445481658,
"epoch": 0.00566,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3937627077102661,
"kl": 0.27888352051377296,
"learning_rate": 9.99988702124032e-06,
"loss": -0.0913,
"num_tokens": 14713203.0,
"reward": -0.02093750238418579,
"reward_std": 0.17884045839309692,
"rewards/rollout_reward_func/mean": -0.02093750238418579,
"rewards/rollout_reward_func/std": 0.2822188436985016,
"sampling/importance_sampling_ratio/max": 1.673972725868225,
"sampling/importance_sampling_ratio/mean": 0.9629006385803223,
"sampling/importance_sampling_ratio/min": 0.18777649104595184,
"sampling/sampling_logp_difference/max": 1.1442310810089111,
"sampling/sampling_logp_difference/mean": 0.04829523712396622,
"step": 283,
"step_time": 40.134448617005546
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.28673238307237625,
"epoch": 0.00568,
"grad_norm": 1.3932827711105347,
"kl": 0.26963402703404427,
"learning_rate": 9.99988610458535e-06,
"loss": -0.0945,
"step": 284,
"step_time": 9.01298976700491
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2680.0,
"completions/max_terminated_length": 2680.0,
"completions/mean_length": 2563.0625,
"completions/mean_terminated_length": 2563.0625,
"completions/min_length": 2370.0,
"completions/min_terminated_length": 2370.0,
"entropy": 0.2974665407091379,
"epoch": 0.0057,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.531980276107788,
"kl": 0.2993941828608513,
"learning_rate": 9.999885184226803e-06,
"loss": -0.0469,
"num_tokens": 14818454.0,
"reward": 0.046937502920627594,
"reward_std": 0.06454025954008102,
"rewards/rollout_reward_func/mean": 0.046937502920627594,
"rewards/rollout_reward_func/std": 0.07435742765665054,
"sampling/importance_sampling_ratio/max": 1.6543657779693604,
"sampling/importance_sampling_ratio/mean": 0.9675144553184509,
"sampling/importance_sampling_ratio/min": 0.44254976511001587,
"sampling/sampling_logp_difference/max": 0.790473461151123,
"sampling/sampling_logp_difference/mean": 0.05086861550807953,
"step": 285,
"step_time": 40.94515013400087
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.2969541698694229,
"epoch": 0.00572,
"grad_norm": 1.4943524599075317,
"kl": 0.2985868602991104,
"learning_rate": 9.999884260164672e-06,
"loss": -0.0504,
"step": 286,
"step_time": 8.124935107989586
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2706.0,
"completions/max_terminated_length": 2706.0,
"completions/mean_length": 2444.1875,
"completions/mean_terminated_length": 2441.60009765625,
"completions/min_length": 1153.0,
"completions/min_terminated_length": 1153.0,
"entropy": 0.38077646121382713,
"epoch": 0.00574,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0460994243621826,
"kl": 0.5203682221472263,
"learning_rate": 9.999883332398963e-06,
"loss": -0.2072,
"num_tokens": 14919885.0,
"reward": -0.10437499731779099,
"reward_std": 0.21541458368301392,
"rewards/rollout_reward_func/mean": -0.10437499731779099,
"rewards/rollout_reward_func/std": 0.38278213143348694,
"sampling/importance_sampling_ratio/max": 2.872018575668335,
"sampling/importance_sampling_ratio/mean": 0.9405064582824707,
"sampling/importance_sampling_ratio/min": 1.4718626317744565e-10,
"sampling/sampling_logp_difference/max": 16.81309700012207,
"sampling/sampling_logp_difference/mean": 0.09047360718250275,
"step": 287,
"step_time": 38.90498008499708
},
{
"clip_ratio/high_max": 0.01678685937076807,
"clip_ratio/high_mean": 0.008393429685384035,
"clip_ratio/low_mean": 0.005085495300590992,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013478924985975027,
"entropy": 0.3831331916153431,
"epoch": 0.00576,
"grad_norm": 1.58086359500885,
"kl": 0.4781217612326145,
"learning_rate": 9.999882400929674e-06,
"loss": -0.2108,
"step": 288,
"step_time": 8.091997119998268
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0072180707938969135,
"clip_ratio/low_min": 0.0027173913549631834,
"clip_ratio/region_mean": 0.0072180707938969135,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2729.0,
"completions/max_terminated_length": 2729.0,
"completions/mean_length": 2518.25,
"completions/mean_terminated_length": 2518.25,
"completions/min_length": 1788.0,
"completions/min_terminated_length": 1788.0,
"entropy": 0.3387117236852646,
"epoch": 0.00578,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3513823747634888,
"kl": 0.9334820155054331,
"learning_rate": 9.99988146575681e-06,
"loss": -0.0198,
"num_tokens": 15024124.0,
"reward": 0.01056249625980854,
"reward_std": 0.18217787146568298,
"rewards/rollout_reward_func/mean": 0.01056249625980854,
"rewards/rollout_reward_func/std": 0.29733407497406006,
"sampling/importance_sampling_ratio/max": 1.9554623365402222,
"sampling/importance_sampling_ratio/mean": 0.7435547113418579,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.4795942306518555,
"sampling/sampling_logp_difference/mean": 0.057336658239364624,
"step": 289,
"step_time": 39.99209722400701
},
{
"clip_ratio/high_max": 0.017574606114067137,
"clip_ratio/high_mean": 0.008787303057033569,
"clip_ratio/low_mean": 0.0013586956774815917,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01014599873451516,
"entropy": 0.3432500846683979,
"epoch": 0.0058,
"grad_norm": 1.1709140539169312,
"kl": 0.8469271510839462,
"learning_rate": 9.999880526880366e-06,
"loss": -0.0209,
"step": 290,
"step_time": 8.202411102996848
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.006023503257893026,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009929753257893026,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2665.0,
"completions/max_terminated_length": 2665.0,
"completions/mean_length": 2454.25,
"completions/mean_terminated_length": 2454.25,
"completions/min_length": 1069.0,
"completions/min_terminated_length": 1069.0,
"entropy": 0.30111620761454105,
"epoch": 0.00582,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.09559166431427,
"kl": 0.43403775803744793,
"learning_rate": 9.999879584300349e-06,
"loss": 0.035,
"num_tokens": 15125746.0,
"reward": 0.028999999165534973,
"reward_std": 0.17057111859321594,
"rewards/rollout_reward_func/mean": 0.028999999165534973,
"rewards/rollout_reward_func/std": 0.28665322065353394,
"sampling/importance_sampling_ratio/max": 1.8856747150421143,
"sampling/importance_sampling_ratio/mean": 1.0690852403640747,
"sampling/importance_sampling_ratio/min": 0.49591487646102905,
"sampling/sampling_logp_difference/max": 1.3662643432617188,
"sampling/sampling_logp_difference/mean": 0.047344714403152466,
"step": 291,
"step_time": 39.35214727100538
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.3075068034231663,
"epoch": 0.00584,
"grad_norm": 1.0851396322250366,
"kl": 0.39455585554242134,
"learning_rate": 9.999878638016756e-06,
"loss": 0.0324,
"step": 292,
"step_time": 8.023776346992236
},
{
"clip_ratio/high_max": 0.0037878789007663727,
"clip_ratio/high_mean": 0.0018939394503831863,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0037321747513487935,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2712.0,
"completions/max_terminated_length": 2712.0,
"completions/mean_length": 2457.25,
"completions/mean_terminated_length": 2457.25,
"completions/min_length": 487.0,
"completions/min_terminated_length": 487.0,
"entropy": 0.2864312566816807,
"epoch": 0.00586,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0765045881271362,
"kl": 0.3095027208328247,
"learning_rate": 9.99987768802959e-06,
"loss": -0.0325,
"num_tokens": 15228123.0,
"reward": 0.029375001788139343,
"reward_std": 0.31879091262817383,
"rewards/rollout_reward_func/mean": 0.029375001788139343,
"rewards/rollout_reward_func/std": 0.41879963874816895,
"sampling/importance_sampling_ratio/max": 1.6113137006759644,
"sampling/importance_sampling_ratio/mean": 0.8610400557518005,
"sampling/importance_sampling_ratio/min": 0.25993117690086365,
"sampling/sampling_logp_difference/max": 0.8391194343566895,
"sampling/sampling_logp_difference/mean": 0.04624336585402489,
"step": 293,
"step_time": 38.63772508600232
},
{
"clip_ratio/high_max": 0.015885416883975267,
"clip_ratio/high_mean": 0.007942708441987634,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011848958441987634,
"entropy": 0.2931174673140049,
"epoch": 0.00588,
"grad_norm": 0.8959990739822388,
"kl": 0.2937803156673908,
"learning_rate": 9.99987673433885e-06,
"loss": -0.0353,
"step": 294,
"step_time": 8.147136625986604
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2764.0,
"completions/max_terminated_length": 2764.0,
"completions/mean_length": 2505.09375,
"completions/mean_terminated_length": 2505.09375,
"completions/min_length": 1295.0,
"completions/min_terminated_length": 1295.0,
"entropy": 0.3480229079723358,
"epoch": 0.0059,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3411896228790283,
"kl": 0.24798605777323246,
"learning_rate": 9.999875776944539e-06,
"loss": 0.0081,
"num_tokens": 15332692.0,
"reward": -0.0015624996740370989,
"reward_std": 0.07828617095947266,
"rewards/rollout_reward_func/mean": -0.0015624996740370989,
"rewards/rollout_reward_func/std": 0.0892300084233284,
"sampling/importance_sampling_ratio/max": 1.658194899559021,
"sampling/importance_sampling_ratio/mean": 0.9416136741638184,
"sampling/importance_sampling_ratio/min": 0.3543452024459839,
"sampling/sampling_logp_difference/max": 0.45158815383911133,
"sampling/sampling_logp_difference/mean": 0.042671412229537964,
"step": 295,
"step_time": 41.45644505799282
},
{
"clip_ratio/high_max": 0.015395220601931214,
"clip_ratio/high_mean": 0.007697610300965607,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007697610300965607,
"entropy": 0.35514455288648605,
"epoch": 0.00592,
"grad_norm": 1.2621599435806274,
"kl": 0.23556838184595108,
"learning_rate": 9.999874815846656e-06,
"loss": 0.0039,
"step": 296,
"step_time": 8.289278871998249
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2792.0,
"completions/max_terminated_length": 2792.0,
"completions/mean_length": 2558.21875,
"completions/mean_terminated_length": 2558.21875,
"completions/min_length": 732.0,
"completions/min_terminated_length": 732.0,
"entropy": 0.331636942923069,
"epoch": 0.00594,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0715786218643188,
"kl": 0.319051219150424,
"learning_rate": 9.999873851045202e-06,
"loss": -0.0962,
"num_tokens": 15438054.0,
"reward": -0.015625,
"reward_std": 0.12047843635082245,
"rewards/rollout_reward_func/mean": -0.015625,
"rewards/rollout_reward_func/std": 0.1917649358510971,
"sampling/importance_sampling_ratio/max": 1.5377081632614136,
"sampling/importance_sampling_ratio/mean": 0.8829039335250854,
"sampling/importance_sampling_ratio/min": 0.27864915132522583,
"sampling/sampling_logp_difference/max": 0.9243254661560059,
"sampling/sampling_logp_difference/mean": 0.05171851068735123,
"step": 297,
"step_time": 40.14633480698831
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.005800189450383186,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017518939450383186,
"entropy": 0.3420854024589062,
"epoch": 0.00596,
"grad_norm": 0.9368278384208679,
"kl": 0.321561723947525,
"learning_rate": 9.999872882540181e-06,
"loss": -0.1013,
"step": 298,
"step_time": 8.38549888000125
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0033118206774815917,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005264945677481592,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2659.0,
"completions/max_terminated_length": 2659.0,
"completions/mean_length": 2429.96875,
"completions/mean_terminated_length": 2429.96875,
"completions/min_length": 1326.0,
"completions/min_terminated_length": 1326.0,
"entropy": 0.3992190286517143,
"epoch": 0.00598,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0931684970855713,
"kl": 0.2717692907899618,
"learning_rate": 9.999871910331592e-06,
"loss": 0.0356,
"num_tokens": 15539177.0,
"reward": 0.006874997168779373,
"reward_std": 0.13145233690738678,
"rewards/rollout_reward_func/mean": 0.006874997168779373,
"rewards/rollout_reward_func/std": 0.227219358086586,
"sampling/importance_sampling_ratio/max": 1.9978420734405518,
"sampling/importance_sampling_ratio/mean": 1.1109166145324707,
"sampling/importance_sampling_ratio/min": 0.28789734840393066,
"sampling/sampling_logp_difference/max": 1.2725002765655518,
"sampling/sampling_logp_difference/mean": 0.04605109244585037,
"step": 299,
"step_time": 39.261059892996855
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.004185267956927419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010044642956927419,
"entropy": 0.4111786112189293,
"epoch": 0.006,
"grad_norm": 1.949076771736145,
"kl": 0.263056967407465,
"learning_rate": 9.999870934419434e-06,
"loss": 0.0301,
"step": 300,
"step_time": 8.51909788800549
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2695.0,
"completions/max_terminated_length": 2695.0,
"completions/mean_length": 2472.09375,
"completions/mean_terminated_length": 2472.09375,
"completions/min_length": 1107.0,
"completions/min_terminated_length": 1107.0,
"entropy": 0.3823564685881138,
"epoch": 0.00602,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9711835980415344,
"kl": 0.24662657268345356,
"learning_rate": 9.999869954803708e-06,
"loss": 0.0073,
"num_tokens": 15642001.0,
"reward": -0.02225000225007534,
"reward_std": 0.2383129745721817,
"rewards/rollout_reward_func/mean": -0.02225000225007534,
"rewards/rollout_reward_func/std": 0.34792760014533997,
"sampling/importance_sampling_ratio/max": 2.2050437927246094,
"sampling/importance_sampling_ratio/mean": 0.9682941436767578,
"sampling/importance_sampling_ratio/min": 0.21761596202850342,
"sampling/sampling_logp_difference/max": 0.6958246231079102,
"sampling/sampling_logp_difference/mean": 0.04852741211652756,
"step": 301,
"step_time": 39.94850182400842
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.3918728120625019,
"epoch": 0.00604,
"grad_norm": 1.0577794313430786,
"kl": 0.24203552678227425,
"learning_rate": 9.999868971484418e-06,
"loss": 0.0044,
"step": 302,
"step_time": 8.059815880988026
},
{
"clip_ratio/high_max": 0.010937500046566129,
"clip_ratio/high_mean": 0.005468750023283064,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007421875023283064,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2725.0,
"completions/max_terminated_length": 2725.0,
"completions/mean_length": 2496.375,
"completions/mean_terminated_length": 2498.806396484375,
"completions/min_length": 364.0,
"completions/min_terminated_length": 364.0,
"entropy": 0.4163655452430248,
"epoch": 0.00606,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3073053359985352,
"kl": 0.26006307639181614,
"learning_rate": 9.999867984461565e-06,
"loss": -0.0813,
"num_tokens": 15745188.0,
"reward": -0.010625004768371582,
"reward_std": 0.12208487838506699,
"rewards/rollout_reward_func/mean": -0.010625004768371582,
"rewards/rollout_reward_func/std": 0.20892871916294098,
"sampling/importance_sampling_ratio/max": 2.2923381328582764,
"sampling/importance_sampling_ratio/mean": 1.1277439594268799,
"sampling/importance_sampling_ratio/min": 0.47326311469078064,
"sampling/sampling_logp_difference/max": 0.8685226440429688,
"sampling/sampling_logp_difference/mean": 0.044995859265327454,
"step": 303,
"step_time": 40.56155824400048
},
{
"clip_ratio/high_max": 0.013654891401529312,
"clip_ratio/high_mean": 0.006827445700764656,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006827445700764656,
"entropy": 0.42382679879665375,
"epoch": 0.00608,
"grad_norm": 1.2154861688613892,
"kl": 0.26275861263275146,
"learning_rate": 9.999866993735148e-06,
"loss": -0.0856,
"step": 304,
"step_time": 8.188002009002957
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2720.0,
"completions/max_terminated_length": 2720.0,
"completions/mean_length": 2543.78125,
"completions/mean_terminated_length": 2543.78125,
"completions/min_length": 1320.0,
"completions/min_terminated_length": 1320.0,
"entropy": 0.4170003570616245,
"epoch": 0.0061,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9974532723426819,
"kl": 0.4499119780957699,
"learning_rate": 9.99986599930517e-06,
"loss": -0.0162,
"num_tokens": 15849934.0,
"reward": 0.06724999845027924,
"reward_std": 0.10333256423473358,
"rewards/rollout_reward_func/mean": 0.06724999845027924,
"rewards/rollout_reward_func/std": 0.19078291952610016,
"sampling/importance_sampling_ratio/max": 1.4692877531051636,
"sampling/importance_sampling_ratio/mean": 0.8283271789550781,
"sampling/importance_sampling_ratio/min": 0.20920588076114655,
"sampling/sampling_logp_difference/max": 1.1342777013778687,
"sampling/sampling_logp_difference/mean": 0.0569009929895401,
"step": 305,
"step_time": 40.22573580900644
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.42669572681188583,
"epoch": 0.00612,
"grad_norm": 0.966021716594696,
"kl": 0.4435073509812355,
"learning_rate": 9.999865001171628e-06,
"loss": -0.0204,
"step": 306,
"step_time": 9.38175861498894
},
{
"clip_ratio/high_max": 0.006623641354963183,
"clip_ratio/high_mean": 0.0033118206774815917,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0033118206774815917,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2688.0,
"completions/max_terminated_length": 2688.0,
"completions/mean_length": 2498.71875,
"completions/mean_terminated_length": 2498.71875,
"completions/min_length": 1621.0,
"completions/min_terminated_length": 1621.0,
"entropy": 0.4614461474120617,
"epoch": 0.00614,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7010865211486816,
"kl": 0.26754089444875717,
"learning_rate": 9.999863999334527e-06,
"loss": -0.0394,
"num_tokens": 15953702.0,
"reward": -0.018437497317790985,
"reward_std": 0.13519792258739471,
"rewards/rollout_reward_func/mean": -0.018437497317790985,
"rewards/rollout_reward_func/std": 0.22020678222179413,
"sampling/importance_sampling_ratio/max": 1.7836391925811768,
"sampling/importance_sampling_ratio/mean": 1.0035184621810913,
"sampling/importance_sampling_ratio/min": 0.5491339564323425,
"sampling/sampling_logp_difference/max": 0.7282247543334961,
"sampling/sampling_logp_difference/mean": 0.037387143820524216,
"step": 307,
"step_time": 39.49807976799639
},
{
"clip_ratio/high_max": 0.006623641354963183,
"clip_ratio/high_mean": 0.0033118206774815917,
"clip_ratio/low_mean": 0.0033735795877873898,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006685400381684303,
"entropy": 0.46664348989725113,
"epoch": 0.00616,
"grad_norm": 1.189290165901184,
"kl": 0.26736610010266304,
"learning_rate": 9.999862993793865e-06,
"loss": -0.0438,
"step": 308,
"step_time": 8.11864329600212
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2704.0,
"completions/max_terminated_length": 2704.0,
"completions/mean_length": 2507.71875,
"completions/mean_terminated_length": 2507.71875,
"completions/min_length": 1357.0,
"completions/min_terminated_length": 1357.0,
"entropy": 0.45517005771398544,
"epoch": 0.00618,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.906209111213684,
"kl": 1.1019243709743023,
"learning_rate": 9.999861984549646e-06,
"loss": 0.0762,
"num_tokens": 16057213.0,
"reward": 0.02031249925494194,
"reward_std": 0.07454593479633331,
"rewards/rollout_reward_func/mean": 0.02031249925494194,
"rewards/rollout_reward_func/std": 0.08433743566274643,
"sampling/importance_sampling_ratio/max": 2.71134352684021,
"sampling/importance_sampling_ratio/mean": 1.1158506870269775,
"sampling/importance_sampling_ratio/min": 0.40567341446876526,
"sampling/sampling_logp_difference/max": 0.7820019721984863,
"sampling/sampling_logp_difference/mean": 0.05114158242940903,
"step": 309,
"step_time": 39.111676544001966
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.45416729897260666,
"epoch": 0.0062,
"grad_norm": 1.8273950815200806,
"kl": 0.976007841527462,
"learning_rate": 9.99986097160187e-06,
"loss": 0.0695,
"step": 310,
"step_time": 8.081985718003125
},
{
"clip_ratio/high_max": 0.0033783784601837397,
"clip_ratio/high_mean": 0.0016891892300918698,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00364231423009187,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2718.0,
"completions/max_terminated_length": 2718.0,
"completions/mean_length": 2470.25,
"completions/mean_terminated_length": 2470.25,
"completions/min_length": 1356.0,
"completions/min_terminated_length": 1356.0,
"entropy": 0.47122444957494736,
"epoch": 0.00622,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.186075210571289,
"kl": 0.27417293563485146,
"learning_rate": 9.999859954950535e-06,
"loss": -0.0863,
"num_tokens": 16159452.0,
"reward": 0.0234375,
"reward_std": 0.0810212641954422,
"rewards/rollout_reward_func/mean": 0.0234375,
"rewards/rollout_reward_func/std": 0.09953470528125763,
"sampling/importance_sampling_ratio/max": 1.6803293228149414,
"sampling/importance_sampling_ratio/mean": 0.907636821269989,
"sampling/importance_sampling_ratio/min": 0.40213456749916077,
"sampling/sampling_logp_difference/max": 0.9505100250244141,
"sampling/sampling_logp_difference/mean": 0.048430029302835464,
"step": 311,
"step_time": 38.429599290000624
},
{
"clip_ratio/high_max": 0.006756756920367479,
"clip_ratio/high_mean": 0.0033783784601837397,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00728462846018374,
"entropy": 0.4736868105828762,
"epoch": 0.00624,
"grad_norm": 1.6169250011444092,
"kl": 0.2743623908609152,
"learning_rate": 9.999858934595648e-06,
"loss": -0.0915,
"step": 312,
"step_time": 9.370438503989135
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2758.0,
"completions/max_terminated_length": 2758.0,
"completions/mean_length": 2513.5625,
"completions/mean_terminated_length": 2513.5625,
"completions/min_length": 1564.0,
"completions/min_terminated_length": 1564.0,
"entropy": 0.4444139339029789,
"epoch": 0.00626,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9361081719398499,
"kl": 0.29342188127338886,
"learning_rate": 9.999857910537204e-06,
"loss": -0.0681,
"num_tokens": 16263096.0,
"reward": 0.0018750019371509552,
"reward_std": 0.11465057730674744,
"rewards/rollout_reward_func/mean": 0.0018750019371509552,
"rewards/rollout_reward_func/std": 0.18235798180103302,
"sampling/importance_sampling_ratio/max": 1.9316868782043457,
"sampling/importance_sampling_ratio/mean": 0.9139528274536133,
"sampling/importance_sampling_ratio/min": 0.3404541313648224,
"sampling/sampling_logp_difference/max": 0.85491943359375,
"sampling/sampling_logp_difference/mean": 0.04442521184682846,
"step": 313,
"step_time": 40.23155441199924
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.44232723861932755,
"epoch": 0.00628,
"grad_norm": 0.871970534324646,
"kl": 0.30031659826636314,
"learning_rate": 9.999856882775207e-06,
"loss": -0.0709,
"step": 314,
"step_time": 8.274247516004834
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2775.0,
"completions/max_terminated_length": 2775.0,
"completions/mean_length": 2577.53125,
"completions/mean_terminated_length": 2577.53125,
"completions/min_length": 2379.0,
"completions/min_terminated_length": 2379.0,
"entropy": 0.45178864896297455,
"epoch": 0.0063,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0735334157943726,
"kl": 0.28461507335305214,
"learning_rate": 9.999855851309658e-06,
"loss": 0.0263,
"num_tokens": 16369024.0,
"reward": 0.028999999165534973,
"reward_std": 0.05209977924823761,
"rewards/rollout_reward_func/mean": 0.028999999165534973,
"rewards/rollout_reward_func/std": 0.0565662607550621,
"sampling/importance_sampling_ratio/max": 1.7128064632415771,
"sampling/importance_sampling_ratio/mean": 1.0174338817596436,
"sampling/importance_sampling_ratio/min": 0.26667365431785583,
"sampling/sampling_logp_difference/max": 0.8575940132141113,
"sampling/sampling_logp_difference/mean": 0.04983839392662048,
"step": 315,
"step_time": 42.039271956993616
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.4506670571863651,
"epoch": 0.00632,
"grad_norm": 1.147428274154663,
"kl": 0.28853642009198666,
"learning_rate": 9.999854816140558e-06,
"loss": 0.0242,
"step": 316,
"step_time": 8.311607102994458
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2640.0,
"completions/max_terminated_length": 2640.0,
"completions/mean_length": 2452.21875,
"completions/mean_terminated_length": 2452.21875,
"completions/min_length": 600.0,
"completions/min_terminated_length": 600.0,
"entropy": 0.43626048415899277,
"epoch": 0.00634,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.334524393081665,
"kl": 0.26437416300177574,
"learning_rate": 9.999853777267907e-06,
"loss": -0.0384,
"num_tokens": 16470702.0,
"reward": 0.012187500484287739,
"reward_std": 0.11599558591842651,
"rewards/rollout_reward_func/mean": 0.012187500484287739,
"rewards/rollout_reward_func/std": 0.19708740711212158,
"sampling/importance_sampling_ratio/max": 1.745671033859253,
"sampling/importance_sampling_ratio/mean": 1.029480218887329,
"sampling/importance_sampling_ratio/min": 0.3974767029285431,
"sampling/sampling_logp_difference/max": 0.8084192276000977,
"sampling/sampling_logp_difference/mean": 0.046191953122615814,
"step": 317,
"step_time": 40.42614766400948
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0024038462433964014,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004356971010565758,
"entropy": 0.4329594671726227,
"epoch": 0.00636,
"grad_norm": 1.2568414211273193,
"kl": 0.2660627197474241,
"learning_rate": 9.999852734691707e-06,
"loss": -0.0389,
"step": 318,
"step_time": 8.729695830006676
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2713.0,
"completions/max_terminated_length": 2713.0,
"completions/mean_length": 2463.28125,
"completions/mean_terminated_length": 2459.48388671875,
"completions/min_length": 414.0,
"completions/min_terminated_length": 414.0,
"entropy": 0.4093780815601349,
"epoch": 0.00638,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9679552316665649,
"kl": 0.25920648127794266,
"learning_rate": 9.999851688411959e-06,
"loss": -0.1556,
"num_tokens": 16573266.0,
"reward": -0.044062502682209015,
"reward_std": 0.18410122394561768,
"rewards/rollout_reward_func/mean": -0.044062502682209015,
"rewards/rollout_reward_func/std": 0.27064865827560425,
"sampling/importance_sampling_ratio/max": 1.572900414466858,
"sampling/importance_sampling_ratio/mean": 0.9186617136001587,
"sampling/importance_sampling_ratio/min": 0.25353381037712097,
"sampling/sampling_logp_difference/max": 0.6579174995422363,
"sampling/sampling_logp_difference/mean": 0.040446434170007706,
"step": 319,
"step_time": 38.13957930600009
},
{
"clip_ratio/high_max": 0.006623641354963183,
"clip_ratio/high_mean": 0.0033118206774815917,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0033118206774815917,
"entropy": 0.40444113314151764,
"epoch": 0.0064,
"grad_norm": 0.9774654507637024,
"kl": 0.2562381671741605,
"learning_rate": 9.999850638428662e-06,
"loss": -0.158,
"step": 320,
"step_time": 8.146959678997518
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3095.0,
"completions/max_terminated_length": 3095.0,
"completions/mean_length": 2866.9375,
"completions/mean_terminated_length": 2866.9375,
"completions/min_length": 1676.0,
"completions/min_terminated_length": 1676.0,
"entropy": 0.40554892271757126,
"epoch": 0.00642,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.0908348560333252,
"kl": 0.2836337350308895,
"learning_rate": 9.99984958474182e-06,
"loss": -0.1248,
"num_tokens": 16688566.0,
"reward": -0.08843749761581421,
"reward_std": 0.16875624656677246,
"rewards/rollout_reward_func/mean": -0.08843749761581421,
"rewards/rollout_reward_func/std": 0.2703416049480438,
"sampling/importance_sampling_ratio/max": 2.3541653156280518,
"sampling/importance_sampling_ratio/mean": 0.9503089189529419,
"sampling/importance_sampling_ratio/min": 0.2944200336933136,
"sampling/sampling_logp_difference/max": 0.7107000350952148,
"sampling/sampling_logp_difference/mean": 0.045195046812295914,
"step": 321,
"step_time": 41.59577051900851
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005642361124046147,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005642361124046147,
"entropy": 0.4032099135220051,
"epoch": 0.00644,
"grad_norm": 0.9083360433578491,
"kl": 0.2763382289558649,
"learning_rate": 9.999848527351434e-06,
"loss": -0.1272,
"step": 322,
"step_time": 9.127111081004841
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3129.0,
"completions/max_terminated_length": 3129.0,
"completions/mean_length": 2868.71875,
"completions/mean_terminated_length": 2868.71875,
"completions/min_length": 2418.0,
"completions/min_terminated_length": 2418.0,
"entropy": 0.4014575108885765,
"epoch": 0.00646,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.3745849132537842,
"kl": 0.3217862006276846,
"learning_rate": 9.999847466257501e-06,
"loss": -0.0052,
"num_tokens": 16803971.0,
"reward": 0.03343750163912773,
"reward_std": 0.08887212723493576,
"rewards/rollout_reward_func/mean": 0.03343750163912773,
"rewards/rollout_reward_func/std": 0.18239639699459076,
"sampling/importance_sampling_ratio/max": 2.078562021255493,
"sampling/importance_sampling_ratio/mean": 1.028127670288086,
"sampling/importance_sampling_ratio/min": 0.30963262915611267,
"sampling/sampling_logp_difference/max": 0.856985330581665,
"sampling/sampling_logp_difference/mean": 0.04022815078496933,
"step": 323,
"step_time": 42.87567100799788
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.4006517715752125,
"epoch": 0.00648,
"grad_norm": 1.3022223711013794,
"kl": 0.32581575214862823,
"learning_rate": 9.999846401460027e-06,
"loss": -0.01,
"step": 324,
"step_time": 9.157112452005094
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3087.0,
"completions/max_terminated_length": 3087.0,
"completions/mean_length": 2870.625,
"completions/mean_terminated_length": 2870.625,
"completions/min_length": 1981.0,
"completions/min_terminated_length": 1981.0,
"entropy": 0.39831674844026566,
"epoch": 0.0065,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.0668267011642456,
"kl": 0.2798405773937702,
"learning_rate": 9.999845332959009e-06,
"loss": -0.1367,
"num_tokens": 16919173.0,
"reward": -0.03593750298023224,
"reward_std": 0.10618676245212555,
"rewards/rollout_reward_func/mean": -0.03593750298023224,
"rewards/rollout_reward_func/std": 0.2160026729106903,
"sampling/importance_sampling_ratio/max": 2.6769375801086426,
"sampling/importance_sampling_ratio/mean": 1.0329153537750244,
"sampling/importance_sampling_ratio/min": 0.25040295720100403,
"sampling/sampling_logp_difference/max": 0.8099632263183594,
"sampling/sampling_logp_difference/mean": 0.047026216983795166,
"step": 325,
"step_time": 42.815426508008386
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005310457549057901,
"entropy": 0.39466894418001175,
"epoch": 0.00652,
"grad_norm": 1.083456039428711,
"kl": 0.2784327268600464,
"learning_rate": 9.999844260754452e-06,
"loss": -0.1412,
"step": 326,
"step_time": 9.105896988992754
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3089.0,
"completions/max_terminated_length": 3089.0,
"completions/mean_length": 2871.90625,
"completions/mean_terminated_length": 2871.90625,
"completions/min_length": 2233.0,
"completions/min_terminated_length": 2233.0,
"entropy": 0.46241355314850807,
"epoch": 0.00654,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.365126132965088,
"kl": 0.4783841446042061,
"learning_rate": 9.999843184846355e-06,
"loss": 0.0579,
"num_tokens": 17034934.0,
"reward": -0.004687502980232239,
"reward_std": 0.2152300775051117,
"rewards/rollout_reward_func/mean": -0.004687502980232239,
"rewards/rollout_reward_func/std": 0.3430412709712982,
"sampling/importance_sampling_ratio/max": 2.176283121109009,
"sampling/importance_sampling_ratio/mean": 0.9271606802940369,
"sampling/importance_sampling_ratio/min": 0.1929233819246292,
"sampling/sampling_logp_difference/max": 1.0973834991455078,
"sampling/sampling_logp_difference/mean": 0.04590904712677002,
"step": 327,
"step_time": 40.807620658990345
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"entropy": 0.4616936966776848,
"epoch": 0.00656,
"grad_norm": 1.2530312538146973,
"kl": 0.4540918581187725,
"learning_rate": 9.999842105234718e-06,
"loss": 0.0566,
"step": 328,
"step_time": 9.13476190099027
},
{
"clip_ratio/high_max": 0.0027173913549631834,
"clip_ratio/high_mean": 0.0013586956774815917,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0013586956774815917,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3089.0,
"completions/max_terminated_length": 3089.0,
"completions/mean_length": 2894.5,
"completions/mean_terminated_length": 2894.5,
"completions/min_length": 2710.0,
"completions/min_terminated_length": 2710.0,
"entropy": 0.4044684022665024,
"epoch": 0.00658,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.2801345586776733,
"kl": 0.29569943621754646,
"learning_rate": 9.999841021919543e-06,
"loss": 0.028,
"num_tokens": 17150980.0,
"reward": 0.008749999105930328,
"reward_std": 0.033390406519174576,
"rewards/rollout_reward_func/mean": 0.008749999105930328,
"rewards/rollout_reward_func/std": 0.03669842332601547,
"sampling/importance_sampling_ratio/max": 1.9134129285812378,
"sampling/importance_sampling_ratio/mean": 0.9784555435180664,
"sampling/importance_sampling_ratio/min": 7.799136336750223e-10,
"sampling/sampling_logp_difference/max": 9.550741195678711,
"sampling/sampling_logp_difference/mean": 0.07563318312168121,
"step": 329,
"step_time": 42.932895781996194
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.004151570028625429,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0058876811526715755,
"entropy": 0.40519747510552406,
"epoch": 0.0066,
"grad_norm": 1.2309134006500244,
"kl": 0.3003210015594959,
"learning_rate": 9.999839934900832e-06,
"loss": 0.0236,
"step": 330,
"step_time": 9.125532052996277
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3089.0,
"completions/max_terminated_length": 3089.0,
"completions/mean_length": 2796.0625,
"completions/mean_terminated_length": 2796.0625,
"completions/min_length": 1274.0,
"completions/min_terminated_length": 1274.0,
"entropy": 0.38685447722673416,
"epoch": 0.00662,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.919028639793396,
"kl": 0.37493203580379486,
"learning_rate": 9.999838844178584e-06,
"loss": -0.0388,
"num_tokens": 17263747.0,
"reward": -0.0508125014603138,
"reward_std": 0.14278432726860046,
"rewards/rollout_reward_func/mean": -0.0508125014603138,
"rewards/rollout_reward_func/std": 0.251789391040802,
"sampling/importance_sampling_ratio/max": 1.6912277936935425,
"sampling/importance_sampling_ratio/mean": 0.9828731417655945,
"sampling/importance_sampling_ratio/min": 0.2559169828891754,
"sampling/sampling_logp_difference/max": 1.3397047519683838,
"sampling/sampling_logp_difference/mean": 0.04227147251367569,
"step": 331,
"step_time": 42.20254211300198
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"entropy": 0.38628678396344185,
"epoch": 0.00664,
"grad_norm": 0.8906185030937195,
"kl": 0.354989618062973,
"learning_rate": 9.999837749752804e-06,
"loss": -0.0404,
"step": 332,
"step_time": 9.063552910993167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0016447368543595076,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016447368543595076,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3035.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 2776.28125,
"completions/mean_terminated_length": 2776.28125,
"completions/min_length": 1117.0,
"completions/min_terminated_length": 1117.0,
"entropy": 0.415100060403347,
"epoch": 0.00666,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.906931757926941,
"kl": 0.371683020144701,
"learning_rate": 9.999836651623489e-06,
"loss": 0.0841,
"num_tokens": 17375797.0,
"reward": -0.04500000178813934,
"reward_std": 0.11221498996019363,
"rewards/rollout_reward_func/mean": -0.04500000178813934,
"rewards/rollout_reward_func/std": 0.19660012423992157,
"sampling/importance_sampling_ratio/max": 2.257512331008911,
"sampling/importance_sampling_ratio/mean": 0.9947605729103088,
"sampling/importance_sampling_ratio/min": 0.2314082235097885,
"sampling/sampling_logp_difference/max": 0.7822046279907227,
"sampling/sampling_logp_difference/mean": 0.04253339767456055,
"step": 333,
"step_time": 41.125564955997106
},
{
"clip_ratio/high_max": 0.006850600708276033,
"clip_ratio/high_mean": 0.0034253003541380167,
"clip_ratio/low_mean": 0.0032986111473292112,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006723911501467228,
"entropy": 0.4193525053560734,
"epoch": 0.00668,
"grad_norm": 2.091108560562134,
"kl": 0.3612441271543503,
"learning_rate": 9.99983554979064e-06,
"loss": 0.0828,
"step": 334,
"step_time": 10.233171193001908
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3031.0,
"completions/max_terminated_length": 3031.0,
"completions/mean_length": 2885.46875,
"completions/mean_terminated_length": 2884.61279296875,
"completions/min_length": 1654.0,
"completions/min_terminated_length": 1654.0,
"entropy": 0.4298589825630188,
"epoch": 0.0067,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.805285096168518,
"kl": 0.5581814311444759,
"learning_rate": 9.999834444254261e-06,
"loss": -0.0295,
"num_tokens": 17491709.0,
"reward": 0.045625001192092896,
"reward_std": 0.13078023493289948,
"rewards/rollout_reward_func/mean": 0.045625001192092896,
"rewards/rollout_reward_func/std": 0.1976301074028015,
"sampling/importance_sampling_ratio/max": 2.2158279418945312,
"sampling/importance_sampling_ratio/mean": 1.0222558975219727,
"sampling/importance_sampling_ratio/min": 0.21492932736873627,
"sampling/sampling_logp_difference/max": 1.4235153198242188,
"sampling/sampling_logp_difference/mean": 0.043566472828388214,
"step": 335,
"step_time": 41.51342618201306
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"entropy": 0.4308301880955696,
"epoch": 0.00672,
"grad_norm": 1.9020830392837524,
"kl": 0.4618966430425644,
"learning_rate": 9.999833335014352e-06,
"loss": -0.0332,
"step": 336,
"step_time": 9.007301712008484
},
{
"clip_ratio/high_max": 0.0033783784601837397,
"clip_ratio/high_mean": 0.0016891892300918698,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016891892300918698,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3094.0,
"completions/max_terminated_length": 3094.0,
"completions/mean_length": 2867.5,
"completions/mean_terminated_length": 2867.5,
"completions/min_length": 2390.0,
"completions/min_terminated_length": 2390.0,
"entropy": 0.36968783289194107,
"epoch": 0.00674,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1672717332839966,
"kl": 0.30326209031045437,
"learning_rate": 9.999832222070915e-06,
"loss": -0.0387,
"num_tokens": 17606576.0,
"reward": 0.048750001937150955,
"reward_std": 0.10282598435878754,
"rewards/rollout_reward_func/mean": 0.048750001937150955,
"rewards/rollout_reward_func/std": 0.21335643529891968,
"sampling/importance_sampling_ratio/max": 2.6462104320526123,
"sampling/importance_sampling_ratio/mean": 0.9777675271034241,
"sampling/importance_sampling_ratio/min": 0.2115698605775833,
"sampling/sampling_logp_difference/max": 0.7081310749053955,
"sampling/sampling_logp_difference/mean": 0.04413309693336487,
"step": 337,
"step_time": 41.790469111991115
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.01215277798473835,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.01736111135687679,
"entropy": 0.3639492504298687,
"epoch": 0.00676,
"grad_norm": 0.9901954531669617,
"kl": 0.32282317988574505,
"learning_rate": 9.999831105423947e-06,
"loss": -0.0425,
"step": 338,
"step_time": 9.130401213995356
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3035.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 2874.03125,
"completions/mean_terminated_length": 2872.10009765625,
"completions/min_length": 2728.0,
"completions/min_terminated_length": 2728.0,
"entropy": 0.44732575863599777,
"epoch": 0.00678,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.9907981157302856,
"kl": 0.34680038318037987,
"learning_rate": 9.999829985073454e-06,
"loss": -0.0037,
"num_tokens": 17722240.0,
"reward": 0.019687499850988388,
"reward_std": 0.04209454730153084,
"rewards/rollout_reward_func/mean": 0.019687499850988388,
"rewards/rollout_reward_func/std": 0.06765588372945786,
"sampling/importance_sampling_ratio/max": 2.3998336791992188,
"sampling/importance_sampling_ratio/mean": 1.0840644836425781,
"sampling/importance_sampling_ratio/min": 0.3234512209892273,
"sampling/sampling_logp_difference/max": 0.5507974624633789,
"sampling/sampling_logp_difference/mean": 0.045105550438165665,
"step": 339,
"step_time": 43.570160997980565
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.007708333316259086,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007708333316259086,
"entropy": 0.43794915825128555,
"epoch": 0.0068,
"grad_norm": 0.9109601974487305,
"kl": 0.39482543990015984,
"learning_rate": 9.999828861019437e-06,
"loss": -0.0068,
"step": 340,
"step_time": 9.491892766018282
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3138.0,
"completions/max_terminated_length": 3138.0,
"completions/mean_length": 2891.6875,
"completions/mean_terminated_length": 2891.6875,
"completions/min_length": 2728.0,
"completions/min_terminated_length": 2728.0,
"entropy": 0.37105344980955124,
"epoch": 0.00682,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.1111348867416382,
"kl": 0.31960206665098667,
"learning_rate": 9.999827733261892e-06,
"loss": -0.021,
"num_tokens": 17838273.0,
"reward": 0.015625,
"reward_std": 0.04219720885157585,
"rewards/rollout_reward_func/mean": 0.015625,
"rewards/rollout_reward_func/std": 0.0623692162334919,
"sampling/importance_sampling_ratio/max": 2.786299228668213,
"sampling/importance_sampling_ratio/mean": 1.0912487506866455,
"sampling/importance_sampling_ratio/min": 0.47787514328956604,
"sampling/sampling_logp_difference/max": 0.7863068580627441,
"sampling/sampling_logp_difference/mean": 0.0443718284368515,
"step": 341,
"step_time": 43.160379425004066
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.010416666744276881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012152777868323028,
"entropy": 0.36442771926522255,
"epoch": 0.00684,
"grad_norm": 0.9744181036949158,
"kl": 0.35923222079873085,
"learning_rate": 9.999826601800824e-06,
"loss": -0.0279,
"step": 342,
"step_time": 9.232285209007387
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3061.0,
"completions/max_terminated_length": 3061.0,
"completions/mean_length": 2847.1875,
"completions/mean_terminated_length": 2850.9677734375,
"completions/min_length": 1435.0,
"completions/min_terminated_length": 1435.0,
"entropy": 0.36714230850338936,
"epoch": 0.00686,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.2039743661880493,
"kl": 0.48806050047278404,
"learning_rate": 9.999825466636233e-06,
"loss": 0.0756,
"num_tokens": 17953390.0,
"reward": -0.02656250260770321,
"reward_std": 0.09342575073242188,
"rewards/rollout_reward_func/mean": -0.02656250260770321,
"rewards/rollout_reward_func/std": 0.19873161613941193,
"sampling/importance_sampling_ratio/max": 2.302706718444824,
"sampling/importance_sampling_ratio/mean": 1.0160350799560547,
"sampling/importance_sampling_ratio/min": 0.3926359713077545,
"sampling/sampling_logp_difference/max": 0.729058027267456,
"sampling/sampling_logp_difference/mean": 0.04609353467822075,
"step": 343,
"step_time": 41.09620379800617
},
{
"clip_ratio/high_max": 0.005972222192212939,
"clip_ratio/high_mean": 0.0029861110961064696,
"clip_ratio/low_mean": 0.008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011666666832752526,
"entropy": 0.3644689805805683,
"epoch": 0.00688,
"grad_norm": 1.2154171466827393,
"kl": 0.5221099816262722,
"learning_rate": 9.999824327768121e-06,
"loss": 0.0713,
"step": 344,
"step_time": 9.09271458301373
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3139.0,
"completions/max_terminated_length": 3139.0,
"completions/mean_length": 2892.75,
"completions/mean_terminated_length": 2892.75,
"completions/min_length": 2692.0,
"completions/min_terminated_length": 2692.0,
"entropy": 0.36128484085202217,
"epoch": 0.0069,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.27363920211792,
"kl": 0.35059909522533417,
"learning_rate": 9.99982318519649e-06,
"loss": 0.0944,
"num_tokens": 18069396.0,
"reward": 0.023749999701976776,
"reward_std": 0.04824655503034592,
"rewards/rollout_reward_func/mean": 0.023749999701976776,
"rewards/rollout_reward_func/std": 0.05993275344371796,
"sampling/importance_sampling_ratio/max": 2.097679615020752,
"sampling/importance_sampling_ratio/mean": 1.0275201797485352,
"sampling/importance_sampling_ratio/min": 0.33659499883651733,
"sampling/sampling_logp_difference/max": 0.8113938570022583,
"sampling/sampling_logp_difference/mean": 0.04761318117380142,
"step": 345,
"step_time": 44.28958571599651
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"entropy": 0.3594363294541836,
"epoch": 0.00692,
"grad_norm": 1.2742985486984253,
"kl": 0.36022347025573254,
"learning_rate": 9.999822038921339e-06,
"loss": 0.0911,
"step": 346,
"step_time": 9.229209993995028
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3096.0,
"completions/max_terminated_length": 3096.0,
"completions/mean_length": 2891.53125,
"completions/mean_terminated_length": 2903.419189453125,
"completions/min_length": 2523.0,
"completions/min_terminated_length": 2743.0,
"entropy": 0.4008607342839241,
"epoch": 0.00694,
"frac_reward_zero_std": 0.125,
"grad_norm": 6.8911871910095215,
"kl": 0.4811227209866047,
"learning_rate": 9.99982088894267e-06,
"loss": 1.0538,
"num_tokens": 18185863.0,
"reward": 0.0018749996088445187,
"reward_std": 0.053620822727680206,
"rewards/rollout_reward_func/mean": 0.0018749996088445187,
"rewards/rollout_reward_func/std": 0.07095830142498016,
"sampling/importance_sampling_ratio/max": 2.899974822998047,
"sampling/importance_sampling_ratio/mean": 1.0153355598449707,
"sampling/importance_sampling_ratio/min": 0.29657140374183655,
"sampling/sampling_logp_difference/max": 1.0978436470031738,
"sampling/sampling_logp_difference/mean": 0.05201619863510132,
"step": 347,
"step_time": 42.26013625401538
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.003906250116415322,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0073784723645076156,
"entropy": 0.39887847751379013,
"epoch": 0.00696,
"grad_norm": 5.855113506317139,
"kl": 0.4611879512667656,
"learning_rate": 9.999819735260483e-06,
"loss": 1.045,
"step": 348,
"step_time": 9.195489862002432
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3180.0,
"completions/max_terminated_length": 3180.0,
"completions/mean_length": 2724.71875,
"completions/mean_terminated_length": 2724.71875,
"completions/min_length": 588.0,
"completions/min_terminated_length": 588.0,
"entropy": 0.340648140758276,
"epoch": 0.00698,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6283471584320068,
"kl": 0.41364636458456516,
"learning_rate": 9.999818577874782e-06,
"loss": -0.0522,
"num_tokens": 18296679.0,
"reward": -0.017812497913837433,
"reward_std": 0.21186214685440063,
"rewards/rollout_reward_func/mean": -0.017812497913837433,
"rewards/rollout_reward_func/std": 0.3267013728618622,
"sampling/importance_sampling_ratio/max": 2.397644519805908,
"sampling/importance_sampling_ratio/mean": 1.040598750114441,
"sampling/importance_sampling_ratio/min": 0.2771419286727905,
"sampling/sampling_logp_difference/max": 0.5928010940551758,
"sampling/sampling_logp_difference/mean": 0.043335799127817154,
"step": 349,
"step_time": 40.2587990250031
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.007068452658131719,
"clip_ratio/low_min": 0.004464285913854837,
"clip_ratio/region_mean": 0.014012897154316306,
"entropy": 0.342149056494236,
"epoch": 0.007,
"grad_norm": 1.0604374408721924,
"kl": 0.4015281666070223,
"learning_rate": 9.999817416785565e-06,
"loss": -0.0561,
"step": 350,
"step_time": 9.21041988900106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3111.0,
"completions/max_terminated_length": 3111.0,
"completions/mean_length": 2883.34375,
"completions/mean_terminated_length": 2883.34375,
"completions/min_length": 2304.0,
"completions/min_terminated_length": 2304.0,
"entropy": 0.3668610565364361,
"epoch": 0.00702,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3156640529632568,
"kl": 0.5218501426279545,
"learning_rate": 9.999816251992836e-06,
"loss": -0.1535,
"num_tokens": 18412354.0,
"reward": -0.0028124996460974216,
"reward_std": 0.030912719666957855,
"rewards/rollout_reward_func/mean": -0.0028124996460974216,
"rewards/rollout_reward_func/std": 0.03656495362520218,
"sampling/importance_sampling_ratio/max": 1.953458309173584,
"sampling/importance_sampling_ratio/mean": 0.966667115688324,
"sampling/importance_sampling_ratio/min": 0.17398038506507874,
"sampling/sampling_logp_difference/max": 1.056915521621704,
"sampling/sampling_logp_difference/mean": 0.044610172510147095,
"step": 351,
"step_time": 44.04102398098621
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666744276881,
"entropy": 0.36827195063233376,
"epoch": 0.00704,
"grad_norm": 1.2125022411346436,
"kl": 0.5398768447339535,
"learning_rate": 9.999815083496593e-06,
"loss": -0.1541,
"step": 352,
"step_time": 9.19050597500609
},
{
"clip_ratio/high_max": 0.001700680237263441,
"clip_ratio/high_mean": 0.0008503401186317205,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0025864513590931892,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3061.0,
"completions/max_terminated_length": 3061.0,
"completions/mean_length": 2861.5,
"completions/mean_terminated_length": 2857.633544921875,
"completions/min_length": 1941.0,
"completions/min_terminated_length": 1941.0,
"entropy": 0.45415887236595154,
"epoch": 0.00706,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.05373477935791,
"kl": 0.484494686126709,
"learning_rate": 9.99981391129684e-06,
"loss": -0.2377,
"num_tokens": 18527528.0,
"reward": 0.007187499664723873,
"reward_std": 0.04172711446881294,
"rewards/rollout_reward_func/mean": 0.007187499664723873,
"rewards/rollout_reward_func/std": 0.061760954558849335,
"sampling/importance_sampling_ratio/max": 2.0689685344696045,
"sampling/importance_sampling_ratio/mean": 1.0267624855041504,
"sampling/importance_sampling_ratio/min": 0.4079192876815796,
"sampling/sampling_logp_difference/max": 1.1987524032592773,
"sampling/sampling_logp_difference/mean": 0.046385399997234344,
"step": 353,
"step_time": 42.472357441991335
},
{
"clip_ratio/high_max": 0.006779738934710622,
"clip_ratio/high_mean": 0.003389869467355311,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003389869467355311,
"entropy": 0.4550458900630474,
"epoch": 0.00708,
"grad_norm": 1.9996660947799683,
"kl": 0.4796774350106716,
"learning_rate": 9.999812735393578e-06,
"loss": -0.2464,
"step": 354,
"step_time": 9.066415241984942
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 2909.34375,
"completions/mean_terminated_length": 2909.67724609375,
"completions/min_length": 2728.0,
"completions/min_terminated_length": 2728.0,
"entropy": 0.36910005658864975,
"epoch": 0.0071,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.3549381494522095,
"kl": 0.41893161833286285,
"learning_rate": 9.999811555786805e-06,
"loss": -0.1236,
"num_tokens": 18644345.0,
"reward": 0.009687500074505806,
"reward_std": 0.0260639488697052,
"rewards/rollout_reward_func/mean": 0.009687500074505806,
"rewards/rollout_reward_func/std": 0.03306731954216957,
"sampling/importance_sampling_ratio/max": 1.8643220663070679,
"sampling/importance_sampling_ratio/mean": 1.0049974918365479,
"sampling/importance_sampling_ratio/min": 5.805537139867252e-22,
"sampling/sampling_logp_difference/max": 15.75226879119873,
"sampling/sampling_logp_difference/mean": 0.11400792747735977,
"step": 355,
"step_time": 42.56496216600499
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"entropy": 0.367544986307621,
"epoch": 0.00712,
"grad_norm": 1.193587303161621,
"kl": 0.41215749084949493,
"learning_rate": 9.999810372476526e-06,
"loss": -0.1262,
"step": 356,
"step_time": 9.78580809700361
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0015625000232830644,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015625000232830644,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3050.0,
"completions/max_terminated_length": 3050.0,
"completions/mean_length": 2820.96875,
"completions/mean_terminated_length": 2820.96875,
"completions/min_length": 1553.0,
"completions/min_terminated_length": 1553.0,
"entropy": 0.38109608739614487,
"epoch": 0.00714,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.147459626197815,
"kl": 0.3814978711307049,
"learning_rate": 9.99980918546274e-06,
"loss": 0.0172,
"num_tokens": 18758190.0,
"reward": -0.019999999552965164,
"reward_std": 0.05093258619308472,
"rewards/rollout_reward_func/mean": -0.019999999552965164,
"rewards/rollout_reward_func/std": 0.07330889254808426,
"sampling/importance_sampling_ratio/max": 1.8108247518539429,
"sampling/importance_sampling_ratio/mean": 1.094908356666565,
"sampling/importance_sampling_ratio/min": 0.24349598586559296,
"sampling/sampling_logp_difference/max": 0.6769726276397705,
"sampling/sampling_logp_difference/mean": 0.04435478150844574,
"step": 357,
"step_time": 42.417569707999064
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"entropy": 0.3823031932115555,
"epoch": 0.00716,
"grad_norm": 1.1464769840240479,
"kl": 0.36772672832012177,
"learning_rate": 9.999807994745449e-06,
"loss": 0.0154,
"step": 358,
"step_time": 9.048302499992133
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3050.0,
"completions/max_terminated_length": 3050.0,
"completions/mean_length": 2754.90625,
"completions/mean_terminated_length": 2754.90625,
"completions/min_length": 664.0,
"completions/min_terminated_length": 664.0,
"entropy": 0.3804786093533039,
"epoch": 0.00718,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8674813508987427,
"kl": 0.7694353275001049,
"learning_rate": 9.999806800324652e-06,
"loss": -0.1264,
"num_tokens": 18869464.0,
"reward": -0.08562499284744263,
"reward_std": 0.26934367418289185,
"rewards/rollout_reward_func/mean": -0.08562499284744263,
"rewards/rollout_reward_func/std": 0.41496941447257996,
"sampling/importance_sampling_ratio/max": 1.7930198907852173,
"sampling/importance_sampling_ratio/mean": 0.8733890056610107,
"sampling/importance_sampling_ratio/min": 0.1389148235321045,
"sampling/sampling_logp_difference/max": 2.197103500366211,
"sampling/sampling_logp_difference/mean": 0.057078905403614044,
"step": 359,
"step_time": 40.268112329998985
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.003574346425011754,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005310457549057901,
"entropy": 0.3829173110425472,
"epoch": 0.0072,
"grad_norm": 1.289261817932129,
"kl": 0.8045071884989738,
"learning_rate": 9.999805602200355e-06,
"loss": -0.1284,
"step": 360,
"step_time": 9.023047832975863
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3122.0,
"completions/max_terminated_length": 3122.0,
"completions/mean_length": 2811.6875,
"completions/mean_terminated_length": 2811.6875,
"completions/min_length": 2029.0,
"completions/min_terminated_length": 2029.0,
"entropy": 0.37365251034498215,
"epoch": 0.00722,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.1524171829223633,
"kl": 0.3612435422837734,
"learning_rate": 9.999804400372553e-06,
"loss": -0.0505,
"num_tokens": 18983136.0,
"reward": -0.009312499314546585,
"reward_std": 0.0296157393604517,
"rewards/rollout_reward_func/mean": -0.009312499314546585,
"rewards/rollout_reward_func/std": 0.05717795714735985,
"sampling/importance_sampling_ratio/max": 1.9707398414611816,
"sampling/importance_sampling_ratio/mean": 1.0513246059417725,
"sampling/importance_sampling_ratio/min": 0.38958939909935,
"sampling/sampling_logp_difference/max": 0.6325550079345703,
"sampling/sampling_logp_difference/mean": 0.042889274656772614,
"step": 361,
"step_time": 40.995919900989975
},
{
"clip_ratio/high_max": 0.01736111124046147,
"clip_ratio/high_mean": 0.008680555620230734,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013888888992369175,
"entropy": 0.377502653747797,
"epoch": 0.00724,
"grad_norm": 1.0273391008377075,
"kl": 0.34734607115387917,
"learning_rate": 9.999803194841253e-06,
"loss": -0.0535,
"step": 362,
"step_time": 10.375120690987387
},
{
"clip_ratio/high_max": 0.006850600708276033,
"clip_ratio/high_mean": 0.0034253003541380167,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005161411478184164,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3006.0,
"completions/max_terminated_length": 3006.0,
"completions/mean_length": 2882.75,
"completions/mean_terminated_length": 2882.75,
"completions/min_length": 2511.0,
"completions/min_terminated_length": 2511.0,
"entropy": 0.36708877235651016,
"epoch": 0.00726,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0917654037475586,
"kl": 0.35414266400039196,
"learning_rate": 9.999801985606451e-06,
"loss": -0.0491,
"num_tokens": 19098506.0,
"reward": 0.02918749861419201,
"reward_std": 0.05671289935708046,
"rewards/rollout_reward_func/mean": 0.02918749861419201,
"rewards/rollout_reward_func/std": 0.06964584439992905,
"sampling/importance_sampling_ratio/max": 1.5679417848587036,
"sampling/importance_sampling_ratio/mean": 0.9486284255981445,
"sampling/importance_sampling_ratio/min": 0.35323700308799744,
"sampling/sampling_logp_difference/max": 0.720933198928833,
"sampling/sampling_logp_difference/mean": 0.04150357097387314,
"step": 363,
"step_time": 41.8443327970017
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.36721600964665413,
"epoch": 0.00728,
"grad_norm": 1.1443796157836914,
"kl": 0.3532056175172329,
"learning_rate": 9.999800772668154e-06,
"loss": -0.0496,
"step": 364,
"step_time": 8.918639629009704
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 2879.5,
"completions/mean_terminated_length": 2879.5,
"completions/min_length": 2698.0,
"completions/min_terminated_length": 2698.0,
"entropy": 0.35799355804920197,
"epoch": 0.0073,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.0182232856750488,
"kl": 0.29627991281449795,
"learning_rate": 9.999799556026358e-06,
"loss": -0.0254,
"num_tokens": 19214265.0,
"reward": 0.019375000149011612,
"reward_std": 0.033470965921878815,
"rewards/rollout_reward_func/mean": 0.019375000149011612,
"rewards/rollout_reward_func/std": 0.047785647213459015,
"sampling/importance_sampling_ratio/max": 1.9531196355819702,
"sampling/importance_sampling_ratio/mean": 1.0406684875488281,
"sampling/importance_sampling_ratio/min": 0.38062629103660583,
"sampling/sampling_logp_difference/max": 0.47331881523132324,
"sampling/sampling_logp_difference/mean": 0.03672315180301666,
"step": 365,
"step_time": 41.54418921201432
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.008680555736646056,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008680555736646056,
"entropy": 0.3587690182030201,
"epoch": 0.00732,
"grad_norm": 0.9805796146392822,
"kl": 0.29380563274025917,
"learning_rate": 9.999798335681066e-06,
"loss": -0.028,
"step": 366,
"step_time": 9.014416432997677
},
{
"clip_ratio/high_max": 0.008472222136333585,
"clip_ratio/high_mean": 0.004236111068166792,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0060743463691323996,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3022.0,
"completions/max_terminated_length": 3022.0,
"completions/mean_length": 2685.65625,
"completions/mean_terminated_length": 2685.65625,
"completions/min_length": 479.0,
"completions/min_terminated_length": 479.0,
"entropy": 0.4594216123223305,
"epoch": 0.00734,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1722557544708252,
"kl": 0.7497690804302692,
"learning_rate": 9.99979711163228e-06,
"loss": -0.1539,
"num_tokens": 19323711.0,
"reward": -0.06843750178813934,
"reward_std": 0.2887628674507141,
"rewards/rollout_reward_func/mean": -0.06843750178813934,
"rewards/rollout_reward_func/std": 0.3824210464954376,
"sampling/importance_sampling_ratio/max": 2.1770527362823486,
"sampling/importance_sampling_ratio/mean": 0.8495855331420898,
"sampling/importance_sampling_ratio/min": 0.11624876409769058,
"sampling/sampling_logp_difference/max": 1.3837306499481201,
"sampling/sampling_logp_difference/mean": 0.05317322537302971,
"step": 367,
"step_time": 39.4116383000146
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.004464285913854837,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006200397037900984,
"entropy": 0.4617273025214672,
"epoch": 0.00736,
"grad_norm": 1.0443613529205322,
"kl": 0.7092177867889404,
"learning_rate": 9.999795883880002e-06,
"loss": -0.1565,
"step": 368,
"step_time": 9.343507058991236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3099.0,
"completions/max_terminated_length": 3099.0,
"completions/mean_length": 2932.46875,
"completions/mean_terminated_length": 2932.46875,
"completions/min_length": 2739.0,
"completions/min_terminated_length": 2739.0,
"entropy": 0.37396084517240524,
"epoch": 0.00738,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.0847474336624146,
"kl": 0.45913570187985897,
"learning_rate": 9.999794652424228e-06,
"loss": 0.0163,
"num_tokens": 19441108.0,
"reward": 0.0228124987334013,
"reward_std": 0.034713372588157654,
"rewards/rollout_reward_func/mean": 0.0228124987334013,
"rewards/rollout_reward_func/std": 0.047940440475940704,
"sampling/importance_sampling_ratio/max": 2.0848076343536377,
"sampling/importance_sampling_ratio/mean": 0.9931883811950684,
"sampling/importance_sampling_ratio/min": 0.39272943139076233,
"sampling/sampling_logp_difference/max": 0.545560359954834,
"sampling/sampling_logp_difference/mean": 0.03934935852885246,
"step": 369,
"step_time": 42.14784694199625
},
{
"clip_ratio/high_max": 0.0024999999441206455,
"clip_ratio/high_mean": 0.0012499999720603228,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006458333344198763,
"entropy": 0.3823331892490387,
"epoch": 0.0074,
"grad_norm": 1.1212048530578613,
"kl": 0.45100564509630203,
"learning_rate": 9.999793417264967e-06,
"loss": 0.0129,
"step": 370,
"step_time": 9.197131928005547
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3129.0,
"completions/max_terminated_length": 3129.0,
"completions/mean_length": 2884.8125,
"completions/mean_terminated_length": 2884.8125,
"completions/min_length": 1562.0,
"completions/min_terminated_length": 1562.0,
"entropy": 0.39911437407135963,
"epoch": 0.00742,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.948852002620697,
"kl": 0.4024066887795925,
"learning_rate": 9.999792178402215e-06,
"loss": -0.006,
"num_tokens": 19557067.0,
"reward": 0.052187494933605194,
"reward_std": 0.11198446899652481,
"rewards/rollout_reward_func/mean": 0.052187494933605194,
"rewards/rollout_reward_func/std": 0.23370416462421417,
"sampling/importance_sampling_ratio/max": 1.9198561906814575,
"sampling/importance_sampling_ratio/mean": 0.9719012975692749,
"sampling/importance_sampling_ratio/min": 0.4445561170578003,
"sampling/sampling_logp_difference/max": 0.6034307479858398,
"sampling/sampling_logp_difference/mean": 0.03672725707292557,
"step": 371,
"step_time": 41.623454950007726
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"entropy": 0.40554217994213104,
"epoch": 0.00744,
"grad_norm": 0.936628520488739,
"kl": 0.39416519552469254,
"learning_rate": 9.999790935835974e-06,
"loss": -0.0082,
"step": 372,
"step_time": 9.187493012999767
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3139.0,
"completions/max_terminated_length": 3139.0,
"completions/mean_length": 2842.71875,
"completions/mean_terminated_length": 2836.935302734375,
"completions/min_length": 1748.0,
"completions/min_terminated_length": 1748.0,
"entropy": 0.3921775594353676,
"epoch": 0.00746,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.032710313796997,
"kl": 0.5857649389654398,
"learning_rate": 9.999789689566245e-06,
"loss": 0.0834,
"num_tokens": 19671541.0,
"reward": 0.010624999180436134,
"reward_std": 0.06529676914215088,
"rewards/rollout_reward_func/mean": 0.010624999180436134,
"rewards/rollout_reward_func/std": 0.07615508139133453,
"sampling/importance_sampling_ratio/max": 1.830490231513977,
"sampling/importance_sampling_ratio/mean": 0.9969909191131592,
"sampling/importance_sampling_ratio/min": 0.5150743126869202,
"sampling/sampling_logp_difference/max": 0.8365006446838379,
"sampling/sampling_logp_difference/mean": 0.03898666799068451,
"step": 373,
"step_time": 43.149007094005356
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0024999999441206455,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005972222192212939,
"entropy": 0.3927090987563133,
"epoch": 0.00748,
"grad_norm": 1.5853525400161743,
"kl": 0.5225659962743521,
"learning_rate": 9.999788439593031e-06,
"loss": 0.0768,
"step": 374,
"step_time": 9.183209674003592
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0030381944961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004774305620230734,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3078.0,
"completions/max_terminated_length": 3078.0,
"completions/mean_length": 2853.59375,
"completions/mean_terminated_length": 2853.59375,
"completions/min_length": 2066.0,
"completions/min_terminated_length": 2066.0,
"entropy": 0.49389762803912163,
"epoch": 0.0075,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.9626293182373047,
"kl": 0.34199972450733185,
"learning_rate": 9.999787185916332e-06,
"loss": 0.0764,
"num_tokens": 19786483.0,
"reward": -0.025937503203749657,
"reward_std": 0.10800082981586456,
"rewards/rollout_reward_func/mean": -0.025937503203749657,
"rewards/rollout_reward_func/std": 0.20182807743549347,
"sampling/importance_sampling_ratio/max": 2.4222664833068848,
"sampling/importance_sampling_ratio/mean": 1.0802035331726074,
"sampling/importance_sampling_ratio/min": 0.32061922550201416,
"sampling/sampling_logp_difference/max": 0.8447649478912354,
"sampling/sampling_logp_difference/mean": 0.047866012901067734,
"step": 375,
"step_time": 43.30249591200118
},
{
"clip_ratio/high_max": 0.005434782709926367,
"clip_ratio/high_mean": 0.0027173913549631834,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007925724727101624,
"entropy": 0.4943077266216278,
"epoch": 0.00752,
"grad_norm": 1.645762324333191,
"kl": 0.335026815533638,
"learning_rate": 9.999785928536149e-06,
"loss": 0.0703,
"step": 376,
"step_time": 9.22750177400303
},
{
"clip_ratio/high_max": 0.009444444440305233,
"clip_ratio/high_mean": 0.006458333344198763,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006458333344198763,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2993.0,
"completions/max_terminated_length": 2993.0,
"completions/mean_length": 2863.90625,
"completions/mean_terminated_length": 2863.90625,
"completions/min_length": 2730.0,
"completions/min_terminated_length": 2730.0,
"entropy": 0.4318506419658661,
"epoch": 0.00754,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5574791431427002,
"kl": 0.39671508595347404,
"learning_rate": 9.999784667452484e-06,
"loss": 0.0282,
"num_tokens": 19901626.0,
"reward": 0.048124998807907104,
"reward_std": 0.09666197746992111,
"rewards/rollout_reward_func/mean": 0.048124998807907104,
"rewards/rollout_reward_func/std": 0.21012187004089355,
"sampling/importance_sampling_ratio/max": 1.8046319484710693,
"sampling/importance_sampling_ratio/mean": 0.9822795987129211,
"sampling/importance_sampling_ratio/min": 0.3637678623199463,
"sampling/sampling_logp_difference/max": 0.6638021469116211,
"sampling/sampling_logp_difference/mean": 0.04675694555044174,
"step": 377,
"step_time": 42.498453273998166
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.43226491659879684,
"epoch": 0.00756,
"grad_norm": 1.3629974126815796,
"kl": 0.389215424656868,
"learning_rate": 9.999783402665337e-06,
"loss": 0.0279,
"step": 378,
"step_time": 8.923731430011685
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2952.0,
"completions/max_terminated_length": 2952.0,
"completions/mean_length": 2829.75,
"completions/mean_terminated_length": 2829.75,
"completions/min_length": 2634.0,
"completions/min_terminated_length": 2634.0,
"entropy": 0.41170306876301765,
"epoch": 0.00758,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.12644362449646,
"kl": 0.2883479632437229,
"learning_rate": 9.999782134174711e-06,
"loss": -0.0153,
"num_tokens": 20015346.0,
"reward": 0.04062499850988388,
"reward_std": 0.11773143708705902,
"rewards/rollout_reward_func/mean": 0.04062499850988388,
"rewards/rollout_reward_func/std": 0.2285146564245224,
"sampling/importance_sampling_ratio/max": 1.940473198890686,
"sampling/importance_sampling_ratio/mean": 1.0289239883422852,
"sampling/importance_sampling_ratio/min": 0.442949116230011,
"sampling/sampling_logp_difference/max": 0.48134374618530273,
"sampling/sampling_logp_difference/mean": 0.037492986768484116,
"step": 379,
"step_time": 43.1588148939918
},
{
"clip_ratio/high_max": 0.020833333721384406,
"clip_ratio/high_mean": 0.010416666860692203,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01215277798473835,
"entropy": 0.41560249775648117,
"epoch": 0.0076,
"grad_norm": 1.1223279237747192,
"kl": 0.2790991757065058,
"learning_rate": 9.999780861980606e-06,
"loss": -0.0182,
"step": 380,
"step_time": 8.82811975498771
},
{
"clip_ratio/high_max": 0.0008445946150459349,
"clip_ratio/high_mean": 0.00042229730752296746,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0038945196429267526,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3095.0,
"completions/max_terminated_length": 3032.0,
"completions/mean_length": 2888.90625,
"completions/mean_terminated_length": 2882.258056640625,
"completions/min_length": 2744.0,
"completions/min_terminated_length": 2744.0,
"entropy": 0.46602268517017365,
"epoch": 0.00762,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.0885858535766602,
"kl": 0.31250107660889626,
"learning_rate": 9.999779586083026e-06,
"loss": -0.0936,
"num_tokens": 20131915.0,
"reward": 0.043437499552965164,
"reward_std": 0.09384177625179291,
"rewards/rollout_reward_func/mean": 0.043437499552965164,
"rewards/rollout_reward_func/std": 0.21206526458263397,
"sampling/importance_sampling_ratio/max": 2.0978126525878906,
"sampling/importance_sampling_ratio/mean": 0.9543898105621338,
"sampling/importance_sampling_ratio/min": 0.3171147108078003,
"sampling/sampling_logp_difference/max": 0.505486249923706,
"sampling/sampling_logp_difference/mean": 0.03892179951071739,
"step": 381,
"step_time": 42.904207296000095
},
{
"clip_ratio/high_max": 0.012950450414791703,
"clip_ratio/high_mean": 0.006475225207395852,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009947447455488145,
"entropy": 0.4669545851647854,
"epoch": 0.00764,
"grad_norm": 1.0434297323226929,
"kl": 0.30925088562071323,
"learning_rate": 9.999778306481967e-06,
"loss": -0.0977,
"step": 382,
"step_time": 9.16786321499967
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3044.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 2866.03125,
"completions/mean_terminated_length": 2866.03125,
"completions/min_length": 2663.0,
"completions/min_terminated_length": 2663.0,
"entropy": 0.4193827398121357,
"epoch": 0.00766,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.5363037586212158,
"kl": 0.28570458851754665,
"learning_rate": 9.999777023177434e-06,
"loss": -0.0633,
"num_tokens": 20247423.0,
"reward": 0.012437500059604645,
"reward_std": 0.03864006698131561,
"rewards/rollout_reward_func/mean": 0.012437500059604645,
"rewards/rollout_reward_func/std": 0.04760519042611122,
"sampling/importance_sampling_ratio/max": 1.9009040594100952,
"sampling/importance_sampling_ratio/mean": 1.0575485229492188,
"sampling/importance_sampling_ratio/min": 0.2904524803161621,
"sampling/sampling_logp_difference/max": 0.7339715957641602,
"sampling/sampling_logp_difference/mean": 0.03420642018318176,
"step": 383,
"step_time": 41.434931342999334
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"entropy": 0.4187196418642998,
"epoch": 0.00768,
"grad_norm": 1.4340078830718994,
"kl": 0.28724461793899536,
"learning_rate": 9.999775736169428e-06,
"loss": -0.066,
"step": 384,
"step_time": 9.791899923991878
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008680555620230734,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3010.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 2776.15625,
"completions/mean_terminated_length": 2773.12890625,
"completions/min_length": 1602.0,
"completions/min_terminated_length": 1602.0,
"entropy": 0.45826057717204094,
"epoch": 0.0077,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.7409992218017578,
"kl": 0.3255313076078892,
"learning_rate": 9.99977444545795e-06,
"loss": -0.0843,
"num_tokens": 20360386.0,
"reward": 0.11749999225139618,
"reward_std": 0.20005768537521362,
"rewards/rollout_reward_func/mean": 0.11749999225139618,
"rewards/rollout_reward_func/std": 0.3573288023471832,
"sampling/importance_sampling_ratio/max": 1.6205370426177979,
"sampling/importance_sampling_ratio/mean": 0.9788841009140015,
"sampling/importance_sampling_ratio/min": 0.4105953872203827,
"sampling/sampling_logp_difference/max": 0.9278536438941956,
"sampling/sampling_logp_difference/mean": 0.03812399506568909,
"step": 385,
"step_time": 39.60660175299563
},
{
"clip_ratio/high_max": 0.004248619778081775,
"clip_ratio/high_mean": 0.0021243098890408874,
"clip_ratio/low_mean": 0.003574346425011754,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005698656314052641,
"entropy": 0.45393380895256996,
"epoch": 0.00772,
"grad_norm": 1.9177309274673462,
"kl": 0.339593093842268,
"learning_rate": 9.999773151043e-06,
"loss": -0.0865,
"step": 386,
"step_time": 8.906994643999496
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3082.0,
"completions/max_terminated_length": 3082.0,
"completions/mean_length": 2868.125,
"completions/mean_terminated_length": 2868.125,
"completions/min_length": 2607.0,
"completions/min_terminated_length": 2607.0,
"entropy": 0.42804471775889397,
"epoch": 0.00774,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6265130043029785,
"kl": 0.39343132451176643,
"learning_rate": 9.999771852924581e-06,
"loss": -0.1124,
"num_tokens": 20475280.0,
"reward": 0.009687500074505806,
"reward_std": 0.049455564469099045,
"rewards/rollout_reward_func/mean": 0.009687500074505806,
"rewards/rollout_reward_func/std": 0.05811470001935959,
"sampling/importance_sampling_ratio/max": 2.3641629219055176,
"sampling/importance_sampling_ratio/mean": 1.0422842502593994,
"sampling/importance_sampling_ratio/min": 0.5156167149543762,
"sampling/sampling_logp_difference/max": 0.5555129051208496,
"sampling/sampling_logp_difference/mean": 0.03403444588184357,
"step": 387,
"step_time": 41.99420854899654
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0029861112125217915,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006458333460614085,
"entropy": 0.42400502040982246,
"epoch": 0.00776,
"grad_norm": 1.9294583797454834,
"kl": 0.4124392867088318,
"learning_rate": 9.999770551102692e-06,
"loss": -0.1169,
"step": 388,
"step_time": 9.106344018990058
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3153.0,
"completions/max_terminated_length": 3153.0,
"completions/mean_length": 2858.8125,
"completions/mean_terminated_length": 2858.8125,
"completions/min_length": 1947.0,
"completions/min_terminated_length": 1947.0,
"entropy": 0.4226231873035431,
"epoch": 0.00778,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.3834099769592285,
"kl": 0.3721582032740116,
"learning_rate": 9.999769245577337e-06,
"loss": -0.1146,
"num_tokens": 20590096.0,
"reward": 0.019687499850988388,
"reward_std": 0.05479207634925842,
"rewards/rollout_reward_func/mean": 0.019687499850988388,
"rewards/rollout_reward_func/std": 0.07186386734247208,
"sampling/importance_sampling_ratio/max": 1.890951156616211,
"sampling/importance_sampling_ratio/mean": 1.0242218971252441,
"sampling/importance_sampling_ratio/min": 0.4614587426185608,
"sampling/sampling_logp_difference/max": 0.7055144309997559,
"sampling/sampling_logp_difference/mean": 0.04106433689594269,
"step": 389,
"step_time": 41.78263785800664
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"entropy": 0.4172542728483677,
"epoch": 0.0078,
"grad_norm": 1.3047667741775513,
"kl": 0.3840856868773699,
"learning_rate": 9.999767936348516e-06,
"loss": -0.1166,
"step": 390,
"step_time": 10.408334011008264
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0028409091755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004577020299620926,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3019.0,
"completions/max_terminated_length": 3019.0,
"completions/mean_length": 2887.90625,
"completions/mean_terminated_length": 2887.90625,
"completions/min_length": 2633.0,
"completions/min_terminated_length": 2633.0,
"entropy": 0.44282426685094833,
"epoch": 0.00782,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0893396139144897,
"kl": 0.3564727380871773,
"learning_rate": 9.999766623416231e-06,
"loss": -0.1109,
"num_tokens": 20705858.0,
"reward": -0.017812497913837433,
"reward_std": 0.11623667925596237,
"rewards/rollout_reward_func/mean": -0.017812497913837433,
"rewards/rollout_reward_func/std": 0.2033388316631317,
"sampling/importance_sampling_ratio/max": 1.5181227922439575,
"sampling/importance_sampling_ratio/mean": 0.8538496494293213,
"sampling/importance_sampling_ratio/min": 0.15931598842144012,
"sampling/sampling_logp_difference/max": 1.129029631614685,
"sampling/sampling_logp_difference/mean": 0.04136330261826515,
"step": 391,
"step_time": 41.96269005000795
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"entropy": 0.4361180029809475,
"epoch": 0.00784,
"grad_norm": 1.076133370399475,
"kl": 0.3754804767668247,
"learning_rate": 9.999765306780483e-06,
"loss": -0.1136,
"step": 392,
"step_time": 9.008581123009208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3042.0,
"completions/max_terminated_length": 3042.0,
"completions/mean_length": 2740.0625,
"completions/mean_terminated_length": 2740.0625,
"completions/min_length": 392.0,
"completions/min_terminated_length": 392.0,
"entropy": 0.36063743010163307,
"epoch": 0.00786,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.2201619148254395,
"kl": 0.3248476982116699,
"learning_rate": 9.999763986441271e-06,
"loss": -0.0414,
"num_tokens": 20816808.0,
"reward": -0.019999999552965164,
"reward_std": 0.15125074982643127,
"rewards/rollout_reward_func/mean": -0.019999999552965164,
"rewards/rollout_reward_func/std": 0.30613091588020325,
"sampling/importance_sampling_ratio/max": 1.5498796701431274,
"sampling/importance_sampling_ratio/mean": 1.0666983127593994,
"sampling/importance_sampling_ratio/min": 0.414148211479187,
"sampling/sampling_logp_difference/max": 0.6058452129364014,
"sampling/sampling_logp_difference/mean": 0.03376508131623268,
"step": 393,
"step_time": 40.33082154300064
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0045770201832056046,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.0045770201832056046,
"entropy": 0.3569633737206459,
"epoch": 0.00788,
"grad_norm": 1.142669677734375,
"kl": 0.32624403573572636,
"learning_rate": 9.999762662398599e-06,
"loss": -0.0454,
"step": 394,
"step_time": 9.000440201001766
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 3137.0,
"completions/max_terminated_length": 3137.0,
"completions/mean_length": 2868.34375,
"completions/mean_terminated_length": 2860.0,
"completions/min_length": 1590.0,
"completions/min_terminated_length": 1590.0,
"entropy": 0.3322554640471935,
"epoch": 0.0079,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2661420106887817,
"kl": 0.4089338220655918,
"learning_rate": 9.999761334652469e-06,
"loss": -0.1237,
"num_tokens": 20932270.0,
"reward": 0.07843749970197678,
"reward_std": 0.16960112750530243,
"rewards/rollout_reward_func/mean": 0.07843749970197678,
"rewards/rollout_reward_func/std": 0.25980275869369507,
"sampling/importance_sampling_ratio/max": 1.9775424003601074,
"sampling/importance_sampling_ratio/mean": 0.9813051223754883,
"sampling/importance_sampling_ratio/min": 0.36013737320899963,
"sampling/sampling_logp_difference/max": 0.9966578483581543,
"sampling/sampling_logp_difference/mean": 0.03481311723589897,
"step": 395,
"step_time": 41.39294739000616
},
{
"clip_ratio/high_max": 0.004352503921836615,
"clip_ratio/high_mean": 0.0021762519609183073,
"clip_ratio/low_mean": 0.012500000302679837,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.014676252263598144,
"entropy": 0.32350194081664085,
"epoch": 0.00792,
"grad_norm": 1.1234732866287231,
"kl": 0.42060235887765884,
"learning_rate": 9.999760003202882e-06,
"loss": -0.1291,
"step": 396,
"step_time": 9.728634478997265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3056.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 2912.625,
"completions/mean_terminated_length": 2912.625,
"completions/min_length": 2377.0,
"completions/min_terminated_length": 2377.0,
"entropy": 0.3340754322707653,
"epoch": 0.00794,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8994367122650146,
"kl": 0.4744664132595062,
"learning_rate": 9.999758668049834e-06,
"loss": 0.0768,
"num_tokens": 21049220.0,
"reward": 0.056312501430511475,
"reward_std": 0.11309725791215897,
"rewards/rollout_reward_func/mean": 0.056312501430511475,
"rewards/rollout_reward_func/std": 0.22624170780181885,
"sampling/importance_sampling_ratio/max": 1.8281199932098389,
"sampling/importance_sampling_ratio/mean": 0.8943976163864136,
"sampling/importance_sampling_ratio/min": 0.25516989827156067,
"sampling/sampling_logp_difference/max": 0.9102246761322021,
"sampling/sampling_logp_difference/mean": 0.04049454256892204,
"step": 397,
"step_time": 41.44624646900047
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.008680555736646056,
"entropy": 0.32801349833607674,
"epoch": 0.00796,
"grad_norm": 0.8551763892173767,
"kl": 0.4777488671243191,
"learning_rate": 9.999757329193334e-06,
"loss": 0.0744,
"step": 398,
"step_time": 9.080709051006124
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0029861110961064696,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0047222222201526165,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3098.0,
"completions/max_terminated_length": 3098.0,
"completions/mean_length": 2884.09375,
"completions/mean_terminated_length": 2884.09375,
"completions/min_length": 2236.0,
"completions/min_terminated_length": 2236.0,
"entropy": 0.3948609419167042,
"epoch": 0.00798,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4350974559783936,
"kl": 0.5110477805137634,
"learning_rate": 9.999755986633378e-06,
"loss": 0.0408,
"num_tokens": 21165049.0,
"reward": 0.00624999962747097,
"reward_std": 0.03348710387945175,
"rewards/rollout_reward_func/mean": 0.00624999962747097,
"rewards/rollout_reward_func/std": 0.04293655976653099,
"sampling/importance_sampling_ratio/max": 2.2736153602600098,
"sampling/importance_sampling_ratio/mean": 0.9841665029525757,
"sampling/importance_sampling_ratio/min": 0.16349110007286072,
"sampling/sampling_logp_difference/max": 0.5857527256011963,
"sampling/sampling_logp_difference/mean": 0.0461178719997406,
"step": 399,
"step_time": 41.288154925001436
},
{
"clip_ratio/high_max": 0.020833333721384406,
"clip_ratio/high_mean": 0.010416666860692203,
"clip_ratio/low_mean": 0.009930555592291057,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02034722233656794,
"entropy": 0.3919810652732849,
"epoch": 0.008,
"grad_norm": 1.3463600873947144,
"kl": 0.534088172018528,
"learning_rate": 9.999754640369969e-06,
"loss": 0.035,
"step": 400,
"step_time": 9.14792783199664
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3113.0,
"completions/max_terminated_length": 3113.0,
"completions/mean_length": 2888.625,
"completions/mean_terminated_length": 2888.625,
"completions/min_length": 2569.0,
"completions/min_terminated_length": 2569.0,
"entropy": 0.34803007543087006,
"epoch": 0.00802,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1884585618972778,
"kl": 0.4120374508202076,
"learning_rate": 9.99975329040311e-06,
"loss": -0.0251,
"num_tokens": 21280860.0,
"reward": -0.020624998956918716,
"reward_std": 0.09246805310249329,
"rewards/rollout_reward_func/mean": -0.020624998956918716,
"rewards/rollout_reward_func/std": 0.1875123679637909,
"sampling/importance_sampling_ratio/max": 2.7591519355773926,
"sampling/importance_sampling_ratio/mean": 1.0442249774932861,
"sampling/importance_sampling_ratio/min": 0.558386504650116,
"sampling/sampling_logp_difference/max": 0.5773515701293945,
"sampling/sampling_logp_difference/mean": 0.03956456109881401,
"step": 401,
"step_time": 43.588559673997224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.006560457521118224,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006560457521118224,
"entropy": 0.34330132976174355,
"epoch": 0.00804,
"grad_norm": 1.2133526802062988,
"kl": 0.42174356430768967,
"learning_rate": 9.9997519367328e-06,
"loss": -0.0269,
"step": 402,
"step_time": 9.196838259005744
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00046296295477077365,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00046296295477077365,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 3085.0,
"completions/max_terminated_length": 3085.0,
"completions/mean_length": 2836.3125,
"completions/mean_terminated_length": 2866.43359375,
"completions/min_length": 1967.0,
"completions/min_terminated_length": 2182.0,
"entropy": 0.38612882420420647,
"epoch": 0.00806,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.1789888143539429,
"kl": 0.5264580063521862,
"learning_rate": 9.999750579359042e-06,
"loss": -0.0421,
"num_tokens": 21395052.0,
"reward": -0.0034374999813735485,
"reward_std": 0.13805632293224335,
"rewards/rollout_reward_func/mean": -0.0034374999813735485,
"rewards/rollout_reward_func/std": 0.2645215690135956,
"sampling/importance_sampling_ratio/max": 1.6176923513412476,
"sampling/importance_sampling_ratio/mean": 0.8611575365066528,
"sampling/importance_sampling_ratio/min": 0.04078269377350807,
"sampling/sampling_logp_difference/max": 1.5039923191070557,
"sampling/sampling_logp_difference/mean": 0.038096338510513306,
"step": 403,
"step_time": 41.359143236004456
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.006714665098115802,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010186887346208096,
"entropy": 0.38065794110298157,
"epoch": 0.00808,
"grad_norm": 0.9908080101013184,
"kl": 0.5645314268767834,
"learning_rate": 9.999749218281836e-06,
"loss": -0.0446,
"step": 404,
"step_time": 9.103962293986115
}
],
"logging_steps": 1.0,
"max_steps": 100000,
"num_input_tokens_seen": 21395052,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}