{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0075, "eval_steps": 500, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.53125, "completions/mean_terminated_length": 6.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.6140154600143433, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.031173178926110268, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0083, "num_tokens": 51512.0, "reward": -18.692380905151367, "reward_std": 12.806804656982422, "rewards/rollout_reward_func/mean": -18.692380905151367, "rewards/rollout_reward_func/std": 12.806804656982422, "sampling/importance_sampling_ratio/max": 0.23727846145629883, "sampling/importance_sampling_ratio/mean": 0.04031756892800331, "sampling/importance_sampling_ratio/min": 3.481591903664594e-08, "sampling/sampling_logp_difference/max": 2.1349408626556396, "sampling/sampling_logp_difference/mean": 0.45596328377723694, "step": 1, "step_time": 16.312519789998987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6140154600143433, "epoch": 2e-05, "grad_norm": 0.031002141535282135, "kl": 0.0, "learning_rate": 2.2857142857142855e-07, "loss": -0.0083, "step": 2, "step_time": 3.799612433999755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.09375, "completions/mean_terminated_length": 5.583333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.820828855037689, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.047646887600421906, "kl": 0.00026412515580886975, "learning_rate": 4.571428571428571e-07, "loss": -0.0082, "num_tokens": 100979.0, "reward": -22.612224578857422, "reward_std": 11.96308422088623, "rewards/rollout_reward_func/mean": -22.612224578857422, "rewards/rollout_reward_func/std": 11.96308422088623, "sampling/importance_sampling_ratio/max": 0.22435404360294342, "sampling/importance_sampling_ratio/mean": 0.04121093451976776, "sampling/importance_sampling_ratio/min": 1.1456299944256898e-06, "sampling/sampling_logp_difference/max": 2.6010499000549316, "sampling/sampling_logp_difference/mean": 0.5101106762886047, "step": 3, "step_time": 15.371164268001849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.822629690170288, "epoch": 4e-05, "grad_norm": 0.04581320285797119, "kl": 0.00025257498782593757, "learning_rate": 6.857142857142857e-07, "loss": -0.0083, "step": 4, "step_time": 3.7135908559994277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 11.8125, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.753183960914612, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.036030594259500504, "kl": 0.00027985989436274394, "learning_rate": 9.142857142857142e-07, "loss": -0.0086, "num_tokens": 151195.0, "reward": -20.888906478881836, "reward_std": 14.63569450378418, "rewards/rollout_reward_func/mean": -20.888906478881836, "rewards/rollout_reward_func/std": 14.635693550109863, "sampling/importance_sampling_ratio/max": 0.4147861897945404, "sampling/importance_sampling_ratio/mean": 0.055877745151519775, "sampling/importance_sampling_ratio/min": 1.1205758543297861e-10, "sampling/sampling_logp_difference/max": 2.4953346252441406, "sampling/sampling_logp_difference/mean": 0.5026437044143677, "step": 5, "step_time": 15.295146061998821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.759257435798645, "epoch": 6e-05, "grad_norm": 0.03551256284117699, "kl": 0.0002325155001017265, "learning_rate": 1.1428571428571428e-06, "loss": -0.0087, "step": 6, "step_time": 3.8558542879991364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.78125, "completions/mean_terminated_length": 6.357142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.830631911754608, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.042188528925180435, "kl": 0.0002670372086868156, "learning_rate": 1.3714285714285715e-06, "loss": -0.0102, "num_tokens": 202412.0, "reward": -15.936664581298828, "reward_std": 15.54003620147705, "rewards/rollout_reward_func/mean": -15.936664581298828, "rewards/rollout_reward_func/std": 15.54003620147705, "sampling/importance_sampling_ratio/max": 0.37060609459877014, "sampling/importance_sampling_ratio/mean": 0.06183728575706482, "sampling/importance_sampling_ratio/min": 3.771448220390994e-08, "sampling/sampling_logp_difference/max": 2.2962136268615723, "sampling/sampling_logp_difference/mean": 0.5226446390151978, "step": 7, "step_time": 16.190033603000302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8242526054382324, "epoch": 8e-05, "grad_norm": 0.04301215335726738, "kl": 0.0002708155516302213, "learning_rate": 1.6e-06, "loss": -0.0103, "step": 8, "step_time": 4.357938441999067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.125, "completions/mean_terminated_length": 5.777777671813965, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.6884061694145203, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.032209765166044235, "kl": 0.0002783657910185866, "learning_rate": 1.8285714285714284e-06, "loss": -0.0072, "num_tokens": 252904.0, "reward": -20.980567932128906, "reward_std": 12.941821098327637, "rewards/rollout_reward_func/mean": -20.980567932128906, "rewards/rollout_reward_func/std": 12.94182014465332, "sampling/importance_sampling_ratio/max": 0.4079720675945282, "sampling/importance_sampling_ratio/mean": 0.04804783686995506, "sampling/importance_sampling_ratio/min": 2.1960536287224386e-06, "sampling/sampling_logp_difference/max": 2.331645965576172, "sampling/sampling_logp_difference/mean": 0.49316897988319397, "step": 9, "step_time": 15.089990179999404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6856271028518677, "epoch": 0.0001, "grad_norm": 0.03318795934319496, "kl": 0.00029193030786700547, "learning_rate": 2.057142857142857e-06, "loss": -0.0071, "step": 10, "step_time": 3.7561135510022723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.21875, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.021190762519836, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.05151060223579407, "kl": 0.000337009274517186, "learning_rate": 2.2857142857142856e-06, "loss": -0.0103, "num_tokens": 305176.0, "reward": -22.636486053466797, "reward_std": 11.091877937316895, "rewards/rollout_reward_func/mean": -22.636486053466797, "rewards/rollout_reward_func/std": 11.091876983642578, "sampling/importance_sampling_ratio/max": 0.42275112867355347, "sampling/importance_sampling_ratio/mean": 0.07614487409591675, "sampling/importance_sampling_ratio/min": 1.113817260645078e-09, "sampling/sampling_logp_difference/max": 2.172708749771118, "sampling/sampling_logp_difference/mean": 0.5632918477058411, "step": 11, "step_time": 16.066931891999957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.021433472633362, "epoch": 0.00012, "grad_norm": 0.04956513270735741, "kl": 0.0003119707944279071, "learning_rate": 2.5142857142857142e-06, "loss": -0.0102, "step": 12, "step_time": 3.9120434480000768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 13.09375, "completions/mean_terminated_length": 5.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.9536149501800537, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 0.0228031724691391, "kl": 0.00028348819250823, "learning_rate": 2.742857142857143e-06, "loss": -0.0036, "num_tokens": 356402.0, "reward": -23.272144317626953, "reward_std": 11.115742683410645, "rewards/rollout_reward_func/mean": -23.272144317626953, "rewards/rollout_reward_func/std": 11.115741729736328, "sampling/importance_sampling_ratio/max": 0.2914349436759949, "sampling/importance_sampling_ratio/mean": 0.036564625799655914, "sampling/importance_sampling_ratio/min": 2.532835940982636e-09, "sampling/sampling_logp_difference/max": 2.230201482772827, "sampling/sampling_logp_difference/mean": 0.5622317790985107, "step": 13, "step_time": 15.476010878000125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.952685594558716, "epoch": 0.00014, "grad_norm": 0.022675946354866028, "kl": 0.00031018767913337797, "learning_rate": 2.9714285714285716e-06, "loss": -0.0035, "step": 14, "step_time": 3.7448571440017986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.78125, "completions/mean_terminated_length": 5.615385055541992, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.6398487091064453, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.03931271284818649, "kl": 0.00028440320602385327, "learning_rate": 3.2e-06, "loss": -0.0019, "num_tokens": 407842.0, "reward": -20.04269027709961, "reward_std": 9.898455619812012, "rewards/rollout_reward_func/mean": -20.04269027709961, "rewards/rollout_reward_func/std": 9.898455619812012, "sampling/importance_sampling_ratio/max": 0.2702883780002594, "sampling/importance_sampling_ratio/mean": 0.051980484277009964, "sampling/importance_sampling_ratio/min": 3.938894224120304e-06, "sampling/sampling_logp_difference/max": 2.0124897956848145, "sampling/sampling_logp_difference/mean": 0.4658633768558502, "step": 15, "step_time": 14.951290131999485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.633188545703888, "epoch": 0.00016, "grad_norm": 0.0432465635240078, "kl": 0.0004554625047603622, "learning_rate": 3.428571428571428e-06, "loss": -0.0022, "step": 16, "step_time": 3.7893396370009214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 13.34375, "completions/mean_terminated_length": 5.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.759763181209564, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.02176361344754696, "kl": 0.0004362140316516161, "learning_rate": 3.657142857142857e-06, "loss": -0.0042, "num_tokens": 461766.0, "reward": -20.33690643310547, "reward_std": 12.035628318786621, "rewards/rollout_reward_func/mean": -20.33690643310547, "rewards/rollout_reward_func/std": 12.035626411437988, "sampling/importance_sampling_ratio/max": 0.38780006766319275, "sampling/importance_sampling_ratio/mean": 0.04404982179403305, "sampling/importance_sampling_ratio/min": 4.119255834211799e-07, "sampling/sampling_logp_difference/max": 2.2223212718963623, "sampling/sampling_logp_difference/mean": 0.5080235004425049, "step": 17, "step_time": 17.07055659800062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7581313252449036, "epoch": 0.00018, "grad_norm": 0.021057378500699997, "kl": 0.0005332547370926477, "learning_rate": 3.885714285714286e-06, "loss": -0.0042, "step": 18, "step_time": 4.519106182999167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 7.529411792755127, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.652438700199127, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.04074760898947716, "kl": 0.0011479635286377743, "learning_rate": 4.114285714285714e-06, "loss": -0.006, "num_tokens": 512133.0, "reward": -19.293210983276367, "reward_std": 13.340627670288086, "rewards/rollout_reward_func/mean": -19.293210983276367, "rewards/rollout_reward_func/std": 13.340627670288086, "sampling/importance_sampling_ratio/max": 0.45808351039886475, "sampling/importance_sampling_ratio/mean": 0.068780317902565, "sampling/importance_sampling_ratio/min": 1.537143212715364e-08, "sampling/sampling_logp_difference/max": 3.4627504348754883, "sampling/sampling_logp_difference/mean": 0.4878249764442444, "step": 19, "step_time": 15.33414286900097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6475561261177063, "epoch": 0.0002, "grad_norm": 0.04242050647735596, "kl": 0.0014431287127081305, "learning_rate": 4.342857142857142e-06, "loss": -0.0061, "step": 20, "step_time": 3.7389318300001833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.125, "completions/mean_terminated_length": 7.142857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.2954264879226685, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.03284507617354393, "kl": 0.0009923928882926702, "learning_rate": 4.571428571428571e-06, "loss": -0.0065, "num_tokens": 564450.0, "reward": -16.966655731201172, "reward_std": 15.528156280517578, "rewards/rollout_reward_func/mean": -16.966655731201172, "rewards/rollout_reward_func/std": 15.528157234191895, "sampling/importance_sampling_ratio/max": 0.28076866269111633, "sampling/importance_sampling_ratio/mean": 0.041839905083179474, "sampling/importance_sampling_ratio/min": 2.1938219296746553e-10, "sampling/sampling_logp_difference/max": 2.781865358352661, "sampling/sampling_logp_difference/mean": 0.617704451084137, "step": 21, "step_time": 16.402419639999607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.290382981300354, "epoch": 0.00022, "grad_norm": 0.03325243666768074, "kl": 0.0013481528731063008, "learning_rate": 4.8e-06, "loss": -0.0066, "step": 22, "step_time": 3.922172239002066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.59375, "completions/mean_terminated_length": 6.8947367668151855, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.7868679761886597, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 0.059608493000268936, "kl": 0.0032771070254966617, "learning_rate": 5.0285714285714285e-06, "loss": -0.0076, "num_tokens": 618119.0, "reward": -14.567885398864746, "reward_std": 14.219583511352539, "rewards/rollout_reward_func/mean": -14.567885398864746, "rewards/rollout_reward_func/std": 14.219583511352539, "sampling/importance_sampling_ratio/max": 0.37001144886016846, "sampling/importance_sampling_ratio/mean": 0.09722670167684555, "sampling/importance_sampling_ratio/min": 5.300555550036279e-09, "sampling/sampling_logp_difference/max": 2.6479554176330566, "sampling/sampling_logp_difference/mean": 0.49056705832481384, "step": 23, "step_time": 17.07615896399875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7781103253364563, "epoch": 0.00024, "grad_norm": 0.060081060975790024, "kl": 0.0038230472709983587, "learning_rate": 5.257142857142857e-06, "loss": -0.0078, "step": 24, "step_time": 4.032116533999215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.9375, "completions/mean_terminated_length": 7.473684310913086, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.648766279220581, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 0.05247579514980316, "kl": 0.0022041110205464065, "learning_rate": 5.485714285714286e-06, "loss": -0.0104, "num_tokens": 670919.0, "reward": -17.022605895996094, "reward_std": 11.738938331604004, "rewards/rollout_reward_func/mean": -17.022605895996094, "rewards/rollout_reward_func/std": 11.738938331604004, "sampling/importance_sampling_ratio/max": 0.4829924702644348, "sampling/importance_sampling_ratio/mean": 0.09604013711214066, "sampling/importance_sampling_ratio/min": 4.846964496429962e-10, "sampling/sampling_logp_difference/max": 2.9055347442626953, "sampling/sampling_logp_difference/mean": 0.4915693700313568, "step": 25, "step_time": 17.745348843998727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6338467597961426, "epoch": 0.00026, "grad_norm": 0.05278094857931137, "kl": 0.0026012091839220375, "learning_rate": 5.7142857142857145e-06, "loss": -0.0104, "step": 26, "step_time": 4.5060613030027525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.125, "completions/mean_terminated_length": 5.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.3268803358078003, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 0.07933399826288223, "kl": 0.005449813150335103, "learning_rate": 5.942857142857143e-06, "loss": -0.0254, "num_tokens": 726566.0, "reward": -16.93682098388672, "reward_std": 12.645853996276855, "rewards/rollout_reward_func/mean": -16.93682098388672, "rewards/rollout_reward_func/std": 12.645853996276855, "sampling/importance_sampling_ratio/max": 0.44983530044555664, "sampling/importance_sampling_ratio/mean": 0.13893800973892212, "sampling/importance_sampling_ratio/min": 6.259582005441189e-05, "sampling/sampling_logp_difference/max": 1.6426305770874023, "sampling/sampling_logp_difference/mean": 0.3998596668243408, "step": 27, "step_time": 16.15029823400073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3023842573165894, "epoch": 0.00028, "grad_norm": 0.07852059602737427, "kl": 0.007373624248430133, "learning_rate": 6.171428571428571e-06, "loss": -0.026, "step": 28, "step_time": 4.406761196999469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 5.538461685180664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.8288868069648743, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.03285776823759079, "kl": 0.006483525270596147, "learning_rate": 6.4e-06, "loss": -0.0056, "num_tokens": 780418.0, "reward": -18.308406829833984, "reward_std": 14.760459899902344, "rewards/rollout_reward_func/mean": -18.308406829833984, "rewards/rollout_reward_func/std": 14.760459899902344, "sampling/importance_sampling_ratio/max": 0.3498446047306061, "sampling/importance_sampling_ratio/mean": 0.054024189710617065, "sampling/importance_sampling_ratio/min": 7.4122565862921874e-09, "sampling/sampling_logp_difference/max": 2.6132895946502686, "sampling/sampling_logp_difference/mean": 0.5108392238616943, "step": 29, "step_time": 16.456739635000304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8066287636756897, "epoch": 0.0003, "grad_norm": 0.03431409224867821, "kl": 0.007965424680151045, "learning_rate": 6.628571428571428e-06, "loss": -0.0057, "step": 30, "step_time": 3.915029690000665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 5.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.600776791572571, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 0.05127133056521416, "kl": 0.019262989284470677, "learning_rate": 6.857142857142856e-06, "loss": -0.0141, "num_tokens": 833688.0, "reward": -12.870410919189453, "reward_std": 13.22743034362793, "rewards/rollout_reward_func/mean": -12.870410919189453, "rewards/rollout_reward_func/std": 13.22743034362793, "sampling/importance_sampling_ratio/max": 0.5499036312103271, "sampling/importance_sampling_ratio/mean": 0.1407327950000763, "sampling/importance_sampling_ratio/min": 1.473029669796233e-06, "sampling/sampling_logp_difference/max": 2.385876417160034, "sampling/sampling_logp_difference/mean": 0.4969593584537506, "step": 31, "step_time": 16.413537358000212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.54784619808197, "epoch": 0.00032, "grad_norm": 0.05661388114094734, "kl": 0.02403791481629014, "learning_rate": 7.085714285714285e-06, "loss": -0.0143, "step": 32, "step_time": 3.9106435180019616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.65625, "completions/mean_terminated_length": 5.315789699554443, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.4069011211395264, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.10634185373783112, "kl": 0.015931935980916023, "learning_rate": 7.314285714285714e-06, "loss": -0.0235, "num_tokens": 886285.0, "reward": -17.876117706298828, "reward_std": 14.024675369262695, "rewards/rollout_reward_func/mean": -17.876117706298828, "rewards/rollout_reward_func/std": 14.024675369262695, "sampling/importance_sampling_ratio/max": 0.6559111475944519, "sampling/importance_sampling_ratio/mean": 0.1724713146686554, "sampling/importance_sampling_ratio/min": 2.6141703912685443e-08, "sampling/sampling_logp_difference/max": 1.8154839277267456, "sampling/sampling_logp_difference/mean": 0.4429629147052765, "step": 33, "step_time": 17.23916837899924 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 3.362492322921753, "epoch": 0.00034, "grad_norm": 0.11059129983186722, "kl": 0.019211921375244856, "learning_rate": 7.542857142857142e-06, "loss": -0.0241, "step": 34, "step_time": 3.7984450850008216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 5.190476417541504, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.392121970653534, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 0.08997725695371628, "kl": 0.04110337048768997, "learning_rate": 7.771428571428572e-06, "loss": -0.0182, "num_tokens": 943138.0, "reward": -14.276116371154785, "reward_std": 14.80237102508545, "rewards/rollout_reward_func/mean": -14.276116371154785, "rewards/rollout_reward_func/std": 14.802370071411133, "sampling/importance_sampling_ratio/max": 0.5592784881591797, "sampling/importance_sampling_ratio/mean": 0.2044554054737091, "sampling/importance_sampling_ratio/min": 1.838321963987255e-06, "sampling/sampling_logp_difference/max": 2.2490005493164062, "sampling/sampling_logp_difference/mean": 0.42422500252723694, "step": 35, "step_time": 17.073514709999472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3112645149230957, "epoch": 0.00036, "grad_norm": 0.09390687942504883, "kl": 0.04649016307666898, "learning_rate": 8e-06, "loss": -0.0186, "step": 36, "step_time": 3.9646652889978213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 5.039999961853027, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.1206422448158264, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 0.0959034189581871, "kl": 0.07994297333061695, "learning_rate": 7.999999999907456e-06, "loss": -0.0277, "num_tokens": 996585.0, "reward": -12.600645065307617, "reward_std": 14.105387687683105, "rewards/rollout_reward_func/mean": -12.600645065307617, "rewards/rollout_reward_func/std": 14.105386734008789, "sampling/importance_sampling_ratio/max": 0.8192175626754761, "sampling/importance_sampling_ratio/mean": 0.33474093675613403, "sampling/importance_sampling_ratio/min": 1.5594495152981835e-06, "sampling/sampling_logp_difference/max": 2.0706448554992676, "sampling/sampling_logp_difference/mean": 0.39812666177749634, "step": 37, "step_time": 16.968741906000105 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.9896836280822754, "epoch": 0.00038, "grad_norm": 0.08371906727552414, "kl": 0.09319737134501338, "learning_rate": 7.999999999629824e-06, "loss": -0.029, "step": 38, "step_time": 4.557126382000206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 4.761904716491699, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.608646273612976, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 0.08364315330982208, "kl": 0.10081169568002224, "learning_rate": 7.999999999167105e-06, "loss": -0.0392, "num_tokens": 1050584.0, "reward": -14.455350875854492, "reward_std": 11.706289291381836, "rewards/rollout_reward_func/mean": -14.455350875854492, "rewards/rollout_reward_func/std": 11.706289291381836, "sampling/importance_sampling_ratio/max": 1.1295586824417114, "sampling/importance_sampling_ratio/mean": 0.39024534821510315, "sampling/importance_sampling_ratio/min": 4.6722266233700793e-07, "sampling/sampling_logp_difference/max": 2.06076717376709, "sampling/sampling_logp_difference/mean": 0.3188022971153259, "step": 39, "step_time": 16.849725108999337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027173913549631834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 2.4867441654205322, "epoch": 0.0004, "grad_norm": 0.08225726336240768, "kl": 0.12041333597153425, "learning_rate": 7.9999999985193e-06, "loss": -0.0404, "step": 40, "step_time": 3.895338659001027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 5.34615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4050289392471313, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 0.1407051682472229, "kl": 0.13318810425698757, "learning_rate": 7.999999997686407e-06, "loss": -0.038, "num_tokens": 1105766.0, "reward": -11.147544860839844, "reward_std": 12.305276870727539, "rewards/rollout_reward_func/mean": -11.147544860839844, "rewards/rollout_reward_func/std": 12.305276870727539, "sampling/importance_sampling_ratio/max": 1.0643306970596313, "sampling/importance_sampling_ratio/mean": 0.5046571493148804, "sampling/importance_sampling_ratio/min": 1.800249629013706e-05, "sampling/sampling_logp_difference/max": 2.292391300201416, "sampling/sampling_logp_difference/mean": 0.3037129342556, "step": 41, "step_time": 18.443871225000294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 2.302206367254257, "epoch": 0.00042, "grad_norm": 0.1491519808769226, "kl": 0.157350717112422, "learning_rate": 7.999999996668426e-06, "loss": -0.0388, "step": 42, "step_time": 3.9447429090005244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8303978443145752, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.25971972942352295, "kl": 0.32411492243409157, "learning_rate": 7.999999995465356e-06, "loss": -0.0334, "num_tokens": 1160523.0, "reward": -10.353226661682129, "reward_std": 10.783092498779297, "rewards/rollout_reward_func/mean": -10.353226661682129, "rewards/rollout_reward_func/std": 10.783092498779297, "sampling/importance_sampling_ratio/max": 1.3089326620101929, "sampling/importance_sampling_ratio/mean": 0.6952589750289917, "sampling/importance_sampling_ratio/min": 5.0103026296710595e-05, "sampling/sampling_logp_difference/max": 3.2104599475860596, "sampling/sampling_logp_difference/mean": 0.23429559171199799, "step": 43, "step_time": 16.653257252000913 }, { "clip_ratio/high_max": 0.10219156090170145, "clip_ratio/high_mean": 0.03348058229312301, "clip_ratio/low_mean": 0.011101973708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.044582555536180735, "entropy": 1.7575804591178894, "epoch": 0.00044, "grad_norm": 0.23699837923049927, "kl": 0.3579994738101959, "learning_rate": 7.9999999940772e-06, "loss": -0.0356, "step": 44, "step_time": 3.910400990999733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 4.759999752044678, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9243027865886688, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.21503277122974396, "kl": 0.14009490422904491, "learning_rate": 7.999999992503956e-06, "loss": -0.0717, "num_tokens": 1211990.0, "reward": -11.69435977935791, "reward_std": 12.643112182617188, "rewards/rollout_reward_func/mean": -11.69435977935791, "rewards/rollout_reward_func/std": 12.643111228942871, "sampling/importance_sampling_ratio/max": 1.786710262298584, "sampling/importance_sampling_ratio/mean": 0.6511831283569336, "sampling/importance_sampling_ratio/min": 0.00011509961768751964, "sampling/sampling_logp_difference/max": 1.6943392753601074, "sampling/sampling_logp_difference/mean": 0.26553356647491455, "step": 45, "step_time": 16.96633616299914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8838016390800476, "epoch": 0.00046, "grad_norm": 0.22245362401008606, "kl": 0.15043162181973457, "learning_rate": 7.999999990745626e-06, "loss": -0.0732, "step": 46, "step_time": 4.3081432599992695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0557123720645905, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 0.20216670632362366, "kl": 0.21005860716104507, "learning_rate": 7.999999988802207e-06, "loss": -0.031, "num_tokens": 1268232.0, "reward": -11.173686027526855, "reward_std": 12.839822769165039, "rewards/rollout_reward_func/mean": -11.173686027526855, "rewards/rollout_reward_func/std": 12.839823722839355, "sampling/importance_sampling_ratio/max": 1.7709037065505981, "sampling/importance_sampling_ratio/mean": 0.7376168966293335, "sampling/importance_sampling_ratio/min": 0.00014609003846999258, "sampling/sampling_logp_difference/max": 1.6002585887908936, "sampling/sampling_logp_difference/mean": 0.28716325759887695, "step": 47, "step_time": 16.63276727200082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.023777514696121, "epoch": 0.00048, "grad_norm": 0.19804874062538147, "kl": 0.22359632328152657, "learning_rate": 7.999999986673701e-06, "loss": -0.0318, "step": 48, "step_time": 4.446633482000834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 5.137930870056152, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8786053955554962, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 0.35645022988319397, "kl": 0.2930166721343994, "learning_rate": 7.999999984360109e-06, "loss": -0.0859, "num_tokens": 1319209.0, "reward": -12.124788284301758, "reward_std": 15.002610206604004, "rewards/rollout_reward_func/mean": -12.124788284301758, "rewards/rollout_reward_func/std": 15.002609252929688, "sampling/importance_sampling_ratio/max": 1.6997944116592407, "sampling/importance_sampling_ratio/mean": 0.5894205570220947, "sampling/importance_sampling_ratio/min": 0.0016469202237203717, "sampling/sampling_logp_difference/max": 1.475503921508789, "sampling/sampling_logp_difference/mean": 0.28066980838775635, "step": 49, "step_time": 16.970838325999466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.835387498140335, "epoch": 0.0005, "grad_norm": 0.33079567551612854, "kl": 0.32937128841876984, "learning_rate": 7.999999981861428e-06, "loss": -0.0894, "step": 50, "step_time": 3.784620336000444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 5.318181991577148, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5209531486034393, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 0.562506377696991, "kl": 1.2470178231596947, "learning_rate": 7.99999997917766e-06, "loss": -0.0532, "num_tokens": 1374256.0, "reward": -15.260754585266113, "reward_std": 13.334301948547363, "rewards/rollout_reward_func/mean": -15.260754585266113, "rewards/rollout_reward_func/std": 13.334300994873047, "sampling/importance_sampling_ratio/max": 2.7327239513397217, "sampling/importance_sampling_ratio/mean": 0.49202102422714233, "sampling/importance_sampling_ratio/min": 7.784686886225245e-07, "sampling/sampling_logp_difference/max": 3.982198715209961, "sampling/sampling_logp_difference/mean": 0.4385358393192291, "step": 51, "step_time": 17.43475549499999 }, { "clip_ratio/high_max": 0.033333334140479565, "clip_ratio/high_mean": 0.008333333535119891, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01614583353511989, "entropy": 2.5165918469429016, "epoch": 0.00052, "grad_norm": 0.30679789185523987, "kl": 0.862664507701993, "learning_rate": 7.999999976308803e-06, "loss": -0.0575, "step": 52, "step_time": 3.9217220809996434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.749287188053131, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.16565142571926117, "kl": 0.580406965687871, "learning_rate": 7.99999997325486e-06, "loss": -0.0522, "num_tokens": 1429021.0, "reward": -11.922074317932129, "reward_std": 11.131998062133789, "rewards/rollout_reward_func/mean": -11.922074317932129, "rewards/rollout_reward_func/std": 11.131998062133789, "sampling/importance_sampling_ratio/max": 2.066969633102417, "sampling/importance_sampling_ratio/mean": 0.6679714322090149, "sampling/importance_sampling_ratio/min": 0.0002583549648988992, "sampling/sampling_logp_difference/max": 3.0659780502319336, "sampling/sampling_logp_difference/mean": 0.3197592496871948, "step": 53, "step_time": 17.996286176000467 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011284722248092294, "entropy": 1.7498438358306885, "epoch": 0.00054, "grad_norm": 0.14138051867485046, "kl": 0.5468345917761326, "learning_rate": 7.99999997001583e-06, "loss": -0.0531, "step": 54, "step_time": 3.8620070240012865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.740740776062012, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7488270699977875, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.31030309200286865, "kl": 0.6247325241565704, "learning_rate": 7.999999966591712e-06, "loss": -0.0304, "num_tokens": 1487154.0, "reward": -8.613126754760742, "reward_std": 13.516670227050781, "rewards/rollout_reward_func/mean": -8.613126754760742, "rewards/rollout_reward_func/std": 13.516670227050781, "sampling/importance_sampling_ratio/max": 1.7777390480041504, "sampling/importance_sampling_ratio/mean": 0.6633009910583496, "sampling/importance_sampling_ratio/min": 9.032462548930198e-05, "sampling/sampling_logp_difference/max": 2.094834327697754, "sampling/sampling_logp_difference/mean": 0.34423574805259705, "step": 55, "step_time": 19.064140515000872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7735461294651031, "epoch": 0.00056, "grad_norm": 0.29576367139816284, "kl": 0.5696428194642067, "learning_rate": 7.999999962982505e-06, "loss": -0.0318, "step": 56, "step_time": 4.42871877000016 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 5.480000019073486, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8830964267253876, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 0.20293869078159332, "kl": 0.44318973273038864, "learning_rate": 7.999999959188212e-06, "loss": -0.0893, "num_tokens": 1541595.0, "reward": -10.316732406616211, "reward_std": 8.490743637084961, "rewards/rollout_reward_func/mean": -10.316732406616211, "rewards/rollout_reward_func/std": 8.490743637084961, "sampling/importance_sampling_ratio/max": 2.3016879558563232, "sampling/importance_sampling_ratio/mean": 0.754888653755188, "sampling/importance_sampling_ratio/min": 1.4722305650138878e-06, "sampling/sampling_logp_difference/max": 2.461587905883789, "sampling/sampling_logp_difference/mean": 0.34106218814849854, "step": 57, "step_time": 17.752485323000656 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01006944477558136, "entropy": 1.8965005278587341, "epoch": 0.00058, "grad_norm": 0.19612440466880798, "kl": 0.3990703672170639, "learning_rate": 7.999999955208831e-06, "loss": -0.0901, "step": 58, "step_time": 4.325834547999875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.96875, "completions/mean_terminated_length": 5.277777671813965, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.378452181816101, "epoch": 0.00059, "frac_reward_zero_std": 0.0, "grad_norm": 0.17499127984046936, "kl": 0.31387595646083355, "learning_rate": 7.999999951044363e-06, "loss": -0.0789, "num_tokens": 1593388.0, "reward": -17.23110008239746, "reward_std": 13.483238220214844, "rewards/rollout_reward_func/mean": -17.23110008239746, "rewards/rollout_reward_func/std": 13.483237266540527, "sampling/importance_sampling_ratio/max": 1.614124059677124, "sampling/importance_sampling_ratio/mean": 0.4520293176174164, "sampling/importance_sampling_ratio/min": 1.5979654563125223e-05, "sampling/sampling_logp_difference/max": 1.9805740118026733, "sampling/sampling_logp_difference/mean": 0.36399880051612854, "step": 59, "step_time": 15.457802058002017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 2.387795388698578, "epoch": 0.0006, "grad_norm": 0.1648591309785843, "kl": 0.30828676000237465, "learning_rate": 7.999999946694808e-06, "loss": -0.0794, "step": 60, "step_time": 3.722013247000177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 5.176470756530762, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.433082163333893, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 0.12386750429868698, "kl": 0.382779598236084, "learning_rate": 7.999999942160165e-06, "loss": -0.0394, "num_tokens": 1647399.0, "reward": -17.269968032836914, "reward_std": 13.600323677062988, "rewards/rollout_reward_func/mean": -17.269968032836914, "rewards/rollout_reward_func/std": 13.600322723388672, "sampling/importance_sampling_ratio/max": 1.6075975894927979, "sampling/importance_sampling_ratio/mean": 0.25079697370529175, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.278927803039551, "sampling/sampling_logp_difference/mean": 0.38671162724494934, "step": 61, "step_time": 15.263149969000551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4407622814178467, "epoch": 0.00062, "grad_norm": 0.12103719264268875, "kl": 0.3563564345240593, "learning_rate": 7.999999937440435e-06, "loss": -0.0395, "step": 62, "step_time": 3.7838021770012347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 5.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7550713419914246, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 0.09021484851837158, "kl": 0.4475920870900154, "learning_rate": 7.999999932535616e-06, "loss": -0.0724, "num_tokens": 1700477.0, "reward": -14.257622718811035, "reward_std": 10.737204551696777, "rewards/rollout_reward_func/mean": -14.257622718811035, "rewards/rollout_reward_func/std": 10.737204551696777, "sampling/importance_sampling_ratio/max": 1.7209477424621582, "sampling/importance_sampling_ratio/mean": 0.4475940465927124, "sampling/importance_sampling_ratio/min": 1.4291852146186557e-09, "sampling/sampling_logp_difference/max": 3.103945255279541, "sampling/sampling_logp_difference/mean": 0.4909018278121948, "step": 63, "step_time": 17.94124886699956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.772922933101654, "epoch": 0.00064, "grad_norm": 0.09608107805252075, "kl": 0.4009213149547577, "learning_rate": 7.99999992744571e-06, "loss": -0.0727, "step": 64, "step_time": 3.8427052489987545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 3.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.379754215478897, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 0.10835270583629608, "kl": 0.2603175528347492, "learning_rate": 7.999999922170718e-06, "loss": -0.0382, "num_tokens": 1756034.0, "reward": -15.971830368041992, "reward_std": 13.320069313049316, "rewards/rollout_reward_func/mean": -15.971830368041992, "rewards/rollout_reward_func/std": 13.320068359375, "sampling/importance_sampling_ratio/max": 1.7064971923828125, "sampling/importance_sampling_ratio/mean": 0.4520533084869385, "sampling/importance_sampling_ratio/min": 2.9174557312217075e-06, "sampling/sampling_logp_difference/max": 2.2685322761535645, "sampling/sampling_logp_difference/mean": 0.35645103454589844, "step": 65, "step_time": 16.905106244998933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3868141770362854, "epoch": 0.00066, "grad_norm": 0.10785356163978577, "kl": 0.25116612389683723, "learning_rate": 7.999999916710638e-06, "loss": -0.0385, "step": 66, "step_time": 3.9364537680003195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 5.909090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1731958389282227, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 0.15863889455795288, "kl": 0.5337837189435959, "learning_rate": 7.99999991106547e-06, "loss": -0.0774, "num_tokens": 1808967.0, "reward": -16.11435890197754, "reward_std": 11.578441619873047, "rewards/rollout_reward_func/mean": -16.11435890197754, "rewards/rollout_reward_func/std": 11.578441619873047, "sampling/importance_sampling_ratio/max": 1.9132463932037354, "sampling/importance_sampling_ratio/mean": 0.552422046661377, "sampling/importance_sampling_ratio/min": 7.997507964319084e-06, "sampling/sampling_logp_difference/max": 2.0675995349884033, "sampling/sampling_logp_difference/mean": 0.3414417505264282, "step": 67, "step_time": 16.45315675100028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1796962916851044, "epoch": 0.00068, "grad_norm": 0.16032451391220093, "kl": 0.5108031183481216, "learning_rate": 7.999999905235214e-06, "loss": -0.0769, "step": 68, "step_time": 4.365772124000614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.46875, "completions/mean_terminated_length": 4.9375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5042994618415833, "epoch": 0.00069, "frac_reward_zero_std": 0.0, "grad_norm": 0.1341199278831482, "kl": 0.31201931834220886, "learning_rate": 7.999999899219872e-06, "loss": -0.0576, "num_tokens": 1862883.0, "reward": -13.885496139526367, "reward_std": 13.731287002563477, "rewards/rollout_reward_func/mean": -13.885496139526367, "rewards/rollout_reward_func/std": 13.73128604888916, "sampling/importance_sampling_ratio/max": 1.6955304145812988, "sampling/importance_sampling_ratio/mean": 0.28851842880249023, "sampling/importance_sampling_ratio/min": 0.0001419264153810218, "sampling/sampling_logp_difference/max": 1.7802906036376953, "sampling/sampling_logp_difference/mean": 0.3640894889831543, "step": 69, "step_time": 17.370235246001357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5076807141304016, "epoch": 0.0007, "grad_norm": 0.13722825050354004, "kl": 0.3210822604596615, "learning_rate": 7.999999893019442e-06, "loss": -0.0579, "step": 70, "step_time": 3.975507073000699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.84375, "completions/mean_terminated_length": 5.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.704359233379364, "epoch": 0.00071, "frac_reward_zero_std": 0.0, "grad_norm": 0.19662363827228546, "kl": 0.1317777018994093, "learning_rate": 7.999999886633925e-06, "loss": -0.0797, "num_tokens": 1917842.0, "reward": -16.158588409423828, "reward_std": 14.412886619567871, "rewards/rollout_reward_func/mean": -16.158588409423828, "rewards/rollout_reward_func/std": 14.412885665893555, "sampling/importance_sampling_ratio/max": 1.502885103225708, "sampling/importance_sampling_ratio/mean": 0.2917467951774597, "sampling/importance_sampling_ratio/min": 2.4606692932138685e-06, "sampling/sampling_logp_difference/max": 2.3619441986083984, "sampling/sampling_logp_difference/mean": 0.3998678922653198, "step": 71, "step_time": 16.13678017500024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7011520862579346, "epoch": 0.00072, "grad_norm": 0.18873122334480286, "kl": 0.14117254316806793, "learning_rate": 7.999999880063319e-06, "loss": -0.0804, "step": 72, "step_time": 4.012296630999117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2183931171894073, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.11125136911869049, "kl": 0.13156628236174583, "learning_rate": 7.999999873307627e-06, "loss": -0.0626, "num_tokens": 1971775.0, "reward": -12.964683532714844, "reward_std": 11.838773727416992, "rewards/rollout_reward_func/mean": -12.964683532714844, "rewards/rollout_reward_func/std": 11.838773727416992, "sampling/importance_sampling_ratio/max": 1.7075703144073486, "sampling/importance_sampling_ratio/mean": 0.6171098947525024, "sampling/importance_sampling_ratio/min": 1.910046375996899e-05, "sampling/sampling_logp_difference/max": 2.1132054328918457, "sampling/sampling_logp_difference/mean": 0.3204073905944824, "step": 73, "step_time": 17.679292820999763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.210416316986084, "epoch": 0.00074, "grad_norm": 0.10946302860975266, "kl": 0.14527074992656708, "learning_rate": 7.999999866366846e-06, "loss": -0.0627, "step": 74, "step_time": 3.8590269370006354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.875, "completions/mean_terminated_length": 6.352941036224365, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5368541479110718, "epoch": 0.00075, "frac_reward_zero_std": 0.0, "grad_norm": 0.1750744730234146, "kl": 0.31151563487946987, "learning_rate": 7.99999985924098e-06, "loss": -0.0815, "num_tokens": 2025629.0, "reward": -11.506864547729492, "reward_std": 15.047867774963379, "rewards/rollout_reward_func/mean": -11.506864547729492, "rewards/rollout_reward_func/std": 15.047867774963379, "sampling/importance_sampling_ratio/max": 1.7958234548568726, "sampling/importance_sampling_ratio/mean": 0.4085558354854584, "sampling/importance_sampling_ratio/min": 1.818686728993768e-10, "sampling/sampling_logp_difference/max": 2.7109293937683105, "sampling/sampling_logp_difference/mean": 0.3959559202194214, "step": 75, "step_time": 17.016520997000953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5261048078536987, "epoch": 0.00076, "grad_norm": 0.1723417490720749, "kl": 0.32033916004002094, "learning_rate": 7.999999851930024e-06, "loss": -0.0818, "step": 76, "step_time": 3.748379217000547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.875, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3834725618362427, "epoch": 0.00077, "frac_reward_zero_std": 0.0, "grad_norm": 0.19522102177143097, "kl": 0.31159199588000774, "learning_rate": 7.999999844433981e-06, "loss": -0.051, "num_tokens": 2082134.0, "reward": -15.549795150756836, "reward_std": 12.357343673706055, "rewards/rollout_reward_func/mean": -15.549795150756836, "rewards/rollout_reward_func/std": 12.357343673706055, "sampling/importance_sampling_ratio/max": 1.8710898160934448, "sampling/importance_sampling_ratio/mean": 0.43635016679763794, "sampling/importance_sampling_ratio/min": 5.469024344506579e-08, "sampling/sampling_logp_difference/max": 2.187922954559326, "sampling/sampling_logp_difference/mean": 0.36380988359451294, "step": 77, "step_time": 17.31962984799975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.381533920764923, "epoch": 0.00078, "grad_norm": 0.20654837787151337, "kl": 0.34653630293905735, "learning_rate": 7.99999983675285e-06, "loss": -0.0515, "step": 78, "step_time": 3.942640487000972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 5.65217399597168, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.199011206626892, "epoch": 0.00079, "frac_reward_zero_std": 0.0, "grad_norm": 0.17270159721374512, "kl": 0.31027599424123764, "learning_rate": 7.999999828886634e-06, "loss": -0.0507, "num_tokens": 2138945.0, "reward": -10.484298706054688, "reward_std": 13.501275062561035, "rewards/rollout_reward_func/mean": -10.484298706054688, "rewards/rollout_reward_func/std": 13.501275062561035, "sampling/importance_sampling_ratio/max": 1.755179762840271, "sampling/importance_sampling_ratio/mean": 0.6117238998413086, "sampling/importance_sampling_ratio/min": 1.1615011175081236e-08, "sampling/sampling_logp_difference/max": 2.111875057220459, "sampling/sampling_logp_difference/mean": 0.3664819300174713, "step": 79, "step_time": 18.590574371000002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1965456008911133, "epoch": 0.0008, "grad_norm": 0.18009911477565765, "kl": 0.3319133520126343, "learning_rate": 7.99999982083533e-06, "loss": -0.0513, "step": 80, "step_time": 3.9078741640014414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.090909004211426, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.030279517173767, "epoch": 0.00081, "frac_reward_zero_std": 0.0, "grad_norm": 0.13315314054489136, "kl": 0.46507609635591507, "learning_rate": 7.999999812598937e-06, "loss": -0.0843, "num_tokens": 2193678.0, "reward": -11.784393310546875, "reward_std": 11.269954681396484, "rewards/rollout_reward_func/mean": -11.784393310546875, "rewards/rollout_reward_func/std": 11.269953727722168, "sampling/importance_sampling_ratio/max": 1.8686487674713135, "sampling/importance_sampling_ratio/mean": 0.6036275625228882, "sampling/importance_sampling_ratio/min": 6.387705798260868e-05, "sampling/sampling_logp_difference/max": 2.2164251804351807, "sampling/sampling_logp_difference/mean": 0.3347189426422119, "step": 81, "step_time": 17.67407256899969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0290656089782715, "epoch": 0.00082, "grad_norm": 0.1341545134782791, "kl": 0.4619626756757498, "learning_rate": 7.999999804177458e-06, "loss": -0.0848, "step": 82, "step_time": 3.847223238000879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 4.38095235824585, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9526284635066986, "epoch": 0.00083, "frac_reward_zero_std": 0.0, "grad_norm": 0.17867720127105713, "kl": 0.32847855892032385, "learning_rate": 7.99999979557089e-06, "loss": -0.1031, "num_tokens": 2248398.0, "reward": -13.414186477661133, "reward_std": 14.369061470031738, "rewards/rollout_reward_func/mean": -13.414186477661133, "rewards/rollout_reward_func/std": 14.369061470031738, "sampling/importance_sampling_ratio/max": 1.7845985889434814, "sampling/importance_sampling_ratio/mean": 0.643210232257843, "sampling/importance_sampling_ratio/min": 0.00023519247770309448, "sampling/sampling_logp_difference/max": 1.944533109664917, "sampling/sampling_logp_difference/mean": 0.31070560216903687, "step": 83, "step_time": 17.449074100999496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9546249210834503, "epoch": 0.00084, "grad_norm": 0.17672060430049896, "kl": 0.33289087750017643, "learning_rate": 7.999999786779235e-06, "loss": -0.1038, "step": 84, "step_time": 3.9484871029990245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.46875, "completions/mean_terminated_length": 6.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.511549234390259, "epoch": 0.00085, "frac_reward_zero_std": 0.0, "grad_norm": 0.19033412635326385, "kl": 0.6604866236448288, "learning_rate": 7.999999777802493e-06, "loss": -0.052, "num_tokens": 2302080.0, "reward": -11.880884170532227, "reward_std": 13.290580749511719, "rewards/rollout_reward_func/mean": -11.880884170532227, "rewards/rollout_reward_func/std": 13.290580749511719, "sampling/importance_sampling_ratio/max": 1.3405799865722656, "sampling/importance_sampling_ratio/mean": 0.3203006386756897, "sampling/importance_sampling_ratio/min": 6.564528032271255e-09, "sampling/sampling_logp_difference/max": 2.3874359130859375, "sampling/sampling_logp_difference/mean": 0.45994022488594055, "step": 85, "step_time": 18.14367247500104 }, { "clip_ratio/high_max": 0.0456349216401577, "clip_ratio/high_mean": 0.011408730410039425, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011408730410039425, "entropy": 2.518314838409424, "epoch": 0.00086, "grad_norm": 0.19394153356552124, "kl": 0.6121902614831924, "learning_rate": 7.999999768640663e-06, "loss": -0.0526, "step": 86, "step_time": 3.8879147960005866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6763853430747986, "epoch": 0.00087, "frac_reward_zero_std": 0.0, "grad_norm": 0.12980525195598602, "kl": 0.5347051322460175, "learning_rate": 7.999999759293746e-06, "loss": -0.0516, "num_tokens": 2356813.0, "reward": -10.978804588317871, "reward_std": 12.673788070678711, "rewards/rollout_reward_func/mean": -10.978804588317871, "rewards/rollout_reward_func/std": 12.673788070678711, "sampling/importance_sampling_ratio/max": 1.733041524887085, "sampling/importance_sampling_ratio/mean": 0.7279175519943237, "sampling/importance_sampling_ratio/min": 3.9881647353468e-06, "sampling/sampling_logp_difference/max": 2.1248507499694824, "sampling/sampling_logp_difference/mean": 0.3236481547355652, "step": 87, "step_time": 17.445783205001135 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.6849164068698883, "epoch": 0.00088, "grad_norm": 0.1278933435678482, "kl": 0.4615030586719513, "learning_rate": 7.99999974976174e-06, "loss": -0.0519, "step": 88, "step_time": 3.8829523390004397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.782608985900879, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.247287154197693, "epoch": 0.00089, "frac_reward_zero_std": 0.0, "grad_norm": 0.19330966472625732, "kl": 0.25942667573690414, "learning_rate": 7.99999974004465e-06, "loss": -0.0933, "num_tokens": 2410222.0, "reward": -14.23149585723877, "reward_std": 12.280137062072754, "rewards/rollout_reward_func/mean": -14.23149585723877, "rewards/rollout_reward_func/std": 12.280137062072754, "sampling/importance_sampling_ratio/max": 2.4155802726745605, "sampling/importance_sampling_ratio/mean": 0.6226311922073364, "sampling/importance_sampling_ratio/min": 1.8856179906379111e-07, "sampling/sampling_logp_difference/max": 2.406726360321045, "sampling/sampling_logp_difference/mean": 0.3674609065055847, "step": 89, "step_time": 18.001383000000715 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.2435511350631714, "epoch": 0.0009, "grad_norm": 0.18288487195968628, "kl": 0.24931224063038826, "learning_rate": 7.99999973014247e-06, "loss": -0.0937, "step": 90, "step_time": 3.8572730580008283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.21875, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3415603637695312, "epoch": 0.00091, "frac_reward_zero_std": 0.0, "grad_norm": 0.15940554440021515, "kl": 0.17532595247030258, "learning_rate": 7.999999720055204e-06, "loss": -0.0804, "num_tokens": 2465136.0, "reward": -9.855671882629395, "reward_std": 13.327954292297363, "rewards/rollout_reward_func/mean": -9.855671882629395, "rewards/rollout_reward_func/std": 13.32795524597168, "sampling/importance_sampling_ratio/max": 1.8149421215057373, "sampling/importance_sampling_ratio/mean": 0.4823257327079773, "sampling/importance_sampling_ratio/min": 1.4065689413200744e-07, "sampling/sampling_logp_difference/max": 2.1048762798309326, "sampling/sampling_logp_difference/mean": 0.3534820079803467, "step": 91, "step_time": 17.23065138000038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3344454169273376, "epoch": 0.00092, "grad_norm": 0.16030289232730865, "kl": 0.1749311424791813, "learning_rate": 7.99999970978285e-06, "loss": -0.0804, "step": 92, "step_time": 3.7962840359996335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.4375, "completions/mean_terminated_length": 4.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4226681888103485, "epoch": 0.00093, "frac_reward_zero_std": 0.0, "grad_norm": 0.16215384006500244, "kl": 0.1679548118263483, "learning_rate": 7.999999699325408e-06, "loss": -0.0691, "num_tokens": 2518319.0, "reward": -14.618207931518555, "reward_std": 14.890968322753906, "rewards/rollout_reward_func/mean": -14.618207931518555, "rewards/rollout_reward_func/std": 14.89096736907959, "sampling/importance_sampling_ratio/max": 2.581852674484253, "sampling/importance_sampling_ratio/mean": 0.5841804146766663, "sampling/importance_sampling_ratio/min": 1.309545609728957e-07, "sampling/sampling_logp_difference/max": 2.069216251373291, "sampling/sampling_logp_difference/mean": 0.37615901231765747, "step": 93, "step_time": 16.15502323599958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.415958136320114, "epoch": 0.00094, "grad_norm": 0.15928374230861664, "kl": 0.16120341513305902, "learning_rate": 7.999999688682879e-06, "loss": -0.0693, "step": 94, "step_time": 4.380746477001594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.84375, "completions/mean_terminated_length": 5.590909481048584, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.249532103538513, "epoch": 0.00095, "frac_reward_zero_std": 0.0, "grad_norm": 0.20422203838825226, "kl": 0.19618945568799973, "learning_rate": 7.999999677855262e-06, "loss": -0.0335, "num_tokens": 2574519.0, "reward": -12.202113151550293, "reward_std": 12.481950759887695, "rewards/rollout_reward_func/mean": -12.202113151550293, "rewards/rollout_reward_func/std": 12.481949806213379, "sampling/importance_sampling_ratio/max": 1.7778980731964111, "sampling/importance_sampling_ratio/mean": 0.5207325220108032, "sampling/importance_sampling_ratio/min": 3.815670481799316e-07, "sampling/sampling_logp_difference/max": 2.1158597469329834, "sampling/sampling_logp_difference/mean": 0.3664204776287079, "step": 95, "step_time": 17.884947187000762 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 2.252625048160553, "epoch": 0.00096, "grad_norm": 0.18260903656482697, "kl": 0.20388933457434177, "learning_rate": 7.999999666842558e-06, "loss": -0.0342, "step": 96, "step_time": 4.354894248000164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.625, "completions/mean_terminated_length": 4.666666507720947, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.422206461429596, "epoch": 0.00097, "frac_reward_zero_std": 0.0, "grad_norm": 0.1517009437084198, "kl": 0.23826614394783974, "learning_rate": 7.999999655644765e-06, "loss": -0.0639, "num_tokens": 2630373.0, "reward": -15.756963729858398, "reward_std": 13.262795448303223, "rewards/rollout_reward_func/mean": -15.756963729858398, "rewards/rollout_reward_func/std": 13.262795448303223, "sampling/importance_sampling_ratio/max": 1.8024792671203613, "sampling/importance_sampling_ratio/mean": 0.4569413959980011, "sampling/importance_sampling_ratio/min": 5.0067726988345385e-05, "sampling/sampling_logp_difference/max": 1.8050687313079834, "sampling/sampling_logp_difference/mean": 0.3587415814399719, "step": 97, "step_time": 17.231286643000203 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 2.4185343384742737, "epoch": 0.00098, "grad_norm": 0.12224368751049042, "kl": 0.23479103296995163, "learning_rate": 7.999999644261886e-06, "loss": -0.0647, "step": 98, "step_time": 3.972425010000734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 5.079999923706055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.626591831445694, "epoch": 0.00099, "frac_reward_zero_std": 0.0, "grad_norm": 0.24397920072078705, "kl": 0.4933258295059204, "learning_rate": 7.99999963269392e-06, "loss": -0.0412, "num_tokens": 2688451.0, "reward": -9.039474487304688, "reward_std": 11.927276611328125, "rewards/rollout_reward_func/mean": -9.039474487304688, "rewards/rollout_reward_func/std": 11.927276611328125, "sampling/importance_sampling_ratio/max": 1.8795239925384521, "sampling/importance_sampling_ratio/mean": 0.6348873376846313, "sampling/importance_sampling_ratio/min": 0.0006322892149910331, "sampling/sampling_logp_difference/max": 1.8396663665771484, "sampling/sampling_logp_difference/mean": 0.28130608797073364, "step": 99, "step_time": 18.72062262200052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6342822015285492, "epoch": 0.001, "grad_norm": 0.2613573968410492, "kl": 0.4713650569319725, "learning_rate": 7.999999620940867e-06, "loss": -0.0428, "step": 100, "step_time": 4.030729011001313 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 6.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.255509912967682, "epoch": 0.00101, "frac_reward_zero_std": 0.0, "grad_norm": 0.09797148406505585, "kl": 0.37552202492952347, "learning_rate": 7.999999609002725e-06, "loss": -0.0601, "num_tokens": 2739807.0, "reward": -11.917654991149902, "reward_std": 15.342547416687012, "rewards/rollout_reward_func/mean": -11.917654991149902, "rewards/rollout_reward_func/std": 15.342547416687012, "sampling/importance_sampling_ratio/max": 1.9397064447402954, "sampling/importance_sampling_ratio/mean": 0.4642309248447418, "sampling/importance_sampling_ratio/min": 2.5285717129008844e-05, "sampling/sampling_logp_difference/max": 2.0285515785217285, "sampling/sampling_logp_difference/mean": 0.39951184391975403, "step": 101, "step_time": 17.561035403999085 }, { "clip_ratio/high_max": 0.05208333395421505, "clip_ratio/high_mean": 0.013020833488553762, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 2.2617304623126984, "epoch": 0.00102, "grad_norm": 0.0905458927154541, "kl": 0.32919252663850784, "learning_rate": 7.999999596879495e-06, "loss": -0.0604, "step": 102, "step_time": 3.5986510100001396 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 4.730769634246826, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5397303998470306, "epoch": 0.00103, "frac_reward_zero_std": 0.0, "grad_norm": 0.10002883523702621, "kl": 0.2266007773578167, "learning_rate": 7.99999958457118e-06, "loss": -0.0566, "num_tokens": 2796027.0, "reward": -8.462937355041504, "reward_std": 12.543950080871582, "rewards/rollout_reward_func/mean": -8.462937355041504, "rewards/rollout_reward_func/std": 12.543950080871582, "sampling/importance_sampling_ratio/max": 1.809809923171997, "sampling/importance_sampling_ratio/mean": 0.6285245418548584, "sampling/importance_sampling_ratio/min": 0.0004429263935890049, "sampling/sampling_logp_difference/max": 1.7707011699676514, "sampling/sampling_logp_difference/mean": 0.26612767577171326, "step": 103, "step_time": 18.406355995000013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5431667268276215, "epoch": 0.00104, "grad_norm": 0.10193097591400146, "kl": 0.2164880335330963, "learning_rate": 7.999999572077776e-06, "loss": -0.0564, "step": 104, "step_time": 3.983802058999572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 4.450000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.090934157371521, "epoch": 0.00105, "frac_reward_zero_std": 0.0, "grad_norm": 0.2242397665977478, "kl": 0.1422728020697832, "learning_rate": 7.999999559399285e-06, "loss": -0.0976, "num_tokens": 2850733.0, "reward": -14.096229553222656, "reward_std": 12.312780380249023, "rewards/rollout_reward_func/mean": -14.096229553222656, "rewards/rollout_reward_func/std": 12.31278133392334, "sampling/importance_sampling_ratio/max": 1.754956603050232, "sampling/importance_sampling_ratio/mean": 0.6469407081604004, "sampling/importance_sampling_ratio/min": 2.0509116183120568e-08, "sampling/sampling_logp_difference/max": 2.1255948543548584, "sampling/sampling_logp_difference/mean": 0.34918224811553955, "step": 105, "step_time": 16.946704742999827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0850610733032227, "epoch": 0.00106, "grad_norm": 0.2554610073566437, "kl": 0.14228078722953796, "learning_rate": 7.999999546535705e-06, "loss": -0.0985, "step": 106, "step_time": 3.903583842000444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.125, "completions/mean_terminated_length": 5.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3368077278137207, "epoch": 0.00107, "frac_reward_zero_std": 0.0, "grad_norm": 0.1585676670074463, "kl": 0.139299426227808, "learning_rate": 7.99999953348704e-06, "loss": -0.0576, "num_tokens": 2904708.0, "reward": -14.390296936035156, "reward_std": 13.991205215454102, "rewards/rollout_reward_func/mean": -14.390296936035156, "rewards/rollout_reward_func/std": 13.991205215454102, "sampling/importance_sampling_ratio/max": 2.0207252502441406, "sampling/importance_sampling_ratio/mean": 0.5079885721206665, "sampling/importance_sampling_ratio/min": 3.8972223137534456e-07, "sampling/sampling_logp_difference/max": 1.953859806060791, "sampling/sampling_logp_difference/mean": 0.3582305908203125, "step": 107, "step_time": 17.065207262000513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3326573967933655, "epoch": 0.00108, "grad_norm": 0.16174548864364624, "kl": 0.1380956508219242, "learning_rate": 7.999999520253286e-06, "loss": -0.058, "step": 108, "step_time": 3.919972986997891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.84375, "completions/mean_terminated_length": 6.61904764175415, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4273093342781067, "epoch": 0.00109, "frac_reward_zero_std": 0.0, "grad_norm": 0.1942053735256195, "kl": 0.1486597415059805, "learning_rate": 7.999999506834446e-06, "loss": -0.0638, "num_tokens": 2959107.0, "reward": -15.394391059875488, "reward_std": 15.676250457763672, "rewards/rollout_reward_func/mean": -15.394391059875488, "rewards/rollout_reward_func/std": 15.676250457763672, "sampling/importance_sampling_ratio/max": 1.7307286262512207, "sampling/importance_sampling_ratio/mean": 0.4652015268802643, "sampling/importance_sampling_ratio/min": 8.355043974006549e-06, "sampling/sampling_logp_difference/max": 2.305194139480591, "sampling/sampling_logp_difference/mean": 0.37129175662994385, "step": 109, "step_time": 17.101558804001797 }, { "clip_ratio/high_max": 0.01923076994717121, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 2.4139832258224487, "epoch": 0.0011, "grad_norm": 0.1761111617088318, "kl": 0.15364351961761713, "learning_rate": 7.999999493230518e-06, "loss": -0.0638, "step": 110, "step_time": 3.962792625000475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.96875, "completions/mean_terminated_length": 5.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5127357840538025, "epoch": 0.00111, "frac_reward_zero_std": 0.0, "grad_norm": 0.29044249653816223, "kl": 0.15417676605284214, "learning_rate": 7.9999994794415e-06, "loss": -0.0711, "num_tokens": 3012189.0, "reward": -14.177255630493164, "reward_std": 15.213272094726562, "rewards/rollout_reward_func/mean": -14.177255630493164, "rewards/rollout_reward_func/std": 15.213272094726562, "sampling/importance_sampling_ratio/max": 2.71513032913208, "sampling/importance_sampling_ratio/mean": 0.33143186569213867, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3559942245483398, "sampling/sampling_logp_difference/mean": 0.3534078896045685, "step": 111, "step_time": 16.627831042998878 }, { "clip_ratio/high_max": 0.040178571827709675, "clip_ratio/high_mean": 0.010044642956927419, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010044642956927419, "entropy": 2.497465133666992, "epoch": 0.00112, "grad_norm": 0.15500706434249878, "kl": 0.16152178589254618, "learning_rate": 7.999999465467398e-06, "loss": -0.0722, "step": 112, "step_time": 3.801304140999491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 4.7727274894714355, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6913675665855408, "epoch": 0.00113, "frac_reward_zero_std": 0.0, "grad_norm": 0.15884366631507874, "kl": 0.30313733220100403, "learning_rate": 7.999999451308208e-06, "loss": -0.0735, "num_tokens": 3067264.0, "reward": -14.963839530944824, "reward_std": 14.617509841918945, "rewards/rollout_reward_func/mean": -14.963839530944824, "rewards/rollout_reward_func/std": 14.617510795593262, "sampling/importance_sampling_ratio/max": 2.1521050930023193, "sampling/importance_sampling_ratio/mean": 0.5878071188926697, "sampling/importance_sampling_ratio/min": 0.0015108644729480147, "sampling/sampling_logp_difference/max": 1.8143436908721924, "sampling/sampling_logp_difference/mean": 0.29392382502555847, "step": 113, "step_time": 17.380109040001116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6796797811985016, "epoch": 0.00114, "grad_norm": 0.1635114997625351, "kl": 0.31825270503759384, "learning_rate": 7.99999943696393e-06, "loss": -0.0739, "step": 114, "step_time": 4.39201037300063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.46875, "completions/mean_terminated_length": 6.047619342803955, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.257766842842102, "epoch": 0.00115, "frac_reward_zero_std": 0.0, "grad_norm": 0.2490290105342865, "kl": 0.7180089578032494, "learning_rate": 7.999999422434564e-06, "loss": -0.0361, "num_tokens": 3119894.0, "reward": -16.34988021850586, "reward_std": 13.16049861907959, "rewards/rollout_reward_func/mean": -16.34988021850586, "rewards/rollout_reward_func/std": 13.160499572753906, "sampling/importance_sampling_ratio/max": 1.8871204853057861, "sampling/importance_sampling_ratio/mean": 0.3680192232131958, "sampling/importance_sampling_ratio/min": 1.7118882169597782e-05, "sampling/sampling_logp_difference/max": 2.1067299842834473, "sampling/sampling_logp_difference/mean": 0.343960165977478, "step": 115, "step_time": 16.77088873700086 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.253935754299164, "epoch": 0.00116, "grad_norm": 0.21995894610881805, "kl": 0.7159292958676815, "learning_rate": 7.99999940772011e-06, "loss": -0.0372, "step": 116, "step_time": 4.362302176999947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.28125, "completions/mean_terminated_length": 5.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4287390410900116, "epoch": 0.00117, "frac_reward_zero_std": 0.0, "grad_norm": 0.21433348953723907, "kl": 0.19245577231049538, "learning_rate": 7.99999939282057e-06, "loss": -0.0591, "num_tokens": 3174160.0, "reward": -12.751977920532227, "reward_std": 12.405231475830078, "rewards/rollout_reward_func/mean": -12.751977920532227, "rewards/rollout_reward_func/std": 12.405231475830078, "sampling/importance_sampling_ratio/max": 1.9564330577850342, "sampling/importance_sampling_ratio/mean": 0.4801850914955139, "sampling/importance_sampling_ratio/min": 4.3652960357576376e-08, "sampling/sampling_logp_difference/max": 1.948685884475708, "sampling/sampling_logp_difference/mean": 0.3683035969734192, "step": 117, "step_time": 17.629074146000676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.431937485933304, "epoch": 0.00118, "grad_norm": 0.2163916379213333, "kl": 0.18722756952047348, "learning_rate": 7.999999377735942e-06, "loss": -0.0594, "step": 118, "step_time": 3.8727677299993957 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 5.653846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8224510252475739, "epoch": 0.00119, "frac_reward_zero_std": 0.0, "grad_norm": 0.19748830795288086, "kl": 0.32728492841124535, "learning_rate": 7.999999362466227e-06, "loss": -0.0403, "num_tokens": 3228118.0, "reward": -8.620935440063477, "reward_std": 11.81003475189209, "rewards/rollout_reward_func/mean": -8.620935440063477, "rewards/rollout_reward_func/std": 11.810032844543457, "sampling/importance_sampling_ratio/max": 1.723278284072876, "sampling/importance_sampling_ratio/mean": 0.6870340704917908, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.018822193145752, "sampling/sampling_logp_difference/mean": 0.27822041511535645, "step": 119, "step_time": 18.02906806000101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8196918070316315, "epoch": 0.0012, "grad_norm": 0.2079036831855774, "kl": 0.33383315429091454, "learning_rate": 7.999999347011425e-06, "loss": -0.0411, "step": 120, "step_time": 3.7269393289998334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 9.5625, "completions/mean_terminated_length": 3.8823530673980713, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.331916868686676, "epoch": 0.00121, "frac_reward_zero_std": 0.0, "grad_norm": 0.19351336359977722, "kl": 0.19814911484718323, "learning_rate": 7.999999331371534e-06, "loss": -0.0808, "num_tokens": 3282730.0, "reward": -13.055758476257324, "reward_std": 14.445548057556152, "rewards/rollout_reward_func/mean": -13.055758476257324, "rewards/rollout_reward_func/std": 14.445548057556152, "sampling/importance_sampling_ratio/max": 2.632713556289673, "sampling/importance_sampling_ratio/mean": 0.4473937749862671, "sampling/importance_sampling_ratio/min": 3.104937377429451e-06, "sampling/sampling_logp_difference/max": 2.0077202320098877, "sampling/sampling_logp_difference/mean": 0.3796681761741638, "step": 121, "step_time": 16.572294898999644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.326456129550934, "epoch": 0.00122, "grad_norm": 0.19353602826595306, "kl": 0.19753051735460758, "learning_rate": 7.999999315546556e-06, "loss": -0.081, "step": 122, "step_time": 3.9433432719997654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 4.352941036224365, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2000025659799576, "epoch": 0.00123, "frac_reward_zero_std": 0.0, "grad_norm": 0.1553424596786499, "kl": 0.14056373573839664, "learning_rate": 7.999999299536492e-06, "loss": -0.0979, "num_tokens": 3337725.0, "reward": -13.25818920135498, "reward_std": 13.001375198364258, "rewards/rollout_reward_func/mean": -13.25818920135498, "rewards/rollout_reward_func/std": 13.001375198364258, "sampling/importance_sampling_ratio/max": 1.7498034238815308, "sampling/importance_sampling_ratio/mean": 0.5303956270217896, "sampling/importance_sampling_ratio/min": 7.444007223966764e-07, "sampling/sampling_logp_difference/max": 2.322181224822998, "sampling/sampling_logp_difference/mean": 0.33692312240600586, "step": 123, "step_time": 17.27749253300044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1995555013418198, "epoch": 0.00124, "grad_norm": 0.15592116117477417, "kl": 0.14079230464994907, "learning_rate": 7.999999283341339e-06, "loss": -0.0979, "step": 124, "step_time": 4.8605795160001435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.15625, "completions/mean_terminated_length": 5.611111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.169024884700775, "epoch": 0.00125, "frac_reward_zero_std": 0.0, "grad_norm": 0.3734012246131897, "kl": 0.34247829020023346, "learning_rate": 7.999999266961099e-06, "loss": -0.0669, "num_tokens": 3391072.0, "reward": -14.423711776733398, "reward_std": 13.864624977111816, "rewards/rollout_reward_func/mean": -14.423711776733398, "rewards/rollout_reward_func/std": 13.864624977111816, "sampling/importance_sampling_ratio/max": 1.745375156402588, "sampling/importance_sampling_ratio/mean": 0.3903349041938782, "sampling/importance_sampling_ratio/min": 9.635182323108893e-07, "sampling/sampling_logp_difference/max": 2.479546070098877, "sampling/sampling_logp_difference/mean": 0.3675075173377991, "step": 125, "step_time": 17.79798304400083 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 2.167737305164337, "epoch": 0.00126, "grad_norm": 0.21211105585098267, "kl": 0.3429744876921177, "learning_rate": 7.999999250395772e-06, "loss": -0.0681, "step": 126, "step_time": 4.284180971000751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1142513751983643, "epoch": 0.00127, "frac_reward_zero_std": 0.0, "grad_norm": 0.16093088686466217, "kl": 0.44958314299583435, "learning_rate": 7.999999233645358e-06, "loss": -0.0874, "num_tokens": 3445057.0, "reward": -13.798921585083008, "reward_std": 13.21139144897461, "rewards/rollout_reward_func/mean": -13.798921585083008, "rewards/rollout_reward_func/std": 13.21139144897461, "sampling/importance_sampling_ratio/max": 1.526474952697754, "sampling/importance_sampling_ratio/mean": 0.5948379039764404, "sampling/importance_sampling_ratio/min": 3.6784692838409683e-06, "sampling/sampling_logp_difference/max": 2.10664963722229, "sampling/sampling_logp_difference/mean": 0.32167869806289673, "step": 127, "step_time": 16.26557365499957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.114555597305298, "epoch": 0.00128, "grad_norm": 0.15952369570732117, "kl": 0.42794549092650414, "learning_rate": 7.999999216709854e-06, "loss": -0.0873, "step": 128, "step_time": 3.994595754999864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.71875, "completions/mean_terminated_length": 5.950000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3199170231819153, "epoch": 0.00129, "frac_reward_zero_std": 0.0, "grad_norm": 0.19072632491588593, "kl": 0.1472384948283434, "learning_rate": 7.999999199589265e-06, "loss": -0.0702, "num_tokens": 3497654.0, "reward": -14.459236145019531, "reward_std": 16.05208396911621, "rewards/rollout_reward_func/mean": -14.459236145019531, "rewards/rollout_reward_func/std": 16.052082061767578, "sampling/importance_sampling_ratio/max": 1.8563324213027954, "sampling/importance_sampling_ratio/mean": 0.4823446273803711, "sampling/importance_sampling_ratio/min": 0.00014365543029271066, "sampling/sampling_logp_difference/max": 1.9336071014404297, "sampling/sampling_logp_difference/mean": 0.312381386756897, "step": 129, "step_time": 16.940661580999404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3198951482772827, "epoch": 0.0013, "grad_norm": 0.19747306406497955, "kl": 0.15347701776772738, "learning_rate": 7.999999182283588e-06, "loss": -0.0714, "step": 130, "step_time": 3.913357768999049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 4.650000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2088379859924316, "epoch": 0.00131, "frac_reward_zero_std": 0.0, "grad_norm": 0.24974103271961212, "kl": 0.15223483741283417, "learning_rate": 7.999999164792823e-06, "loss": -0.0963, "num_tokens": 3551747.0, "reward": -12.887598037719727, "reward_std": 14.826447486877441, "rewards/rollout_reward_func/mean": -12.887598037719727, "rewards/rollout_reward_func/std": 14.826446533203125, "sampling/importance_sampling_ratio/max": 1.7164642810821533, "sampling/importance_sampling_ratio/mean": 0.5934466123580933, "sampling/importance_sampling_ratio/min": 9.983389190892922e-07, "sampling/sampling_logp_difference/max": 1.9111764430999756, "sampling/sampling_logp_difference/mean": 0.34947288036346436, "step": 131, "step_time": 18.378533543999765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.206984668970108, "epoch": 0.00132, "grad_norm": 0.2575746774673462, "kl": 0.1576593853533268, "learning_rate": 7.999999147116972e-06, "loss": -0.0974, "step": 132, "step_time": 3.949292685999353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.96875, "completions/mean_terminated_length": 4.82608699798584, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.047189474105835, "epoch": 0.00133, "frac_reward_zero_std": 0.0, "grad_norm": 0.19629521667957306, "kl": 0.16287460923194885, "learning_rate": 7.999999129256032e-06, "loss": -0.0918, "num_tokens": 3605154.0, "reward": -14.940670013427734, "reward_std": 15.202176094055176, "rewards/rollout_reward_func/mean": -14.940670013427734, "rewards/rollout_reward_func/std": 15.202176094055176, "sampling/importance_sampling_ratio/max": 1.793296456336975, "sampling/importance_sampling_ratio/mean": 0.6387282609939575, "sampling/importance_sampling_ratio/min": 7.713686500210315e-06, "sampling/sampling_logp_difference/max": 1.9775452613830566, "sampling/sampling_logp_difference/mean": 0.31392231583595276, "step": 133, "step_time": 17.2386745419999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.03654208779335, "epoch": 0.00134, "grad_norm": 0.2018130123615265, "kl": 0.1662757657468319, "learning_rate": 7.999999111210005e-06, "loss": -0.0929, "step": 134, "step_time": 5.141195217001041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0783639401197433, "epoch": 0.00135, "frac_reward_zero_std": 0.0, "grad_norm": 0.28810447454452515, "kl": 0.35645222663879395, "learning_rate": 7.999999092978893e-06, "loss": -0.0587, "num_tokens": 3660160.0, "reward": -11.354267120361328, "reward_std": 11.96794319152832, "rewards/rollout_reward_func/mean": -11.354267120361328, "rewards/rollout_reward_func/std": 11.967944145202637, "sampling/importance_sampling_ratio/max": 1.65242338180542, "sampling/importance_sampling_ratio/mean": 0.9433579444885254, "sampling/importance_sampling_ratio/min": 0.001182172680273652, "sampling/sampling_logp_difference/max": 1.262531042098999, "sampling/sampling_logp_difference/mean": 0.20646844804286957, "step": 135, "step_time": 18.34364368599836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0591329634189606, "epoch": 0.00136, "grad_norm": 0.30133089423179626, "kl": 0.39398249238729477, "learning_rate": 7.999999074562691e-06, "loss": -0.0604, "step": 136, "step_time": 4.393148312000449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 4.208333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6967704892158508, "epoch": 0.00137, "frac_reward_zero_std": 0.0, "grad_norm": 0.1882079839706421, "kl": 0.30807357281446457, "learning_rate": 7.999999055961402e-06, "loss": -0.0284, "num_tokens": 3714586.0, "reward": -10.878229141235352, "reward_std": 10.182136535644531, "rewards/rollout_reward_func/mean": -10.878229141235352, "rewards/rollout_reward_func/std": 10.182136535644531, "sampling/importance_sampling_ratio/max": 1.750657558441162, "sampling/importance_sampling_ratio/mean": 0.7130255699157715, "sampling/importance_sampling_ratio/min": 0.00011815351899713278, "sampling/sampling_logp_difference/max": 1.5353028774261475, "sampling/sampling_logp_difference/mean": 0.3114352822303772, "step": 137, "step_time": 16.519907348999368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.6849656850099564, "epoch": 0.00138, "grad_norm": 0.1932072639465332, "kl": 0.32724323868751526, "learning_rate": 7.999999037175024e-06, "loss": -0.028, "step": 138, "step_time": 3.983022218998485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8886836171150208, "epoch": 0.00139, "frac_reward_zero_std": 0.0, "grad_norm": 0.15388597548007965, "kl": 0.49410954490303993, "learning_rate": 7.999999018203562e-06, "loss": -0.07, "num_tokens": 3768890.0, "reward": -9.989204406738281, "reward_std": 16.435895919799805, "rewards/rollout_reward_func/mean": -9.989204406738281, "rewards/rollout_reward_func/std": 16.435895919799805, "sampling/importance_sampling_ratio/max": 1.6989898681640625, "sampling/importance_sampling_ratio/mean": 0.5757548809051514, "sampling/importance_sampling_ratio/min": 1.1753714716178365e-05, "sampling/sampling_logp_difference/max": 1.8855177164077759, "sampling/sampling_logp_difference/mean": 0.35318872332572937, "step": 139, "step_time": 16.849343687999863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8802864253520966, "epoch": 0.0014, "grad_norm": 0.1569357067346573, "kl": 0.48264528438448906, "learning_rate": 7.99999899904701e-06, "loss": -0.0701, "step": 140, "step_time": 3.869698566999432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3938067555427551, "epoch": 0.00141, "frac_reward_zero_std": 0.0, "grad_norm": 0.22312870621681213, "kl": 0.48523272573947906, "learning_rate": 7.99999897970537e-06, "loss": -0.0613, "num_tokens": 3822507.0, "reward": -8.933938980102539, "reward_std": 15.994019508361816, "rewards/rollout_reward_func/mean": -8.933938980102539, "rewards/rollout_reward_func/std": 15.994020462036133, "sampling/importance_sampling_ratio/max": 2.1881649494171143, "sampling/importance_sampling_ratio/mean": 0.8062597513198853, "sampling/importance_sampling_ratio/min": 0.00012270870502106845, "sampling/sampling_logp_difference/max": 1.8308358192443848, "sampling/sampling_logp_difference/mean": 0.30373409390449524, "step": 141, "step_time": 18.849196494999887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3866601586341858, "epoch": 0.00142, "grad_norm": 0.22305528819561005, "kl": 0.5025356411933899, "learning_rate": 7.999998960178645e-06, "loss": -0.0617, "step": 142, "step_time": 3.8698511140000846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 5.119999885559082, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.687413215637207, "epoch": 0.00143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1487019956111908, "kl": 0.3045383282005787, "learning_rate": 7.999998940466832e-06, "loss": -0.0812, "num_tokens": 3874547.0, "reward": -9.996606826782227, "reward_std": 13.055642127990723, "rewards/rollout_reward_func/mean": -9.996606826782227, "rewards/rollout_reward_func/std": 13.055642127990723, "sampling/importance_sampling_ratio/max": 2.0816428661346436, "sampling/importance_sampling_ratio/mean": 0.7184334397315979, "sampling/importance_sampling_ratio/min": 0.0002469506289344281, "sampling/sampling_logp_difference/max": 2.3434011936187744, "sampling/sampling_logp_difference/mean": 0.2939603924751282, "step": 143, "step_time": 17.95529502500085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6915176808834076, "epoch": 0.00144, "grad_norm": 0.14668838679790497, "kl": 0.2959621697664261, "learning_rate": 7.999998920569931e-06, "loss": -0.0817, "step": 144, "step_time": 4.756567324999196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.34615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.633510947227478, "epoch": 0.00145, "frac_reward_zero_std": 0.0, "grad_norm": 0.2417609989643097, "kl": 0.4079745076596737, "learning_rate": 7.999998900487944e-06, "loss": -0.0608, "num_tokens": 3929345.0, "reward": -10.732439041137695, "reward_std": 13.979938507080078, "rewards/rollout_reward_func/mean": -10.732439041137695, "rewards/rollout_reward_func/std": 13.979938507080078, "sampling/importance_sampling_ratio/max": 2.0555548667907715, "sampling/importance_sampling_ratio/mean": 0.8213640451431274, "sampling/importance_sampling_ratio/min": 0.00043251458555459976, "sampling/sampling_logp_difference/max": 2.0537118911743164, "sampling/sampling_logp_difference/mean": 0.30844593048095703, "step": 145, "step_time": 16.829515376999552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6366823315620422, "epoch": 0.00146, "grad_norm": 0.24146048724651337, "kl": 0.43085963279008865, "learning_rate": 7.999998880220868e-06, "loss": -0.0619, "step": 146, "step_time": 4.33593830800055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.868195116519928, "epoch": 0.00147, "frac_reward_zero_std": 0.0, "grad_norm": 0.19449394941329956, "kl": 1.0403657630085945, "learning_rate": 7.999998859768705e-06, "loss": -0.1095, "num_tokens": 3983970.0, "reward": -10.43870735168457, "reward_std": 13.17192268371582, "rewards/rollout_reward_func/mean": -10.43870735168457, "rewards/rollout_reward_func/std": 13.17192268371582, "sampling/importance_sampling_ratio/max": 1.8008885383605957, "sampling/importance_sampling_ratio/mean": 0.690872073173523, "sampling/importance_sampling_ratio/min": 7.352130523941014e-06, "sampling/sampling_logp_difference/max": 2.4113898277282715, "sampling/sampling_logp_difference/mean": 0.35631847381591797, "step": 147, "step_time": 18.674088943998868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8719338774681091, "epoch": 0.00148, "grad_norm": 0.18691851198673248, "kl": 0.9619548469781876, "learning_rate": 7.999998839131454e-06, "loss": -0.1098, "step": 148, "step_time": 3.8689029930010292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.239999771118164, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6032952070236206, "epoch": 0.00149, "frac_reward_zero_std": 0.0, "grad_norm": 0.14864298701286316, "kl": 0.32866325974464417, "learning_rate": 7.999998818309116e-06, "loss": -0.1131, "num_tokens": 4040007.0, "reward": -8.887664794921875, "reward_std": 15.682639122009277, "rewards/rollout_reward_func/mean": -8.887664794921875, "rewards/rollout_reward_func/std": 15.682639122009277, "sampling/importance_sampling_ratio/max": 2.5515780448913574, "sampling/importance_sampling_ratio/mean": 0.7709916830062866, "sampling/importance_sampling_ratio/min": 0.0005490843905135989, "sampling/sampling_logp_difference/max": 1.887569785118103, "sampling/sampling_logp_difference/mean": 0.30962276458740234, "step": 149, "step_time": 17.30719570800011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6081042438745499, "epoch": 0.0015, "grad_norm": 0.14045801758766174, "kl": 0.3462950363755226, "learning_rate": 7.99999879730169e-06, "loss": -0.1134, "step": 150, "step_time": 3.85738850800044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 6.761904716491699, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.06572088599205, "epoch": 0.00151, "frac_reward_zero_std": 0.0, "grad_norm": 0.1559440940618515, "kl": 0.31545647978782654, "learning_rate": 7.999998776109179e-06, "loss": -0.0837, "num_tokens": 4093618.0, "reward": -11.188453674316406, "reward_std": 16.41277503967285, "rewards/rollout_reward_func/mean": -11.188453674316406, "rewards/rollout_reward_func/std": 16.41277503967285, "sampling/importance_sampling_ratio/max": 2.0832417011260986, "sampling/importance_sampling_ratio/mean": 0.4934885501861572, "sampling/importance_sampling_ratio/min": 7.131351594580337e-05, "sampling/sampling_logp_difference/max": 1.8440673351287842, "sampling/sampling_logp_difference/mean": 0.3231852948665619, "step": 151, "step_time": 16.69929774999855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0701276659965515, "epoch": 0.00152, "grad_norm": 0.15733514726161957, "kl": 0.3268819563090801, "learning_rate": 7.999998754731578e-06, "loss": -0.0837, "step": 152, "step_time": 3.943474671998956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6787732243537903, "epoch": 0.00153, "frac_reward_zero_std": 0.0, "grad_norm": 0.18446031212806702, "kl": 0.4987460821866989, "learning_rate": 7.99999873316889e-06, "loss": -0.0785, "num_tokens": 4148589.0, "reward": -10.091891288757324, "reward_std": 12.097522735595703, "rewards/rollout_reward_func/mean": -10.091891288757324, "rewards/rollout_reward_func/std": 12.097522735595703, "sampling/importance_sampling_ratio/max": 2.14483642578125, "sampling/importance_sampling_ratio/mean": 0.6681872010231018, "sampling/importance_sampling_ratio/min": 0.00020938250236213207, "sampling/sampling_logp_difference/max": 1.620222806930542, "sampling/sampling_logp_difference/mean": 0.32973551750183105, "step": 153, "step_time": 18.399413101999926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6819849982857704, "epoch": 0.00154, "grad_norm": 0.18620085716247559, "kl": 0.48833972215652466, "learning_rate": 7.999998711421117e-06, "loss": -0.0791, "step": 154, "step_time": 4.364322221999828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.782608985900879, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6303704231977463, "epoch": 0.00155, "frac_reward_zero_std": 0.0, "grad_norm": 0.1306331753730774, "kl": 0.31623901054263115, "learning_rate": 7.999998689488254e-06, "loss": -0.109, "num_tokens": 4202572.0, "reward": -8.886487007141113, "reward_std": 15.716998100280762, "rewards/rollout_reward_func/mean": -8.886487007141113, "rewards/rollout_reward_func/std": 15.716998100280762, "sampling/importance_sampling_ratio/max": 2.090451717376709, "sampling/importance_sampling_ratio/mean": 0.8083341717720032, "sampling/importance_sampling_ratio/min": 1.2844536989575772e-08, "sampling/sampling_logp_difference/max": 2.296675443649292, "sampling/sampling_logp_difference/mean": 0.3483703136444092, "step": 155, "step_time": 18.65726999799881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.626208946108818, "epoch": 0.00156, "grad_norm": 0.12759527564048767, "kl": 0.32484784722328186, "learning_rate": 7.999998667370304e-06, "loss": -0.109, "step": 156, "step_time": 3.862805773999753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.192307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3164727091789246, "epoch": 0.00157, "frac_reward_zero_std": 0.0, "grad_norm": 0.23874951899051666, "kl": 0.4714328870177269, "learning_rate": 7.999998645067266e-06, "loss": -0.0137, "num_tokens": 4255252.0, "reward": -7.912813186645508, "reward_std": 13.325704574584961, "rewards/rollout_reward_func/mean": -7.912813186645508, "rewards/rollout_reward_func/std": 13.325705528259277, "sampling/importance_sampling_ratio/max": 1.982794165611267, "sampling/importance_sampling_ratio/mean": 0.7808276414871216, "sampling/importance_sampling_ratio/min": 1.0276262063513286e-09, "sampling/sampling_logp_difference/max": 2.8208160400390625, "sampling/sampling_logp_difference/mean": 0.29134130477905273, "step": 157, "step_time": 18.53787607399954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.314701959490776, "epoch": 0.00158, "grad_norm": 0.24614301323890686, "kl": 0.48739565163850784, "learning_rate": 7.999998622579143e-06, "loss": -0.0145, "step": 158, "step_time": 3.9460504859998764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.9166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8467506170272827, "epoch": 0.00159, "frac_reward_zero_std": 0.0, "grad_norm": 0.14640164375305176, "kl": 0.9175825789570808, "learning_rate": 7.99999859990593e-06, "loss": -0.065, "num_tokens": 4313012.0, "reward": -7.307235240936279, "reward_std": 13.707070350646973, "rewards/rollout_reward_func/mean": -7.307235240936279, "rewards/rollout_reward_func/std": 13.707070350646973, "sampling/importance_sampling_ratio/max": 2.0130488872528076, "sampling/importance_sampling_ratio/mean": 0.6311084032058716, "sampling/importance_sampling_ratio/min": 4.030262630294601e-07, "sampling/sampling_logp_difference/max": 3.4423694610595703, "sampling/sampling_logp_difference/mean": 0.374096542596817, "step": 159, "step_time": 18.153461923000577 }, { "clip_ratio/high_max": 0.0669642873108387, "clip_ratio/high_mean": 0.016741071827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016741071827709675, "entropy": 1.8567652851343155, "epoch": 0.0016, "grad_norm": 0.1283155381679535, "kl": 0.8061093091964722, "learning_rate": 7.999998577047632e-06, "loss": -0.0655, "step": 160, "step_time": 3.9918870550000065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 4.909090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7105783745646477, "epoch": 0.00161, "frac_reward_zero_std": 0.0, "grad_norm": 0.12223649770021439, "kl": 0.36596077494323254, "learning_rate": 7.999998554004246e-06, "loss": -0.0869, "num_tokens": 4368008.0, "reward": -11.96145248413086, "reward_std": 14.088513374328613, "rewards/rollout_reward_func/mean": -11.96145248413086, "rewards/rollout_reward_func/std": 14.088513374328613, "sampling/importance_sampling_ratio/max": 2.0441575050354004, "sampling/importance_sampling_ratio/mean": 0.6713329553604126, "sampling/importance_sampling_ratio/min": 8.461696621164805e-11, "sampling/sampling_logp_difference/max": 2.564812660217285, "sampling/sampling_logp_difference/mean": 0.374387264251709, "step": 161, "step_time": 17.583204286999717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7200811356306076, "epoch": 0.00162, "grad_norm": 0.12556396424770355, "kl": 0.3722399827092886, "learning_rate": 7.99999853077577e-06, "loss": -0.0869, "step": 162, "step_time": 4.3533847419994345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.3125, "completions/mean_terminated_length": 5.888888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.197139322757721, "epoch": 0.00163, "frac_reward_zero_std": 0.0, "grad_norm": 0.11858666688203812, "kl": 0.19090315885841846, "learning_rate": 7.999998507362211e-06, "loss": -0.1, "num_tokens": 4418044.0, "reward": -14.280783653259277, "reward_std": 16.12468719482422, "rewards/rollout_reward_func/mean": -14.280783653259277, "rewards/rollout_reward_func/std": 16.12468719482422, "sampling/importance_sampling_ratio/max": 1.8674105405807495, "sampling/importance_sampling_ratio/mean": 0.5099196434020996, "sampling/importance_sampling_ratio/min": 9.520912840343954e-07, "sampling/sampling_logp_difference/max": 2.0011162757873535, "sampling/sampling_logp_difference/mean": 0.3657623529434204, "step": 163, "step_time": 17.231510792000336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2020488381385803, "epoch": 0.00164, "grad_norm": 0.11956977844238281, "kl": 0.18890911526978016, "learning_rate": 7.999998483763561e-06, "loss": -0.1002, "step": 164, "step_time": 4.230278795001141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.0, "completions/mean_terminated_length": 4.705882549285889, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5943320095539093, "epoch": 0.00165, "frac_reward_zero_std": 0.0, "grad_norm": 0.2030939757823944, "kl": 0.3157796412706375, "learning_rate": 7.999998459979827e-06, "loss": -0.0677, "num_tokens": 4470979.0, "reward": -15.238121032714844, "reward_std": 14.710016250610352, "rewards/rollout_reward_func/mean": -15.238121032714844, "rewards/rollout_reward_func/std": 14.710016250610352, "sampling/importance_sampling_ratio/max": 1.710871934890747, "sampling/importance_sampling_ratio/mean": 0.4000137448310852, "sampling/importance_sampling_ratio/min": 5.465988772712649e-12, "sampling/sampling_logp_difference/max": 2.4391446113586426, "sampling/sampling_logp_difference/mean": 0.48378366231918335, "step": 165, "step_time": 17.45906997400016 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.597247302532196, "epoch": 0.00166, "grad_norm": 0.1736898273229599, "kl": 0.30577781796455383, "learning_rate": 7.999998436011002e-06, "loss": -0.068, "step": 166, "step_time": 3.9631415820003895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 4.65217399597168, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.296883225440979, "epoch": 0.00167, "frac_reward_zero_std": 0.0, "grad_norm": 0.1555374413728714, "kl": 0.3570997714996338, "learning_rate": 7.999998411857091e-06, "loss": -0.0792, "num_tokens": 4525382.0, "reward": -9.636720657348633, "reward_std": 14.635923385620117, "rewards/rollout_reward_func/mean": -9.636720657348633, "rewards/rollout_reward_func/std": 14.6359224319458, "sampling/importance_sampling_ratio/max": 1.656609296798706, "sampling/importance_sampling_ratio/mean": 0.6189147233963013, "sampling/importance_sampling_ratio/min": 1.5223996285840258e-08, "sampling/sampling_logp_difference/max": 2.306878089904785, "sampling/sampling_logp_difference/mean": 0.43765151500701904, "step": 167, "step_time": 17.427419858000576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3018491864204407, "epoch": 0.00168, "grad_norm": 0.15550491213798523, "kl": 0.3664994090795517, "learning_rate": 7.999998387518093e-06, "loss": -0.08, "step": 168, "step_time": 3.8344462200002454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 5.473684310913086, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2176968455314636, "epoch": 0.00169, "frac_reward_zero_std": 0.0, "grad_norm": 0.2020222544670105, "kl": 0.1437150090932846, "learning_rate": 7.999998362994007e-06, "loss": -0.1283, "num_tokens": 4579548.0, "reward": -11.094268798828125, "reward_std": 16.456941604614258, "rewards/rollout_reward_func/mean": -11.094268798828125, "rewards/rollout_reward_func/std": 16.456941604614258, "sampling/importance_sampling_ratio/max": 2.056429386138916, "sampling/importance_sampling_ratio/mean": 0.5675873160362244, "sampling/importance_sampling_ratio/min": 6.82476456859149e-06, "sampling/sampling_logp_difference/max": 1.9386825561523438, "sampling/sampling_logp_difference/mean": 0.30168092250823975, "step": 169, "step_time": 17.768926198998997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2236937284469604, "epoch": 0.0017, "grad_norm": 0.2032463401556015, "kl": 0.141741331666708, "learning_rate": 7.999998338284834e-06, "loss": -0.1284, "step": 170, "step_time": 3.8692198640010247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9961879253387451, "epoch": 0.00171, "frac_reward_zero_std": 0.0, "grad_norm": 0.11603956669569016, "kl": 0.43092787079513073, "learning_rate": 7.999998313390573e-06, "loss": -0.0722, "num_tokens": 4635414.0, "reward": -12.404193878173828, "reward_std": 14.190462112426758, "rewards/rollout_reward_func/mean": -12.404193878173828, "rewards/rollout_reward_func/std": 14.190462112426758, "sampling/importance_sampling_ratio/max": 1.9295297861099243, "sampling/importance_sampling_ratio/mean": 0.5306910872459412, "sampling/importance_sampling_ratio/min": 0.00012898267596028745, "sampling/sampling_logp_difference/max": 2.1113429069519043, "sampling/sampling_logp_difference/mean": 0.32274630665779114, "step": 171, "step_time": 17.905455860001894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9995128214359283, "epoch": 0.00172, "grad_norm": 0.12198062986135483, "kl": 0.4241302162408829, "learning_rate": 7.999998288311227e-06, "loss": -0.0727, "step": 172, "step_time": 3.9823877339995306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.96875, "completions/mean_terminated_length": 4.82608699798584, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9899572432041168, "epoch": 0.00173, "frac_reward_zero_std": 0.0, "grad_norm": 0.153068408370018, "kl": 0.32967495545744896, "learning_rate": 7.999998263046792e-06, "loss": -0.1018, "num_tokens": 4690039.0, "reward": -13.395965576171875, "reward_std": 16.77534294128418, "rewards/rollout_reward_func/mean": -13.395965576171875, "rewards/rollout_reward_func/std": 16.77534294128418, "sampling/importance_sampling_ratio/max": 1.7074509859085083, "sampling/importance_sampling_ratio/mean": 0.6728349328041077, "sampling/importance_sampling_ratio/min": 2.480944385752082e-05, "sampling/sampling_logp_difference/max": 1.823836088180542, "sampling/sampling_logp_difference/mean": 0.3218085765838623, "step": 173, "step_time": 18.04434510900046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9979658126831055, "epoch": 0.00174, "grad_norm": 0.15009357035160065, "kl": 0.32032644748687744, "learning_rate": 7.999998237597269e-06, "loss": -0.1017, "step": 174, "step_time": 4.333251175999976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.53125, "completions/mean_terminated_length": 5.705882549285889, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1258323788642883, "epoch": 0.00175, "frac_reward_zero_std": 0.0, "grad_norm": 0.24242353439331055, "kl": 0.1641453467309475, "learning_rate": 7.99999821196266e-06, "loss": -0.0793, "num_tokens": 4743399.0, "reward": -11.369354248046875, "reward_std": 14.81938648223877, "rewards/rollout_reward_func/mean": -11.369354248046875, "rewards/rollout_reward_func/std": 14.819385528564453, "sampling/importance_sampling_ratio/max": 1.7454863786697388, "sampling/importance_sampling_ratio/mean": 0.4486271142959595, "sampling/importance_sampling_ratio/min": 1.2196534271424753e-07, "sampling/sampling_logp_difference/max": 2.2343366146087646, "sampling/sampling_logp_difference/mean": 0.3288901150226593, "step": 175, "step_time": 16.594622761998835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1284611523151398, "epoch": 0.00176, "grad_norm": 0.24624553322792053, "kl": 0.15673335455358028, "learning_rate": 7.999998186142964e-06, "loss": -0.0795, "step": 176, "step_time": 3.8995925859990166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.130434989929199, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7220653295516968, "epoch": 0.00177, "frac_reward_zero_std": 0.0, "grad_norm": 0.11496133357286453, "kl": 0.4338963031768799, "learning_rate": 7.999998160138178e-06, "loss": -0.0697, "num_tokens": 4797360.0, "reward": -11.340206146240234, "reward_std": 12.472711563110352, "rewards/rollout_reward_func/mean": -11.340206146240234, "rewards/rollout_reward_func/std": 12.472711563110352, "sampling/importance_sampling_ratio/max": 2.0994679927825928, "sampling/importance_sampling_ratio/mean": 0.6026086807250977, "sampling/importance_sampling_ratio/min": 1.4573797280093004e-08, "sampling/sampling_logp_difference/max": 1.8486011028289795, "sampling/sampling_logp_difference/mean": 0.368246465921402, "step": 177, "step_time": 17.989053492000494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.723304696381092, "epoch": 0.00178, "grad_norm": 0.11527397483587265, "kl": 0.4286658577620983, "learning_rate": 7.999998133948305e-06, "loss": -0.0706, "step": 178, "step_time": 3.818264634001025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.40625, "completions/mean_terminated_length": 5.450000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3937262892723083, "epoch": 0.00179, "frac_reward_zero_std": 0.0, "grad_norm": 0.17590516805648804, "kl": 0.2670660689473152, "learning_rate": 7.999998107573346e-06, "loss": -0.0747, "num_tokens": 4852277.0, "reward": -9.66629409790039, "reward_std": 15.92879867553711, "rewards/rollout_reward_func/mean": -9.66629409790039, "rewards/rollout_reward_func/std": 15.92879867553711, "sampling/importance_sampling_ratio/max": 1.8354038000106812, "sampling/importance_sampling_ratio/mean": 0.44543373584747314, "sampling/importance_sampling_ratio/min": 3.4304603104828857e-06, "sampling/sampling_logp_difference/max": 1.688849687576294, "sampling/sampling_logp_difference/mean": 0.3668004274368286, "step": 179, "step_time": 17.344255801000145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3964751064777374, "epoch": 0.0018, "grad_norm": 0.17852210998535156, "kl": 0.2702103443443775, "learning_rate": 7.9999980810133e-06, "loss": -0.075, "step": 180, "step_time": 3.8838701839995338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 5.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2022765278816223, "epoch": 0.00181, "frac_reward_zero_std": 0.0, "grad_norm": 0.19939319789409637, "kl": 0.37362560257315636, "learning_rate": 7.999998054268167e-06, "loss": -0.0942, "num_tokens": 4904289.0, "reward": -12.69278335571289, "reward_std": 15.035601615905762, "rewards/rollout_reward_func/mean": -12.69278335571289, "rewards/rollout_reward_func/std": 15.035601615905762, "sampling/importance_sampling_ratio/max": 1.4835023880004883, "sampling/importance_sampling_ratio/mean": 0.6308207511901855, "sampling/importance_sampling_ratio/min": 1.0235872878183727e-06, "sampling/sampling_logp_difference/max": 2.3106415271759033, "sampling/sampling_logp_difference/mean": 0.3557279109954834, "step": 181, "step_time": 17.315779094999925 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 2.205231249332428, "epoch": 0.00182, "grad_norm": 0.2008916586637497, "kl": 0.34943463653326035, "learning_rate": 7.999998027337947e-06, "loss": -0.0948, "step": 182, "step_time": 4.23370106999937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 4.7727274894714355, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3029288351535797, "epoch": 0.00183, "frac_reward_zero_std": 0.0, "grad_norm": 0.4503638744354248, "kl": 1.76473019272089, "learning_rate": 7.999998000222637e-06, "loss": -0.0742, "num_tokens": 4959454.0, "reward": -14.055398941040039, "reward_std": 13.439220428466797, "rewards/rollout_reward_func/mean": -14.055398941040039, "rewards/rollout_reward_func/std": 13.439220428466797, "sampling/importance_sampling_ratio/max": 1.5102527141571045, "sampling/importance_sampling_ratio/mean": 0.5053730607032776, "sampling/importance_sampling_ratio/min": 3.866955012199469e-06, "sampling/sampling_logp_difference/max": 2.985649585723877, "sampling/sampling_logp_difference/mean": 0.3916690945625305, "step": 183, "step_time": 18.57882476299983 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 2.303779900074005, "epoch": 0.00184, "grad_norm": 0.21240274608135223, "kl": 0.6982220262289047, "learning_rate": 7.999997972922241e-06, "loss": -0.0763, "step": 184, "step_time": 4.337302431999888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.083333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9209644198417664, "epoch": 0.00185, "frac_reward_zero_std": 0.0, "grad_norm": 0.10858798772096634, "kl": 0.1766141690313816, "learning_rate": 7.999997945436757e-06, "loss": -0.0882, "num_tokens": 5011109.0, "reward": -9.937747955322266, "reward_std": 15.143767356872559, "rewards/rollout_reward_func/mean": -9.937747955322266, "rewards/rollout_reward_func/std": 15.143767356872559, "sampling/importance_sampling_ratio/max": 1.4435644149780273, "sampling/importance_sampling_ratio/mean": 0.5621886253356934, "sampling/importance_sampling_ratio/min": 1.5468328911083518e-06, "sampling/sampling_logp_difference/max": 2.203329086303711, "sampling/sampling_logp_difference/mean": 0.3333553671836853, "step": 185, "step_time": 18.255741546000536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9236587584018707, "epoch": 0.00186, "grad_norm": 0.11308836191892624, "kl": 0.17665309458971024, "learning_rate": 7.999997917766188e-06, "loss": -0.0881, "step": 186, "step_time": 3.802071215000069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.03125, "completions/mean_terminated_length": 5.388888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.343993365764618, "epoch": 0.00187, "frac_reward_zero_std": 0.0, "grad_norm": 0.21753397583961487, "kl": 0.3416254911571741, "learning_rate": 7.999997889910529e-06, "loss": -0.0847, "num_tokens": 5063089.0, "reward": -14.541221618652344, "reward_std": 17.445598602294922, "rewards/rollout_reward_func/mean": -14.541221618652344, "rewards/rollout_reward_func/std": 17.445598602294922, "sampling/importance_sampling_ratio/max": 1.6345809698104858, "sampling/importance_sampling_ratio/mean": 0.4147283136844635, "sampling/importance_sampling_ratio/min": 1.136263676926319e-06, "sampling/sampling_logp_difference/max": 2.1603991985321045, "sampling/sampling_logp_difference/mean": 0.39235302805900574, "step": 187, "step_time": 16.173584965998998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3498682975769043, "epoch": 0.00188, "grad_norm": 0.2077595442533493, "kl": 0.3244616128504276, "learning_rate": 7.999997861869784e-06, "loss": -0.0859, "step": 188, "step_time": 3.772416253999836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 6.818181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.102005034685135, "epoch": 0.00189, "frac_reward_zero_std": 0.0, "grad_norm": 0.09862715005874634, "kl": 0.22960012778639793, "learning_rate": 7.999997833643951e-06, "loss": -0.0756, "num_tokens": 5116323.0, "reward": -12.077775955200195, "reward_std": 15.122434616088867, "rewards/rollout_reward_func/mean": -12.077775955200195, "rewards/rollout_reward_func/std": 15.122434616088867, "sampling/importance_sampling_ratio/max": 1.6958177089691162, "sampling/importance_sampling_ratio/mean": 0.5261013507843018, "sampling/importance_sampling_ratio/min": 7.013686172285816e-06, "sampling/sampling_logp_difference/max": 1.7023420333862305, "sampling/sampling_logp_difference/mean": 0.3332656919956207, "step": 189, "step_time": 18.05219873199985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.107665181159973, "epoch": 0.0019, "grad_norm": 0.09518983215093613, "kl": 0.24116471968591213, "learning_rate": 7.999997805233032e-06, "loss": -0.0752, "step": 190, "step_time": 3.843729849000738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 4.588235378265381, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.217253863811493, "epoch": 0.00191, "frac_reward_zero_std": 0.0, "grad_norm": 0.15725789964199066, "kl": 0.23210493847727776, "learning_rate": 7.999997776637025e-06, "loss": -0.0728, "num_tokens": 5169114.0, "reward": -13.1730318069458, "reward_std": 16.67192840576172, "rewards/rollout_reward_func/mean": -13.1730318069458, "rewards/rollout_reward_func/std": 16.67192840576172, "sampling/importance_sampling_ratio/max": 1.6286739110946655, "sampling/importance_sampling_ratio/mean": 0.4241676330566406, "sampling/importance_sampling_ratio/min": 3.611832653405145e-05, "sampling/sampling_logp_difference/max": 1.9177536964416504, "sampling/sampling_logp_difference/mean": 0.3435857594013214, "step": 191, "step_time": 16.484537010998793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.21610888838768, "epoch": 0.00192, "grad_norm": 0.15658234059810638, "kl": 0.236163642257452, "learning_rate": 7.99999774785593e-06, "loss": -0.0731, "step": 192, "step_time": 4.41554368800098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.96875, "completions/mean_terminated_length": 5.277777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1872140169143677, "epoch": 0.00193, "frac_reward_zero_std": 0.0, "grad_norm": 0.089054174721241, "kl": 0.23527542874217033, "learning_rate": 7.999997718889747e-06, "loss": -0.0804, "num_tokens": 5220023.0, "reward": -17.193927764892578, "reward_std": 14.157336235046387, "rewards/rollout_reward_func/mean": -17.193927764892578, "rewards/rollout_reward_func/std": 14.157336235046387, "sampling/importance_sampling_ratio/max": 1.492855429649353, "sampling/importance_sampling_ratio/mean": 0.424294650554657, "sampling/importance_sampling_ratio/min": 7.346995971602155e-07, "sampling/sampling_logp_difference/max": 2.433102607727051, "sampling/sampling_logp_difference/mean": 0.32766053080558777, "step": 193, "step_time": 16.15121796800031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1895185112953186, "epoch": 0.00194, "grad_norm": 0.08690271526575089, "kl": 0.24099277146160603, "learning_rate": 7.999997689738478e-06, "loss": -0.0806, "step": 194, "step_time": 4.127570746000856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 4.304347991943359, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8838205933570862, "epoch": 0.00195, "frac_reward_zero_std": 0.0, "grad_norm": 0.1939055472612381, "kl": 0.3078561760485172, "learning_rate": 7.999997660402122e-06, "loss": -0.0725, "num_tokens": 5273744.0, "reward": -10.73081111907959, "reward_std": 15.433686256408691, "rewards/rollout_reward_func/mean": -10.73081111907959, "rewards/rollout_reward_func/std": 15.433686256408691, "sampling/importance_sampling_ratio/max": 2.3882527351379395, "sampling/importance_sampling_ratio/mean": 0.7642704248428345, "sampling/importance_sampling_ratio/min": 4.5063810830470175e-05, "sampling/sampling_logp_difference/max": 2.0094499588012695, "sampling/sampling_logp_difference/mean": 0.3321591913700104, "step": 195, "step_time": 17.877560809999522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8863323032855988, "epoch": 0.00196, "grad_norm": 0.19066844880580902, "kl": 0.3078266754746437, "learning_rate": 7.999997630880678e-06, "loss": -0.0727, "step": 196, "step_time": 3.9360052530000758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9504822790622711, "epoch": 0.00197, "frac_reward_zero_std": 0.0, "grad_norm": 0.11613193154335022, "kl": 0.3237598240375519, "learning_rate": 7.999997601174145e-06, "loss": -0.0589, "num_tokens": 5329271.0, "reward": -9.113938331604004, "reward_std": 14.319207191467285, "rewards/rollout_reward_func/mean": -9.113938331604004, "rewards/rollout_reward_func/std": 14.319207191467285, "sampling/importance_sampling_ratio/max": 1.752303957939148, "sampling/importance_sampling_ratio/mean": 0.634240448474884, "sampling/importance_sampling_ratio/min": 4.90644788442296e-06, "sampling/sampling_logp_difference/max": 1.9489902257919312, "sampling/sampling_logp_difference/mean": 0.36118200421333313, "step": 197, "step_time": 17.57801572000062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9551126956939697, "epoch": 0.00198, "grad_norm": 0.12123607099056244, "kl": 0.32137780636548996, "learning_rate": 7.999997571282526e-06, "loss": -0.059, "step": 198, "step_time": 3.8680451509990235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.963010996580124, "epoch": 0.00199, "frac_reward_zero_std": 0.0, "grad_norm": 0.22119255363941193, "kl": 0.37798455730080605, "learning_rate": 7.999997541205821e-06, "loss": -0.0741, "num_tokens": 5385317.0, "reward": -9.101659774780273, "reward_std": 14.427990913391113, "rewards/rollout_reward_func/mean": -9.101659774780273, "rewards/rollout_reward_func/std": 14.427990913391113, "sampling/importance_sampling_ratio/max": 1.5297480821609497, "sampling/importance_sampling_ratio/mean": 0.6383270025253296, "sampling/importance_sampling_ratio/min": 6.046400358172832e-06, "sampling/sampling_logp_difference/max": 2.111297130584717, "sampling/sampling_logp_difference/mean": 0.34987545013427734, "step": 199, "step_time": 18.717354274999707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9659490883350372, "epoch": 0.002, "grad_norm": 0.21188680827617645, "kl": 0.3738158456981182, "learning_rate": 7.99999751094403e-06, "loss": -0.0752, "step": 200, "step_time": 3.8862253159995817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8831826150417328, "epoch": 0.00201, "frac_reward_zero_std": 0.0, "grad_norm": 0.0965028703212738, "kl": 0.3229590505361557, "learning_rate": 7.999997480497147e-06, "loss": -0.0776, "num_tokens": 5440758.0, "reward": -9.24014949798584, "reward_std": 14.90712833404541, "rewards/rollout_reward_func/mean": -9.24014949798584, "rewards/rollout_reward_func/std": 14.907127380371094, "sampling/importance_sampling_ratio/max": 2.161083459854126, "sampling/importance_sampling_ratio/mean": 0.6597106456756592, "sampling/importance_sampling_ratio/min": 1.5208666809485294e-05, "sampling/sampling_logp_difference/max": 2.0175633430480957, "sampling/sampling_logp_difference/mean": 0.33059799671173096, "step": 201, "step_time": 19.08245056800024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8829796314239502, "epoch": 0.00202, "grad_norm": 0.09482964128255844, "kl": 0.3302973732352257, "learning_rate": 7.99999744986518e-06, "loss": -0.0772, "step": 202, "step_time": 4.317819938999492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 5.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2092472314834595, "epoch": 0.00203, "frac_reward_zero_std": 0.0, "grad_norm": 0.10385310649871826, "kl": 0.30669113993644714, "learning_rate": 7.999997419048124e-06, "loss": -0.0847, "num_tokens": 5494004.0, "reward": -12.003853797912598, "reward_std": 16.692319869995117, "rewards/rollout_reward_func/mean": -12.003853797912598, "rewards/rollout_reward_func/std": 16.692317962646484, "sampling/importance_sampling_ratio/max": 1.4968957901000977, "sampling/importance_sampling_ratio/mean": 0.5533554553985596, "sampling/importance_sampling_ratio/min": 5.855930794496089e-06, "sampling/sampling_logp_difference/max": 1.5768628120422363, "sampling/sampling_logp_difference/mean": 0.3519139289855957, "step": 203, "step_time": 16.032296375999977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.209383875131607, "epoch": 0.00204, "grad_norm": 0.10225124657154083, "kl": 0.3000032715499401, "learning_rate": 7.999997388045983e-06, "loss": -0.0852, "step": 204, "step_time": 4.206914967001467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 4.7727274894714355, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1503586769104004, "epoch": 0.00205, "frac_reward_zero_std": 0.0, "grad_norm": 0.18813158571720123, "kl": 0.35786180570721626, "learning_rate": 7.999997356858753e-06, "loss": -0.0773, "num_tokens": 5548327.0, "reward": -12.061787605285645, "reward_std": 13.974858283996582, "rewards/rollout_reward_func/mean": -12.061787605285645, "rewards/rollout_reward_func/std": 13.974858283996582, "sampling/importance_sampling_ratio/max": 2.0342066287994385, "sampling/importance_sampling_ratio/mean": 0.5470632314682007, "sampling/importance_sampling_ratio/min": 9.637801849748939e-05, "sampling/sampling_logp_difference/max": 1.9664344787597656, "sampling/sampling_logp_difference/mean": 0.3497040271759033, "step": 205, "step_time": 18.277515176000634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1545771956443787, "epoch": 0.00206, "grad_norm": 0.1890951544046402, "kl": 0.34738832525908947, "learning_rate": 7.999997325486435e-06, "loss": -0.0779, "step": 206, "step_time": 4.000956516999395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 4.653846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6481732428073883, "epoch": 0.00207, "frac_reward_zero_std": 0.0, "grad_norm": 0.13045871257781982, "kl": 0.5393150746822357, "learning_rate": 7.99999729392903e-06, "loss": -0.0644, "num_tokens": 5602149.0, "reward": -8.20493221282959, "reward_std": 14.693595886230469, "rewards/rollout_reward_func/mean": -8.20493221282959, "rewards/rollout_reward_func/std": 14.693594932556152, "sampling/importance_sampling_ratio/max": 1.7884962558746338, "sampling/importance_sampling_ratio/mean": 0.6364991664886475, "sampling/importance_sampling_ratio/min": 0.00011184854520251974, "sampling/sampling_logp_difference/max": 1.9067788124084473, "sampling/sampling_logp_difference/mean": 0.30511701107025146, "step": 207, "step_time": 17.420888219000517 }, { "clip_ratio/high_max": 0.01923076994717121, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.656671166419983, "epoch": 0.00208, "grad_norm": 0.10385328531265259, "kl": 0.48783624917268753, "learning_rate": 7.99999726218654e-06, "loss": -0.0643, "step": 208, "step_time": 3.8545192650008175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.239999771118164, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.606386959552765, "epoch": 0.00209, "frac_reward_zero_std": 0.0, "grad_norm": 0.18533404171466827, "kl": 0.5729503110051155, "learning_rate": 7.999997230258959e-06, "loss": -0.0775, "num_tokens": 5658587.0, "reward": -8.994199752807617, "reward_std": 14.747591972351074, "rewards/rollout_reward_func/mean": -8.994199752807617, "rewards/rollout_reward_func/std": 14.747591018676758, "sampling/importance_sampling_ratio/max": 1.6834055185317993, "sampling/importance_sampling_ratio/mean": 0.8157728910446167, "sampling/importance_sampling_ratio/min": 1.823209004214732e-06, "sampling/sampling_logp_difference/max": 2.3821353912353516, "sampling/sampling_logp_difference/mean": 0.3322928547859192, "step": 209, "step_time": 19.307413404998442 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.6122996509075165, "epoch": 0.0021, "grad_norm": 0.17086723446846008, "kl": 0.48706137016415596, "learning_rate": 7.999997198146293e-06, "loss": -0.0778, "step": 210, "step_time": 3.8671605789995738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.03125, "completions/mean_terminated_length": 5.388888835906982, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4346861839294434, "epoch": 0.00211, "frac_reward_zero_std": 0.0, "grad_norm": 0.1725126951932907, "kl": 0.2225559540092945, "learning_rate": 7.99999716584854e-06, "loss": -0.0824, "num_tokens": 5712936.0, "reward": -13.399733543395996, "reward_std": 14.456074714660645, "rewards/rollout_reward_func/mean": -13.399733543395996, "rewards/rollout_reward_func/std": 14.456074714660645, "sampling/importance_sampling_ratio/max": 1.8689335584640503, "sampling/importance_sampling_ratio/mean": 0.5078774094581604, "sampling/importance_sampling_ratio/min": 2.715257778618252e-06, "sampling/sampling_logp_difference/max": 1.9817140102386475, "sampling/sampling_logp_difference/mean": 0.3949892520904541, "step": 211, "step_time": 16.758195877000617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.438041090965271, "epoch": 0.00212, "grad_norm": 0.17641174793243408, "kl": 0.21409287955611944, "learning_rate": 7.9999971333657e-06, "loss": -0.0833, "step": 212, "step_time": 4.329457127000751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 4.047619342803955, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.854185774922371, "epoch": 0.00213, "frac_reward_zero_std": 0.0, "grad_norm": 0.0879393145442009, "kl": 0.2785319462418556, "learning_rate": 7.99999710069777e-06, "loss": -0.0828, "num_tokens": 5766085.0, "reward": -9.132076263427734, "reward_std": 18.86130142211914, "rewards/rollout_reward_func/mean": -9.132076263427734, "rewards/rollout_reward_func/std": 18.86130142211914, "sampling/importance_sampling_ratio/max": 1.7972530126571655, "sampling/importance_sampling_ratio/mean": 0.6248300075531006, "sampling/importance_sampling_ratio/min": 4.137537956694359e-08, "sampling/sampling_logp_difference/max": 1.9975066184997559, "sampling/sampling_logp_difference/mean": 0.3883479833602905, "step": 213, "step_time": 17.282231226000476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8573011457920074, "epoch": 0.00214, "grad_norm": 0.08517467230558395, "kl": 0.2836026605218649, "learning_rate": 7.999997067844755e-06, "loss": -0.0827, "step": 214, "step_time": 4.305779059000088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 6.519999980926514, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.168291985988617, "epoch": 0.00215, "frac_reward_zero_std": 0.0, "grad_norm": 0.1250239461660385, "kl": 0.18806695565581322, "learning_rate": 7.999997034806652e-06, "loss": -0.0988, "num_tokens": 5818351.0, "reward": -14.491008758544922, "reward_std": 12.774662017822266, "rewards/rollout_reward_func/mean": -14.491008758544922, "rewards/rollout_reward_func/std": 12.774662017822266, "sampling/importance_sampling_ratio/max": 1.6606993675231934, "sampling/importance_sampling_ratio/mean": 0.5829496383666992, "sampling/importance_sampling_ratio/min": 1.2659394087677356e-05, "sampling/sampling_logp_difference/max": 1.936669111251831, "sampling/sampling_logp_difference/mean": 0.3272430896759033, "step": 215, "step_time": 17.14138800799992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1672585904598236, "epoch": 0.00216, "grad_norm": 0.1232176423072815, "kl": 0.18753096275031567, "learning_rate": 7.999997001583462e-06, "loss": -0.0993, "step": 216, "step_time": 3.735458778000975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 4.909090995788574, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7421517968177795, "epoch": 0.00217, "frac_reward_zero_std": 0.0, "grad_norm": 0.16625410318374634, "kl": 0.30272251740098, "learning_rate": 7.999996968175185e-06, "loss": -0.0911, "num_tokens": 5870139.0, "reward": -11.406450271606445, "reward_std": 12.841609001159668, "rewards/rollout_reward_func/mean": -11.406450271606445, "rewards/rollout_reward_func/std": 12.841608047485352, "sampling/importance_sampling_ratio/max": 1.8912345170974731, "sampling/importance_sampling_ratio/mean": 0.6082594394683838, "sampling/importance_sampling_ratio/min": 0.0006826261524111032, "sampling/sampling_logp_difference/max": 1.8900450468063354, "sampling/sampling_logp_difference/mean": 0.2861936688423157, "step": 217, "step_time": 16.962544848001016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7309037148952484, "epoch": 0.00218, "grad_norm": 0.16512005031108856, "kl": 0.3276551216840744, "learning_rate": 7.99999693458182e-06, "loss": -0.0917, "step": 218, "step_time": 3.7895121140008996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 4.352941036224365, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1914464831352234, "epoch": 0.00219, "frac_reward_zero_std": 0.0, "grad_norm": 0.09957544505596161, "kl": 0.18771303072571754, "learning_rate": 7.999996900803368e-06, "loss": -0.09, "num_tokens": 5925274.0, "reward": -10.714449882507324, "reward_std": 16.953704833984375, "rewards/rollout_reward_func/mean": -10.714449882507324, "rewards/rollout_reward_func/std": 16.953704833984375, "sampling/importance_sampling_ratio/max": 1.830951452255249, "sampling/importance_sampling_ratio/mean": 0.5633530020713806, "sampling/importance_sampling_ratio/min": 3.100690537660711e-10, "sampling/sampling_logp_difference/max": 2.27714467048645, "sampling/sampling_logp_difference/mean": 0.39281409978866577, "step": 219, "step_time": 17.044623270001466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.179566442966461, "epoch": 0.0022, "grad_norm": 0.09540513902902603, "kl": 0.1986657977104187, "learning_rate": 7.999996866839829e-06, "loss": -0.0904, "step": 220, "step_time": 3.8588728760005324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.90625, "completions/mean_terminated_length": 5.208333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4736140668392181, "epoch": 0.00221, "frac_reward_zero_std": 0.0, "grad_norm": 0.16501913964748383, "kl": 0.3565613701939583, "learning_rate": 7.999996832691203e-06, "loss": -0.0719, "num_tokens": 5980486.0, "reward": -11.985794067382812, "reward_std": 13.121570587158203, "rewards/rollout_reward_func/mean": -11.985794067382812, "rewards/rollout_reward_func/std": 13.121569633483887, "sampling/importance_sampling_ratio/max": 2.0118801593780518, "sampling/importance_sampling_ratio/mean": 0.6766996383666992, "sampling/importance_sampling_ratio/min": 1.0560769624134991e-05, "sampling/sampling_logp_difference/max": 2.1333353519439697, "sampling/sampling_logp_difference/mean": 0.30522823333740234, "step": 221, "step_time": 17.389539677000357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.463305026292801, "epoch": 0.00222, "grad_norm": 0.1584913581609726, "kl": 0.3748170919716358, "learning_rate": 7.999996798357488e-06, "loss": -0.0724, "step": 222, "step_time": 3.932932606999202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 5.428571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5414681434631348, "epoch": 0.00223, "frac_reward_zero_std": 0.0, "grad_norm": 0.10768629610538483, "kl": 0.2027539936825633, "learning_rate": 7.999996763838688e-06, "loss": -0.0792, "num_tokens": 6035727.0, "reward": -16.298892974853516, "reward_std": 12.814021110534668, "rewards/rollout_reward_func/mean": -16.298892974853516, "rewards/rollout_reward_func/std": 12.814020156860352, "sampling/importance_sampling_ratio/max": 1.524226427078247, "sampling/importance_sampling_ratio/mean": 0.5610209107398987, "sampling/importance_sampling_ratio/min": 1.4667007235402707e-06, "sampling/sampling_logp_difference/max": 2.316927433013916, "sampling/sampling_logp_difference/mean": 0.4164603352546692, "step": 223, "step_time": 15.80510382600005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5359776318073273, "epoch": 0.00224, "grad_norm": 0.10665453970432281, "kl": 0.2109208032488823, "learning_rate": 7.9999967291348e-06, "loss": -0.0795, "step": 224, "step_time": 4.241764834998321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7343031167984009, "epoch": 0.00225, "frac_reward_zero_std": 0.0, "grad_norm": 0.09496170282363892, "kl": 0.4871605858206749, "learning_rate": 7.999996694245824e-06, "loss": -0.0775, "num_tokens": 6090864.0, "reward": -9.444841384887695, "reward_std": 13.990036010742188, "rewards/rollout_reward_func/mean": -9.444841384887695, "rewards/rollout_reward_func/std": 13.990035057067871, "sampling/importance_sampling_ratio/max": 1.8196324110031128, "sampling/importance_sampling_ratio/mean": 0.5560163855552673, "sampling/importance_sampling_ratio/min": 1.8182230633101426e-05, "sampling/sampling_logp_difference/max": 1.736194133758545, "sampling/sampling_logp_difference/mean": 0.33208391070365906, "step": 225, "step_time": 17.380618969999887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.7294737100601196, "epoch": 0.00226, "grad_norm": 0.10174727439880371, "kl": 0.48852957785129547, "learning_rate": 7.99999665917176e-06, "loss": -0.0778, "step": 226, "step_time": 3.8921017080001548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.440000057220459, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5757167786359787, "epoch": 0.00227, "frac_reward_zero_std": 0.0, "grad_norm": 0.1632942259311676, "kl": 0.6642413586378098, "learning_rate": 7.999996623912611e-06, "loss": -0.0761, "num_tokens": 6147040.0, "reward": -9.719850540161133, "reward_std": 13.81702995300293, "rewards/rollout_reward_func/mean": -9.719850540161133, "rewards/rollout_reward_func/std": 13.817028999328613, "sampling/importance_sampling_ratio/max": 2.155230760574341, "sampling/importance_sampling_ratio/mean": 0.7509503364562988, "sampling/importance_sampling_ratio/min": 0.0002112716028932482, "sampling/sampling_logp_difference/max": 1.9420862197875977, "sampling/sampling_logp_difference/mean": 0.32459965348243713, "step": 227, "step_time": 18.361518085998796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5775791555643082, "epoch": 0.00228, "grad_norm": 0.162156343460083, "kl": 0.6475803703069687, "learning_rate": 7.999996588468373e-06, "loss": -0.0762, "step": 228, "step_time": 3.890537870999651 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 4.304347991943359, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.37163907289505, "epoch": 0.00229, "frac_reward_zero_std": 0.0, "grad_norm": 0.14051397144794464, "kl": 0.35907455533742905, "learning_rate": 7.999996552839049e-06, "loss": -0.0939, "num_tokens": 6200790.0, "reward": -7.514109134674072, "reward_std": 14.065479278564453, "rewards/rollout_reward_func/mean": -7.514109134674072, "rewards/rollout_reward_func/std": 14.065478324890137, "sampling/importance_sampling_ratio/max": 1.6169648170471191, "sampling/importance_sampling_ratio/mean": 0.6938307881355286, "sampling/importance_sampling_ratio/min": 8.519414222973865e-07, "sampling/sampling_logp_difference/max": 1.9244074821472168, "sampling/sampling_logp_difference/mean": 0.31341320276260376, "step": 229, "step_time": 17.813070038999285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3739171475172043, "epoch": 0.0023, "grad_norm": 0.15173906087875366, "kl": 0.3639489859342575, "learning_rate": 7.999996517024637e-06, "loss": -0.0943, "step": 230, "step_time": 3.840036597001017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 4.304347991943359, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6587721109390259, "epoch": 0.00231, "frac_reward_zero_std": 0.0, "grad_norm": 0.12791620194911957, "kl": 0.5861377269029617, "learning_rate": 7.999996481025137e-06, "loss": -0.0681, "num_tokens": 6255915.0, "reward": -12.083134651184082, "reward_std": 12.786955833435059, "rewards/rollout_reward_func/mean": -12.083134651184082, "rewards/rollout_reward_func/std": 12.786955833435059, "sampling/importance_sampling_ratio/max": 1.6969996690750122, "sampling/importance_sampling_ratio/mean": 0.5716183185577393, "sampling/importance_sampling_ratio/min": 2.3432355646946235e-06, "sampling/sampling_logp_difference/max": 1.9034018516540527, "sampling/sampling_logp_difference/mean": 0.38539159297943115, "step": 231, "step_time": 17.905008208000254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6649243235588074, "epoch": 0.00232, "grad_norm": 0.1281098872423172, "kl": 0.5739628374576569, "learning_rate": 7.99999644484055e-06, "loss": -0.0683, "step": 232, "step_time": 4.793027405999055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 3.909090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.233609527349472, "epoch": 0.00233, "frac_reward_zero_std": 0.0, "grad_norm": 0.14243201911449432, "kl": 0.5135194566100836, "learning_rate": 7.999996408470877e-06, "loss": -0.0465, "num_tokens": 6310604.0, "reward": -11.00844669342041, "reward_std": 13.627372741699219, "rewards/rollout_reward_func/mean": -11.00844669342041, "rewards/rollout_reward_func/std": 13.627372741699219, "sampling/importance_sampling_ratio/max": 1.6929078102111816, "sampling/importance_sampling_ratio/mean": 0.5676394701004028, "sampling/importance_sampling_ratio/min": 6.6187788583249585e-09, "sampling/sampling_logp_difference/max": 1.970396876335144, "sampling/sampling_logp_difference/mean": 0.4330841302871704, "step": 233, "step_time": 17.918012595999244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.243352174758911, "epoch": 0.00234, "grad_norm": 0.13851311802864075, "kl": 0.4864910710602999, "learning_rate": 7.999996371916116e-06, "loss": -0.0467, "step": 234, "step_time": 4.405788648998168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.892857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.429550051689148, "epoch": 0.00235, "frac_reward_zero_std": 0.0, "grad_norm": 0.2078956663608551, "kl": 0.8491866588592529, "learning_rate": 7.999996335176269e-06, "loss": -0.0278, "num_tokens": 6365579.0, "reward": -7.6705522537231445, "reward_std": 11.520444869995117, "rewards/rollout_reward_func/mean": -7.6705522537231445, "rewards/rollout_reward_func/std": 11.520444869995117, "sampling/importance_sampling_ratio/max": 1.656417727470398, "sampling/importance_sampling_ratio/mean": 0.5876867771148682, "sampling/importance_sampling_ratio/min": 2.430073800496757e-05, "sampling/sampling_logp_difference/max": 1.7337908744812012, "sampling/sampling_logp_difference/mean": 0.33624759316444397, "step": 235, "step_time": 18.842665078999744 }, { "clip_ratio/high_max": 0.043750000186264515, "clip_ratio/high_mean": 0.010937500046566129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 1.453457772731781, "epoch": 0.00236, "grad_norm": 0.18413427472114563, "kl": 0.6975578963756561, "learning_rate": 7.999996298251333e-06, "loss": -0.0297, "step": 236, "step_time": 3.9801948300009826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0449446737766266, "epoch": 0.00237, "frac_reward_zero_std": 0.0, "grad_norm": 0.1429932862520218, "kl": 0.8202063292264938, "learning_rate": 7.99999626114131e-06, "loss": -0.0476, "num_tokens": 6422203.0, "reward": -7.126887321472168, "reward_std": 14.998952865600586, "rewards/rollout_reward_func/mean": -7.126887321472168, "rewards/rollout_reward_func/std": 14.998953819274902, "sampling/importance_sampling_ratio/max": 1.4417450428009033, "sampling/importance_sampling_ratio/mean": 0.7986769080162048, "sampling/importance_sampling_ratio/min": 0.0001517381751909852, "sampling/sampling_logp_difference/max": 1.7204512357711792, "sampling/sampling_logp_difference/mean": 0.24128150939941406, "step": 237, "step_time": 18.76869297200028 }, { "clip_ratio/high_max": 0.07478632591664791, "clip_ratio/high_mean": 0.018696581479161978, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018696581479161978, "entropy": 1.0766856670379639, "epoch": 0.00238, "grad_norm": 0.12955817580223083, "kl": 0.7030998021364212, "learning_rate": 7.9999962238462e-06, "loss": -0.0482, "step": 238, "step_time": 3.9811685009999564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.9166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9342327415943146, "epoch": 0.00239, "frac_reward_zero_std": 0.0, "grad_norm": 0.2111242264509201, "kl": 0.2552301101386547, "learning_rate": 7.999996186366002e-06, "loss": -0.091, "num_tokens": 6476532.0, "reward": -8.031959533691406, "reward_std": 14.39788818359375, "rewards/rollout_reward_func/mean": -8.031959533691406, "rewards/rollout_reward_func/std": 14.397887229919434, "sampling/importance_sampling_ratio/max": 1.6799466609954834, "sampling/importance_sampling_ratio/mean": 0.8035215139389038, "sampling/importance_sampling_ratio/min": 5.955944288871251e-07, "sampling/sampling_logp_difference/max": 2.4864282608032227, "sampling/sampling_logp_difference/mean": 0.3457469046115875, "step": 239, "step_time": 18.733473442001014 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.9549396932125092, "epoch": 0.0024, "grad_norm": 0.19540776312351227, "kl": 0.2132934033870697, "learning_rate": 7.999996148700719e-06, "loss": -0.0912, "step": 240, "step_time": 3.9527178609996554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 4.045454502105713, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.140638291835785, "epoch": 0.00241, "frac_reward_zero_std": 0.0, "grad_norm": 0.10748062282800674, "kl": 0.22763187624514103, "learning_rate": 7.999996110850347e-06, "loss": -0.0917, "num_tokens": 6531582.0, "reward": -14.768410682678223, "reward_std": 12.166642189025879, "rewards/rollout_reward_func/mean": -14.768410682678223, "rewards/rollout_reward_func/std": 12.166642189025879, "sampling/importance_sampling_ratio/max": 1.8700083494186401, "sampling/importance_sampling_ratio/mean": 0.6611031889915466, "sampling/importance_sampling_ratio/min": 4.051985342812259e-06, "sampling/sampling_logp_difference/max": 1.7738981246948242, "sampling/sampling_logp_difference/mean": 0.36780017614364624, "step": 241, "step_time": 16.468710703001307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1516302824020386, "epoch": 0.00242, "grad_norm": 0.10820019990205765, "kl": 0.22418345510959625, "learning_rate": 7.999996072814888e-06, "loss": -0.0915, "step": 242, "step_time": 4.75956854099968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 3.95652174949646, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8204703629016876, "epoch": 0.00243, "frac_reward_zero_std": 0.0, "grad_norm": 0.10962741822004318, "kl": 0.312913678586483, "learning_rate": 7.999996034594342e-06, "loss": -0.0568, "num_tokens": 6586732.0, "reward": -7.634429931640625, "reward_std": 17.2109432220459, "rewards/rollout_reward_func/mean": -7.634429931640625, "rewards/rollout_reward_func/std": 17.210941314697266, "sampling/importance_sampling_ratio/max": 1.5878872871398926, "sampling/importance_sampling_ratio/mean": 0.693681001663208, "sampling/importance_sampling_ratio/min": 1.793339606592781e-06, "sampling/sampling_logp_difference/max": 2.11830997467041, "sampling/sampling_logp_difference/mean": 0.3313266336917877, "step": 243, "step_time": 17.242770699000175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8275801241397858, "epoch": 0.00244, "grad_norm": 0.11459728330373764, "kl": 0.3039754405617714, "learning_rate": 7.99999599618871e-06, "loss": -0.057, "step": 244, "step_time": 4.771227783999166 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 5.099999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3528267443180084, "epoch": 0.00245, "frac_reward_zero_std": 0.0, "grad_norm": 0.11151722818613052, "kl": 0.17850125767290592, "learning_rate": 7.999995957597988e-06, "loss": -0.0981, "num_tokens": 6640511.0, "reward": -12.895721435546875, "reward_std": 15.25489330291748, "rewards/rollout_reward_func/mean": -12.895721435546875, "rewards/rollout_reward_func/std": 15.25489330291748, "sampling/importance_sampling_ratio/max": 1.4566997289657593, "sampling/importance_sampling_ratio/mean": 0.5470654964447021, "sampling/importance_sampling_ratio/min": 1.606198338777176e-06, "sampling/sampling_logp_difference/max": 2.3842310905456543, "sampling/sampling_logp_difference/mean": 0.3899468779563904, "step": 245, "step_time": 17.039230008999766 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 2.3487825989723206, "epoch": 0.00246, "grad_norm": 0.1101762056350708, "kl": 0.1729639507830143, "learning_rate": 7.99999591882218e-06, "loss": -0.0984, "step": 246, "step_time": 3.782024069999352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.15625, "completions/mean_terminated_length": 5.5714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2728529274463654, "epoch": 0.00247, "frac_reward_zero_std": 0.0, "grad_norm": 0.1775488257408142, "kl": 0.3432020992040634, "learning_rate": 7.999995879861286e-06, "loss": -0.0573, "num_tokens": 6693213.0, "reward": -13.802350997924805, "reward_std": 12.804109573364258, "rewards/rollout_reward_func/mean": -13.802350997924805, "rewards/rollout_reward_func/std": 12.804108619689941, "sampling/importance_sampling_ratio/max": 1.4575722217559814, "sampling/importance_sampling_ratio/mean": 0.5122997164726257, "sampling/importance_sampling_ratio/min": 1.4154550171951996e-06, "sampling/sampling_logp_difference/max": 2.0181727409362793, "sampling/sampling_logp_difference/mean": 0.38230982422828674, "step": 247, "step_time": 17.242799384000136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2733653485774994, "epoch": 0.00248, "grad_norm": 0.20373190939426422, "kl": 0.33919014781713486, "learning_rate": 7.999995840715304e-06, "loss": -0.0584, "step": 248, "step_time": 3.96206845100005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 4.650000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5371057391166687, "epoch": 0.00249, "frac_reward_zero_std": 0.0, "grad_norm": 0.20920880138874054, "kl": 0.7365269064903259, "learning_rate": 7.999995801384234e-06, "loss": -0.0915, "num_tokens": 6747275.0, "reward": -13.13370418548584, "reward_std": 14.801115989685059, "rewards/rollout_reward_func/mean": -13.13370418548584, "rewards/rollout_reward_func/std": 14.801115036010742, "sampling/importance_sampling_ratio/max": 1.6776845455169678, "sampling/importance_sampling_ratio/mean": 0.5383882522583008, "sampling/importance_sampling_ratio/min": 1.6394942070618868e-09, "sampling/sampling_logp_difference/max": 2.544861078262329, "sampling/sampling_logp_difference/mean": 0.43172574043273926, "step": 249, "step_time": 16.440954849999798 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 2.5391393899917603, "epoch": 0.0025, "grad_norm": 0.20684532821178436, "kl": 0.6392314210534096, "learning_rate": 7.999995761868076e-06, "loss": -0.0924, "step": 250, "step_time": 3.938205837001078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.363636493682861, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.271556183695793, "epoch": 0.00251, "frac_reward_zero_std": 0.0, "grad_norm": 0.12634317576885223, "kl": 0.3540884405374527, "learning_rate": 7.999995722166832e-06, "loss": -0.0918, "num_tokens": 6802037.0, "reward": -10.644975662231445, "reward_std": 14.398420333862305, "rewards/rollout_reward_func/mean": -10.644975662231445, "rewards/rollout_reward_func/std": 14.398420333862305, "sampling/importance_sampling_ratio/max": 1.795727014541626, "sampling/importance_sampling_ratio/mean": 0.5779068470001221, "sampling/importance_sampling_ratio/min": 3.258958614082985e-08, "sampling/sampling_logp_difference/max": 2.4475274085998535, "sampling/sampling_logp_difference/mean": 0.42518123984336853, "step": 251, "step_time": 17.309207413000877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.276347368955612, "epoch": 0.00252, "grad_norm": 0.12613889575004578, "kl": 0.3797076754271984, "learning_rate": 7.999995682280502e-06, "loss": -0.0913, "step": 252, "step_time": 4.457778432998566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.458333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8452085852622986, "epoch": 0.00253, "frac_reward_zero_std": 0.0, "grad_norm": 0.17124012112617493, "kl": 0.3692292869091034, "learning_rate": 7.999995642209084e-06, "loss": -0.0845, "num_tokens": 6857658.0, "reward": -7.6295061111450195, "reward_std": 12.747119903564453, "rewards/rollout_reward_func/mean": -7.6295061111450195, "rewards/rollout_reward_func/std": 12.747119903564453, "sampling/importance_sampling_ratio/max": 1.8098281621932983, "sampling/importance_sampling_ratio/mean": 0.6899364590644836, "sampling/importance_sampling_ratio/min": 1.8885872865581632e-09, "sampling/sampling_logp_difference/max": 2.4892146587371826, "sampling/sampling_logp_difference/mean": 0.3498286306858063, "step": 253, "step_time": 17.532108480000716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8395482897758484, "epoch": 0.00254, "grad_norm": 0.16883191466331482, "kl": 0.37155675888061523, "learning_rate": 7.999995601952578e-06, "loss": -0.085, "step": 254, "step_time": 4.727917969998998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 4.6315789222717285, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.496707260608673, "epoch": 0.00255, "frac_reward_zero_std": 0.0, "grad_norm": 0.13593284785747528, "kl": 0.15688609145581722, "learning_rate": 7.999995561510986e-06, "loss": -0.1057, "num_tokens": 6910603.0, "reward": -15.771849632263184, "reward_std": 14.35876750946045, "rewards/rollout_reward_func/mean": -15.771849632263184, "rewards/rollout_reward_func/std": 14.358766555786133, "sampling/importance_sampling_ratio/max": 1.7089167833328247, "sampling/importance_sampling_ratio/mean": 0.57563316822052, "sampling/importance_sampling_ratio/min": 1.598690886339682e-07, "sampling/sampling_logp_difference/max": 2.112820625305176, "sampling/sampling_logp_difference/mean": 0.4000401496887207, "step": 255, "step_time": 16.580350751999504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4882882237434387, "epoch": 0.00256, "grad_norm": 0.13002127408981323, "kl": 0.15966368839144707, "learning_rate": 7.999995520884306e-06, "loss": -0.1063, "step": 256, "step_time": 3.863376438999694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 3.954545497894287, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8822687864303589, "epoch": 0.00257, "frac_reward_zero_std": 0.0, "grad_norm": 0.17419789731502533, "kl": 0.30010559037327766, "learning_rate": 7.99999548007254e-06, "loss": -0.096, "num_tokens": 6971183.0, "reward": -13.805538177490234, "reward_std": 17.633119583129883, "rewards/rollout_reward_func/mean": -13.805538177490234, "rewards/rollout_reward_func/std": 17.633119583129883, "sampling/importance_sampling_ratio/max": 1.725441336631775, "sampling/importance_sampling_ratio/mean": 0.7203733325004578, "sampling/importance_sampling_ratio/min": 1.6379705414237833e-07, "sampling/sampling_logp_difference/max": 1.81145441532135, "sampling/sampling_logp_difference/mean": 0.38121888041496277, "step": 257, "step_time": 19.15125231100137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8630396723747253, "epoch": 0.00258, "grad_norm": 0.1691397875547409, "kl": 0.31672852113842964, "learning_rate": 7.999995439075685e-06, "loss": -0.0969, "step": 258, "step_time": 4.359693027999128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.956521987915039, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7654127776622772, "epoch": 0.00259, "frac_reward_zero_std": 0.0, "grad_norm": 0.15566959977149963, "kl": 0.5097969397902489, "learning_rate": 7.999995397893743e-06, "loss": -0.0478, "num_tokens": 7033194.0, "reward": -13.280502319335938, "reward_std": 17.40966796875, "rewards/rollout_reward_func/mean": -13.280502319335938, "rewards/rollout_reward_func/std": 17.40966796875, "sampling/importance_sampling_ratio/max": 1.6417107582092285, "sampling/importance_sampling_ratio/mean": 0.5553216934204102, "sampling/importance_sampling_ratio/min": 1.6735540953050076e-08, "sampling/sampling_logp_difference/max": 2.050567150115967, "sampling/sampling_logp_difference/mean": 0.3290998935699463, "step": 259, "step_time": 20.085864500999378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.7531764507293701, "epoch": 0.0026, "grad_norm": 0.15965068340301514, "kl": 0.5093955062329769, "learning_rate": 7.999995356526716e-06, "loss": -0.048, "step": 260, "step_time": 4.424955650000811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3516737222671509, "epoch": 0.00261, "frac_reward_zero_std": 0.0, "grad_norm": 0.23201431334018707, "kl": 0.7666451632976532, "learning_rate": 7.9999953149746e-06, "loss": -0.0687, "num_tokens": 7095506.0, "reward": -10.622795104980469, "reward_std": 12.983816146850586, "rewards/rollout_reward_func/mean": -10.622795104980469, "rewards/rollout_reward_func/std": 12.98381519317627, "sampling/importance_sampling_ratio/max": 1.8524198532104492, "sampling/importance_sampling_ratio/mean": 0.7690733671188354, "sampling/importance_sampling_ratio/min": 0.0005940155242569745, "sampling/sampling_logp_difference/max": 1.723008155822754, "sampling/sampling_logp_difference/mean": 0.28241196274757385, "step": 261, "step_time": 21.723063561000345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3486091196537018, "epoch": 0.00262, "grad_norm": 0.25002339482307434, "kl": 0.7338548824191093, "learning_rate": 7.999995273237395e-06, "loss": -0.0691, "step": 262, "step_time": 4.43495473800067 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 5.818181991577148, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1859094202518463, "epoch": 0.00263, "frac_reward_zero_std": 0.0, "grad_norm": 0.08486605435609818, "kl": 0.5048080086708069, "learning_rate": 7.999995231315105e-06, "loss": -0.0765, "num_tokens": 7156738.0, "reward": -12.45107650756836, "reward_std": 17.349037170410156, "rewards/rollout_reward_func/mean": -12.45107650756836, "rewards/rollout_reward_func/std": 17.349037170410156, "sampling/importance_sampling_ratio/max": 1.4535378217697144, "sampling/importance_sampling_ratio/mean": 0.48370131850242615, "sampling/importance_sampling_ratio/min": 1.6268334945834795e-07, "sampling/sampling_logp_difference/max": 1.8574566841125488, "sampling/sampling_logp_difference/mean": 0.39963001012802124, "step": 263, "step_time": 20.122229978000178 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.1934211552143097, "epoch": 0.00264, "grad_norm": 0.12146904319524765, "kl": 0.4627549573779106, "learning_rate": 7.999995189207729e-06, "loss": -0.0764, "step": 264, "step_time": 4.877285838999342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5167631059885025, "epoch": 0.00265, "frac_reward_zero_std": 0.0, "grad_norm": 0.12021442502737045, "kl": 0.6153994686901569, "learning_rate": 7.999995146915264e-06, "loss": -0.0989, "num_tokens": 7220419.0, "reward": -9.679851531982422, "reward_std": 16.83230972290039, "rewards/rollout_reward_func/mean": -9.679851531982422, "rewards/rollout_reward_func/std": 16.83230972290039, "sampling/importance_sampling_ratio/max": 1.8400452136993408, "sampling/importance_sampling_ratio/mean": 0.7719322443008423, "sampling/importance_sampling_ratio/min": 4.64403819933068e-06, "sampling/sampling_logp_difference/max": 2.5815987586975098, "sampling/sampling_logp_difference/mean": 0.3394455313682556, "step": 265, "step_time": 20.870658488000117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5193945467472076, "epoch": 0.00266, "grad_norm": 0.12084375321865082, "kl": 0.5979751907289028, "learning_rate": 7.999995104437712e-06, "loss": -0.099, "step": 266, "step_time": 4.422717875001581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.115384578704834, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5240772664546967, "epoch": 0.00267, "frac_reward_zero_std": 0.0, "grad_norm": 0.09888014942407608, "kl": 0.6499885991215706, "learning_rate": 7.999995061775074e-06, "loss": -0.0594, "num_tokens": 7282754.0, "reward": -12.27413558959961, "reward_std": 16.348922729492188, "rewards/rollout_reward_func/mean": -12.27413558959961, "rewards/rollout_reward_func/std": 16.348922729492188, "sampling/importance_sampling_ratio/max": 1.4538651704788208, "sampling/importance_sampling_ratio/mean": 0.6872031092643738, "sampling/importance_sampling_ratio/min": 9.809443145059049e-05, "sampling/sampling_logp_difference/max": 1.7195124626159668, "sampling/sampling_logp_difference/mean": 0.289919376373291, "step": 267, "step_time": 21.37912380599937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5256189927458763, "epoch": 0.00268, "grad_norm": 0.0977659597992897, "kl": 0.6200659945607185, "learning_rate": 7.999995018927347e-06, "loss": -0.0596, "step": 268, "step_time": 4.403800633998799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.539846509695053, "epoch": 0.00269, "frac_reward_zero_std": 0.0, "grad_norm": 0.1732328087091446, "kl": 0.49252986535429955, "learning_rate": 7.999994975894534e-06, "loss": -0.0716, "num_tokens": 7345174.0, "reward": -13.39406967163086, "reward_std": 12.967248916625977, "rewards/rollout_reward_func/mean": -13.39406967163086, "rewards/rollout_reward_func/std": 12.967249870300293, "sampling/importance_sampling_ratio/max": 2.2599642276763916, "sampling/importance_sampling_ratio/mean": 0.7971680164337158, "sampling/importance_sampling_ratio/min": 0.00048333575250580907, "sampling/sampling_logp_difference/max": 1.8083865642547607, "sampling/sampling_logp_difference/mean": 0.2837492823600769, "step": 269, "step_time": 20.797480311999607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.541822075843811, "epoch": 0.0027, "grad_norm": 0.18535968661308289, "kl": 0.4636204428970814, "learning_rate": 7.999994932676635e-06, "loss": -0.0718, "step": 270, "step_time": 4.396799055998599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.03125, "completions/mean_terminated_length": 5.38095235824585, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.933655172586441, "epoch": 0.00271, "frac_reward_zero_std": 0.0, "grad_norm": 0.16307781636714935, "kl": 0.6863114535808563, "learning_rate": 7.999994889273647e-06, "loss": -0.0692, "num_tokens": 7408233.0, "reward": -15.24937629699707, "reward_std": 19.114633560180664, "rewards/rollout_reward_func/mean": -15.24937629699707, "rewards/rollout_reward_func/std": 19.114633560180664, "sampling/importance_sampling_ratio/max": 1.4652762413024902, "sampling/importance_sampling_ratio/mean": 0.468178927898407, "sampling/importance_sampling_ratio/min": 0.00015646709653083235, "sampling/sampling_logp_difference/max": 2.020108222961426, "sampling/sampling_logp_difference/mean": 0.3453989028930664, "step": 271, "step_time": 20.174783993001256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9438427984714508, "epoch": 0.00272, "grad_norm": 0.16116470098495483, "kl": 0.6448568180203438, "learning_rate": 7.999994845685572e-06, "loss": -0.0693, "step": 272, "step_time": 4.378425175001212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.440000057220459, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6132279336452484, "epoch": 0.00273, "frac_reward_zero_std": 0.0, "grad_norm": 0.15369410812854767, "kl": 0.39524298161268234, "learning_rate": 7.99999480191241e-06, "loss": -0.0563, "num_tokens": 7471953.0, "reward": -9.469846725463867, "reward_std": 14.98778247833252, "rewards/rollout_reward_func/mean": -9.469846725463867, "rewards/rollout_reward_func/std": 14.987781524658203, "sampling/importance_sampling_ratio/max": 1.6361068487167358, "sampling/importance_sampling_ratio/mean": 0.7488998174667358, "sampling/importance_sampling_ratio/min": 3.268207365181297e-05, "sampling/sampling_logp_difference/max": 1.7056207656860352, "sampling/sampling_logp_difference/mean": 0.320708304643631, "step": 273, "step_time": 21.6177913370002 }, { "clip_ratio/high_max": 0.05902777798473835, "clip_ratio/high_mean": 0.014756944496184587, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "entropy": 1.6261461079120636, "epoch": 0.00274, "grad_norm": 0.1359369456768036, "kl": 0.3733623996376991, "learning_rate": 7.99999475795416e-06, "loss": -0.0568, "step": 274, "step_time": 4.874337018000006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 5.454545497894287, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.100952923297882, "epoch": 0.00275, "frac_reward_zero_std": 0.0, "grad_norm": 0.1247059553861618, "kl": 0.22105363011360168, "learning_rate": 7.999994713810826e-06, "loss": -0.0969, "num_tokens": 7532322.0, "reward": -17.891807556152344, "reward_std": 19.512033462524414, "rewards/rollout_reward_func/mean": -17.891807556152344, "rewards/rollout_reward_func/std": 19.512033462524414, "sampling/importance_sampling_ratio/max": 1.611554741859436, "sampling/importance_sampling_ratio/mean": 0.5693097114562988, "sampling/importance_sampling_ratio/min": 3.412108708289452e-05, "sampling/sampling_logp_difference/max": 1.6634929180145264, "sampling/sampling_logp_difference/mean": 0.3405132293701172, "step": 275, "step_time": 19.933711675997984 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.10884428024292, "epoch": 0.00276, "grad_norm": 0.1310911476612091, "kl": 0.20540013909339905, "learning_rate": 7.999994669482402e-06, "loss": -0.0968, "step": 276, "step_time": 4.397033187999114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.0416669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6935067474842072, "epoch": 0.00277, "frac_reward_zero_std": 0.0, "grad_norm": 0.19423824548721313, "kl": 0.4054827019572258, "learning_rate": 7.999994624968891e-06, "loss": -0.1055, "num_tokens": 7595494.0, "reward": -9.705195426940918, "reward_std": 15.566546440124512, "rewards/rollout_reward_func/mean": -9.705195426940918, "rewards/rollout_reward_func/std": 15.566546440124512, "sampling/importance_sampling_ratio/max": 2.8489632606506348, "sampling/importance_sampling_ratio/mean": 0.7836048603057861, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.0634913444519043, "sampling/sampling_logp_difference/mean": 0.42782697081565857, "step": 277, "step_time": 21.301338262000172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6959945857524872, "epoch": 0.00278, "grad_norm": 0.22046692669391632, "kl": 0.3896676227450371, "learning_rate": 7.999994580270293e-06, "loss": -0.1072, "step": 278, "step_time": 4.388507834999473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.71875, "completions/mean_terminated_length": 4.904761791229248, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.249329775571823, "epoch": 0.00279, "frac_reward_zero_std": 0.0, "grad_norm": 0.16570739448070526, "kl": 0.2271353341639042, "learning_rate": 7.999994535386609e-06, "loss": -0.1045, "num_tokens": 7654994.0, "reward": -18.90340232849121, "reward_std": 18.016429901123047, "rewards/rollout_reward_func/mean": -18.90340232849121, "rewards/rollout_reward_func/std": 18.01643180847168, "sampling/importance_sampling_ratio/max": 1.6791698932647705, "sampling/importance_sampling_ratio/mean": 0.6704408526420593, "sampling/importance_sampling_ratio/min": 4.0071827811516414e-07, "sampling/sampling_logp_difference/max": 2.634486198425293, "sampling/sampling_logp_difference/mean": 0.3765864670276642, "step": 279, "step_time": 19.956579213999248 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.2498095333576202, "epoch": 0.0028, "grad_norm": 0.15883800387382507, "kl": 0.2266552560031414, "learning_rate": 7.999994490317837e-06, "loss": -0.1045, "step": 280, "step_time": 4.3842818070006615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 5.099999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9734165966510773, "epoch": 0.00281, "frac_reward_zero_std": 0.0, "grad_norm": 0.14942969381809235, "kl": 0.32882358878850937, "learning_rate": 7.999994445063977e-06, "loss": -0.064, "num_tokens": 7714253.0, "reward": -15.453794479370117, "reward_std": 20.136919021606445, "rewards/rollout_reward_func/mean": -15.453794479370117, "rewards/rollout_reward_func/std": 20.136917114257812, "sampling/importance_sampling_ratio/max": 1.357968807220459, "sampling/importance_sampling_ratio/mean": 0.39336246252059937, "sampling/importance_sampling_ratio/min": 9.344332738692174e-07, "sampling/sampling_logp_difference/max": 2.1573987007141113, "sampling/sampling_logp_difference/mean": 0.3165104389190674, "step": 281, "step_time": 20.762598386997524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9717764556407928, "epoch": 0.00282, "grad_norm": 0.14680331945419312, "kl": 0.3377998359501362, "learning_rate": 7.999994399625032e-06, "loss": -0.064, "step": 282, "step_time": 4.425596372000655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.03125, "completions/mean_terminated_length": 5.388888835906982, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.434348165988922, "epoch": 0.00283, "frac_reward_zero_std": 0.0, "grad_norm": 0.1379784643650055, "kl": 0.22688093781471252, "learning_rate": 7.999994354001e-06, "loss": -0.0759, "num_tokens": 7776335.0, "reward": -11.598833084106445, "reward_std": 20.224546432495117, "rewards/rollout_reward_func/mean": -11.598833084106445, "rewards/rollout_reward_func/std": 20.224546432495117, "sampling/importance_sampling_ratio/max": 2.247178554534912, "sampling/importance_sampling_ratio/mean": 0.45571234822273254, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8011970520019531, "sampling/sampling_logp_difference/mean": 0.3653387427330017, "step": 283, "step_time": 19.420591616999445 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0061011905781924725, "entropy": 2.4331246614456177, "epoch": 0.00284, "grad_norm": 0.16472913324832916, "kl": 0.2289178930222988, "learning_rate": 7.999994308191879e-06, "loss": -0.0771, "step": 284, "step_time": 5.094573700003821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.34375, "completions/mean_terminated_length": 5.857142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7475330233573914, "epoch": 0.00285, "frac_reward_zero_std": 0.0, "grad_norm": 0.127237930893898, "kl": 0.1721923891454935, "learning_rate": 7.999994262197671e-06, "loss": -0.0794, "num_tokens": 7835955.0, "reward": -14.205961227416992, "reward_std": 19.279361724853516, "rewards/rollout_reward_func/mean": -14.205961227416992, "rewards/rollout_reward_func/std": 19.279361724853516, "sampling/importance_sampling_ratio/max": 1.7615692615509033, "sampling/importance_sampling_ratio/mean": 0.5125935077667236, "sampling/importance_sampling_ratio/min": 8.144409946453379e-08, "sampling/sampling_logp_difference/max": 2.206878185272217, "sampling/sampling_logp_difference/mean": 0.43725600838661194, "step": 285, "step_time": 20.570381534000262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7453673481941223, "epoch": 0.00286, "grad_norm": 0.12727075815200806, "kl": 0.17305655032396317, "learning_rate": 7.999994216018377e-06, "loss": -0.0797, "step": 286, "step_time": 4.315165444999366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.03125, "completions/mean_terminated_length": 4.764706134796143, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.129144474864006, "epoch": 0.00287, "frac_reward_zero_std": 0.0, "grad_norm": 0.1992952972650528, "kl": 0.27951181679964066, "learning_rate": 7.999994169653994e-06, "loss": -0.0933, "num_tokens": 7894472.0, "reward": -20.50949478149414, "reward_std": 15.249759674072266, "rewards/rollout_reward_func/mean": -20.50949478149414, "rewards/rollout_reward_func/std": 15.249759674072266, "sampling/importance_sampling_ratio/max": 2.7305748462677, "sampling/importance_sampling_ratio/mean": 0.4726889729499817, "sampling/importance_sampling_ratio/min": 9.641037195251556e-07, "sampling/sampling_logp_difference/max": 2.0861310958862305, "sampling/sampling_logp_difference/mean": 0.3367248475551605, "step": 287, "step_time": 19.08827674999884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1227496415376663, "epoch": 0.00288, "grad_norm": 0.1998666375875473, "kl": 0.28245312720537186, "learning_rate": 7.999994123104526e-06, "loss": -0.0937, "step": 288, "step_time": 4.367479153999739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 4.950000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.354187697172165, "epoch": 0.00289, "frac_reward_zero_std": 0.0, "grad_norm": 0.1813945472240448, "kl": 0.38566706515848637, "learning_rate": 7.999994076369971e-06, "loss": -0.089, "num_tokens": 7956979.0, "reward": -17.599864959716797, "reward_std": 16.257308959960938, "rewards/rollout_reward_func/mean": -17.599864959716797, "rewards/rollout_reward_func/std": 16.25731086730957, "sampling/importance_sampling_ratio/max": 1.456375241279602, "sampling/importance_sampling_ratio/mean": 0.4739567041397095, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9823466539382935, "sampling/sampling_logp_difference/mean": 0.35087984800338745, "step": 289, "step_time": 20.31729425300182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3526152968406677, "epoch": 0.0029, "grad_norm": 0.18833577632904053, "kl": 0.36690139025449753, "learning_rate": 7.999994029450328e-06, "loss": -0.0895, "step": 290, "step_time": 4.345800559000054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.626754581928253, "epoch": 0.00291, "frac_reward_zero_std": 0.0, "grad_norm": 0.16386912763118744, "kl": 0.25906356796622276, "learning_rate": 7.999993982345597e-06, "loss": -0.1117, "num_tokens": 8015288.0, "reward": -22.44664192199707, "reward_std": 19.429838180541992, "rewards/rollout_reward_func/mean": -22.44664192199707, "rewards/rollout_reward_func/std": 19.42983627319336, "sampling/importance_sampling_ratio/max": 1.5636088848114014, "sampling/importance_sampling_ratio/mean": 0.4456107020378113, "sampling/importance_sampling_ratio/min": 3.0394932082344894e-07, "sampling/sampling_logp_difference/max": 1.7227634191513062, "sampling/sampling_logp_difference/mean": 0.41813355684280396, "step": 291, "step_time": 18.503948296998715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6253994703292847, "epoch": 0.00292, "grad_norm": 0.17873428761959076, "kl": 0.2508209999650717, "learning_rate": 7.99999393505578e-06, "loss": -0.1124, "step": 292, "step_time": 4.453078905997245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 9.71875, "completions/mean_terminated_length": 4.176470756530762, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5500904321670532, "epoch": 0.00293, "frac_reward_zero_std": 0.0, "grad_norm": 0.1781534105539322, "kl": 0.23778606951236725, "learning_rate": 7.999993887580876e-06, "loss": -0.0736, "num_tokens": 8079240.0, "reward": -15.063907623291016, "reward_std": 21.16265106201172, "rewards/rollout_reward_func/mean": -15.063907623291016, "rewards/rollout_reward_func/std": 21.16265296936035, "sampling/importance_sampling_ratio/max": 1.476945400238037, "sampling/importance_sampling_ratio/mean": 0.4927259683609009, "sampling/importance_sampling_ratio/min": 1.7344162017707276e-07, "sampling/sampling_logp_difference/max": 1.7852025032043457, "sampling/sampling_logp_difference/mean": 0.4057835042476654, "step": 293, "step_time": 19.05093170299733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5508653819561005, "epoch": 0.00294, "grad_norm": 0.19004346430301666, "kl": 0.23457182478159666, "learning_rate": 7.999993839920885e-06, "loss": -0.0743, "step": 294, "step_time": 5.4481660920009745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 6.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.107565224170685, "epoch": 0.00295, "frac_reward_zero_std": 0.0, "grad_norm": 0.13380056619644165, "kl": 0.08719042129814625, "learning_rate": 7.999993792075807e-06, "loss": -0.0784, "num_tokens": 8137668.0, "reward": -23.438034057617188, "reward_std": 18.48423957824707, "rewards/rollout_reward_func/mean": -23.438034057617188, "rewards/rollout_reward_func/std": 18.48423957824707, "sampling/importance_sampling_ratio/max": 1.3730794191360474, "sampling/importance_sampling_ratio/mean": 0.3061738610267639, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9863481521606445, "sampling/sampling_logp_difference/mean": 0.43083423376083374, "step": 295, "step_time": 19.846454099000766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.109741449356079, "epoch": 0.00296, "grad_norm": 0.1343320906162262, "kl": 0.08560861460864544, "learning_rate": 7.99999374404564e-06, "loss": -0.0786, "step": 296, "step_time": 4.38275451300251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5546018481254578, "epoch": 0.00297, "frac_reward_zero_std": 0.0, "grad_norm": 0.29443055391311646, "kl": 0.38789743185043335, "learning_rate": 7.999993695830389e-06, "loss": -0.0669, "num_tokens": 8197393.0, "reward": -16.405517578125, "reward_std": 17.197542190551758, "rewards/rollout_reward_func/mean": -16.405517578125, "rewards/rollout_reward_func/std": 17.197542190551758, "sampling/importance_sampling_ratio/max": 2.2955777645111084, "sampling/importance_sampling_ratio/mean": 0.5064162015914917, "sampling/importance_sampling_ratio/min": 7.911162391849302e-09, "sampling/sampling_logp_difference/max": 2.070685386657715, "sampling/sampling_logp_difference/mean": 0.42361876368522644, "step": 297, "step_time": 19.2124455689991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.545079529285431, "epoch": 0.00298, "grad_norm": 0.2850487530231476, "kl": 0.4173169154673815, "learning_rate": 7.999993647430049e-06, "loss": -0.0678, "step": 298, "step_time": 4.356480348000332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 5.176470756530762, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.435251533985138, "epoch": 0.00299, "frac_reward_zero_std": 0.0, "grad_norm": 0.15514251589775085, "kl": 0.14275969378650188, "learning_rate": 7.99999359884462e-06, "loss": -0.0766, "num_tokens": 8253248.0, "reward": -20.4063777923584, "reward_std": 17.842693328857422, "rewards/rollout_reward_func/mean": -20.4063777923584, "rewards/rollout_reward_func/std": 17.842693328857422, "sampling/importance_sampling_ratio/max": 1.287685751914978, "sampling/importance_sampling_ratio/mean": 0.4414984881877899, "sampling/importance_sampling_ratio/min": 2.2686946010708198e-07, "sampling/sampling_logp_difference/max": 2.4957404136657715, "sampling/sampling_logp_difference/mean": 0.39365899562835693, "step": 299, "step_time": 18.706463366999742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4256091713905334, "epoch": 0.003, "grad_norm": 0.15693189203739166, "kl": 0.14520205929875374, "learning_rate": 7.999993550074108e-06, "loss": -0.0775, "step": 300, "step_time": 4.594729988000836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7627195715904236, "epoch": 0.00301, "frac_reward_zero_std": 0.0, "grad_norm": 0.16817542910575867, "kl": 0.34644652158021927, "learning_rate": 7.999993501118506e-06, "loss": -0.0763, "num_tokens": 8317136.0, "reward": -15.019756317138672, "reward_std": 17.023662567138672, "rewards/rollout_reward_func/mean": -15.019756317138672, "rewards/rollout_reward_func/std": 17.02366065979004, "sampling/importance_sampling_ratio/max": 1.5469061136245728, "sampling/importance_sampling_ratio/mean": 0.6220060586929321, "sampling/importance_sampling_ratio/min": 1.0607765943859704e-05, "sampling/sampling_logp_difference/max": 1.837457299232483, "sampling/sampling_logp_difference/mean": 0.3085443079471588, "step": 301, "step_time": 20.859138890000395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7507364451885223, "epoch": 0.00302, "grad_norm": 0.16562798619270325, "kl": 0.3493092767894268, "learning_rate": 7.999993451977818e-06, "loss": -0.0772, "step": 302, "step_time": 4.479475610000009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 4.588235378265381, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6944032311439514, "epoch": 0.00303, "frac_reward_zero_std": 0.0, "grad_norm": 0.17454662919044495, "kl": 0.15058279316872358, "learning_rate": 7.999993402652043e-06, "loss": -0.0727, "num_tokens": 8375362.0, "reward": -19.50526237487793, "reward_std": 18.96709442138672, "rewards/rollout_reward_func/mean": -19.50526237487793, "rewards/rollout_reward_func/std": 18.96709442138672, "sampling/importance_sampling_ratio/max": 1.4518319368362427, "sampling/importance_sampling_ratio/mean": 0.4577232003211975, "sampling/importance_sampling_ratio/min": 2.9329576989312045e-08, "sampling/sampling_logp_difference/max": 2.5214552879333496, "sampling/sampling_logp_difference/mean": 0.4161914885044098, "step": 303, "step_time": 19.045320917002755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6909571290016174, "epoch": 0.00304, "grad_norm": 0.1676047444343567, "kl": 0.1531302761286497, "learning_rate": 7.99999335314118e-06, "loss": -0.0724, "step": 304, "step_time": 4.270312253998782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.84375, "completions/mean_terminated_length": 4.550000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7141701877117157, "epoch": 0.00305, "frac_reward_zero_std": 0.0, "grad_norm": 0.21883468329906464, "kl": 0.2784410510212183, "learning_rate": 7.99999330344523e-06, "loss": -0.1104, "num_tokens": 8437634.0, "reward": -18.515850067138672, "reward_std": 16.021358489990234, "rewards/rollout_reward_func/mean": -18.515850067138672, "rewards/rollout_reward_func/std": 16.021358489990234, "sampling/importance_sampling_ratio/max": 1.7124261856079102, "sampling/importance_sampling_ratio/mean": 0.5615800619125366, "sampling/importance_sampling_ratio/min": 1.1824927526049578e-07, "sampling/sampling_logp_difference/max": 2.1681106090545654, "sampling/sampling_logp_difference/mean": 0.4322679042816162, "step": 305, "step_time": 20.00415210599931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.714418113231659, "epoch": 0.00306, "grad_norm": 0.2074667066335678, "kl": 0.28038154542446136, "learning_rate": 7.999993253564196e-06, "loss": -0.1115, "step": 306, "step_time": 4.416210985000362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.71875, "completions/mean_terminated_length": 4.904761791229248, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8370510041713715, "epoch": 0.00307, "frac_reward_zero_std": 0.0, "grad_norm": 0.16170203685760498, "kl": 0.28916640765964985, "learning_rate": 7.999993203498072e-06, "loss": -0.0833, "num_tokens": 8498047.0, "reward": -18.061891555786133, "reward_std": 17.51820945739746, "rewards/rollout_reward_func/mean": -18.061891555786133, "rewards/rollout_reward_func/std": 17.51820945739746, "sampling/importance_sampling_ratio/max": 1.7164628505706787, "sampling/importance_sampling_ratio/mean": 0.6425861120223999, "sampling/importance_sampling_ratio/min": 3.353370345848816e-07, "sampling/sampling_logp_difference/max": 2.019685983657837, "sampling/sampling_logp_difference/mean": 0.29738616943359375, "step": 307, "step_time": 19.841939379000905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8289556503295898, "epoch": 0.00308, "grad_norm": 0.16262921690940857, "kl": 0.31169747188687325, "learning_rate": 7.999993153246862e-06, "loss": -0.0833, "step": 308, "step_time": 4.408296212001005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.34375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9044212400913239, "epoch": 0.00309, "frac_reward_zero_std": 0.0, "grad_norm": 0.2175716757774353, "kl": 0.6128500215709209, "learning_rate": 7.999993102810564e-06, "loss": -0.0775, "num_tokens": 8562332.0, "reward": -15.01672649383545, "reward_std": 16.590078353881836, "rewards/rollout_reward_func/mean": -15.01672649383545, "rewards/rollout_reward_func/std": 16.590078353881836, "sampling/importance_sampling_ratio/max": 1.613653302192688, "sampling/importance_sampling_ratio/mean": 0.5771558284759521, "sampling/importance_sampling_ratio/min": 9.375665831612423e-05, "sampling/sampling_logp_difference/max": 1.976401448249817, "sampling/sampling_logp_difference/mean": 0.33531099557876587, "step": 309, "step_time": 21.069777802000317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.899445503950119, "epoch": 0.0031, "grad_norm": 0.21685755252838135, "kl": 0.6088426597416401, "learning_rate": 7.99999305218918e-06, "loss": -0.0779, "step": 310, "step_time": 4.942021375998593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 5.159999847412109, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5647799372673035, "epoch": 0.00311, "frac_reward_zero_std": 0.0, "grad_norm": 0.14498397707939148, "kl": 0.5025646314024925, "learning_rate": 7.999993001382707e-06, "loss": -0.0604, "num_tokens": 8626767.0, "reward": -10.395099639892578, "reward_std": 14.864343643188477, "rewards/rollout_reward_func/mean": -10.395099639892578, "rewards/rollout_reward_func/std": 14.864343643188477, "sampling/importance_sampling_ratio/max": 1.464036464691162, "sampling/importance_sampling_ratio/mean": 0.6499402523040771, "sampling/importance_sampling_ratio/min": 5.3084750106791034e-06, "sampling/sampling_logp_difference/max": 1.915421485900879, "sampling/sampling_logp_difference/mean": 0.3050902485847473, "step": 311, "step_time": 22.268787397999404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5588985979557037, "epoch": 0.00312, "grad_norm": 0.14911703765392303, "kl": 0.5107932239770889, "learning_rate": 7.99999295039115e-06, "loss": -0.0614, "step": 312, "step_time": 4.426290038001753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 6.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.23248890042305, "epoch": 0.00313, "frac_reward_zero_std": 0.0, "grad_norm": 0.24595314264297485, "kl": 0.569962028414011, "learning_rate": 7.999992899214505e-06, "loss": -0.0451, "num_tokens": 8689026.0, "reward": -18.594650268554688, "reward_std": 16.420520782470703, "rewards/rollout_reward_func/mean": -18.594650268554688, "rewards/rollout_reward_func/std": 16.420520782470703, "sampling/importance_sampling_ratio/max": 1.3750810623168945, "sampling/importance_sampling_ratio/mean": 0.43849092721939087, "sampling/importance_sampling_ratio/min": 3.1600890793015424e-08, "sampling/sampling_logp_difference/max": 2.2639899253845215, "sampling/sampling_logp_difference/mean": 0.4310978353023529, "step": 313, "step_time": 21.114495892001287 }, { "clip_ratio/high_max": 0.0555555559694767, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "entropy": 2.2396149337291718, "epoch": 0.00314, "grad_norm": 0.23528218269348145, "kl": 0.5335803255438805, "learning_rate": 7.999992847852771e-06, "loss": -0.0462, "step": 314, "step_time": 4.398278281001694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 5.2727274894714355, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1704867780208588, "epoch": 0.00315, "frac_reward_zero_std": 0.0, "grad_norm": 0.12072563916444778, "kl": 0.3340882137417793, "learning_rate": 7.999992796305951e-06, "loss": -0.0843, "num_tokens": 8749690.0, "reward": -16.91543197631836, "reward_std": 16.84542465209961, "rewards/rollout_reward_func/mean": -16.91543197631836, "rewards/rollout_reward_func/std": 16.845422744750977, "sampling/importance_sampling_ratio/max": 1.4414572715759277, "sampling/importance_sampling_ratio/mean": 0.5841809511184692, "sampling/importance_sampling_ratio/min": 3.290441327408189e-06, "sampling/sampling_logp_difference/max": 1.736545443534851, "sampling/sampling_logp_difference/mean": 0.41212835907936096, "step": 315, "step_time": 19.345189183997718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.180101126432419, "epoch": 0.00316, "grad_norm": 0.12163487821817398, "kl": 0.3072635121643543, "learning_rate": 7.999992744574046e-06, "loss": -0.0851, "step": 316, "step_time": 4.3192574529984995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.342890352010727, "epoch": 0.00317, "frac_reward_zero_std": 0.0, "grad_norm": 0.12424305081367493, "kl": 0.4465862400829792, "learning_rate": 7.999992692657052e-06, "loss": -0.0658, "num_tokens": 8812507.0, "reward": -14.589262962341309, "reward_std": 20.733007431030273, "rewards/rollout_reward_func/mean": -14.589262962341309, "rewards/rollout_reward_func/std": 20.733007431030273, "sampling/importance_sampling_ratio/max": 1.4496190547943115, "sampling/importance_sampling_ratio/mean": 0.5400967001914978, "sampling/importance_sampling_ratio/min": 3.150397105855518e-06, "sampling/sampling_logp_difference/max": 3.0580315589904785, "sampling/sampling_logp_difference/mean": 0.4049890637397766, "step": 317, "step_time": 19.81865888900211 }, { "clip_ratio/high_max": 0.02440476231276989, "clip_ratio/high_mean": 0.0061011905781924725, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0061011905781924725, "entropy": 2.3537756204605103, "epoch": 0.00318, "grad_norm": 0.11054342240095139, "kl": 0.385508019477129, "learning_rate": 7.999992640554971e-06, "loss": -0.0665, "step": 318, "step_time": 4.412461428997631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9640852808952332, "epoch": 0.00319, "frac_reward_zero_std": 0.0, "grad_norm": 0.18149054050445557, "kl": 0.1705967951565981, "learning_rate": 7.999992588267803e-06, "loss": -0.0861, "num_tokens": 8875363.0, "reward": -13.550469398498535, "reward_std": 20.953073501586914, "rewards/rollout_reward_func/mean": -13.550469398498535, "rewards/rollout_reward_func/std": 20.953073501586914, "sampling/importance_sampling_ratio/max": 1.5421020984649658, "sampling/importance_sampling_ratio/mean": 0.7350863814353943, "sampling/importance_sampling_ratio/min": 6.128847599029541e-05, "sampling/sampling_logp_difference/max": 1.4400146007537842, "sampling/sampling_logp_difference/mean": 0.29858607053756714, "step": 319, "step_time": 20.94463397799882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9747523218393326, "epoch": 0.0032, "grad_norm": 0.18039517104625702, "kl": 0.16397930681705475, "learning_rate": 7.999992535795547e-06, "loss": -0.0866, "step": 320, "step_time": 4.96585899800084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5682352632284164, "epoch": 0.00321, "frac_reward_zero_std": 0.0, "grad_norm": 0.08300068974494934, "kl": 0.28661707788705826, "learning_rate": 7.999992483138206e-06, "loss": -0.0663, "num_tokens": 8937760.0, "reward": -12.278928756713867, "reward_std": 17.230913162231445, "rewards/rollout_reward_func/mean": -12.278928756713867, "rewards/rollout_reward_func/std": 17.230915069580078, "sampling/importance_sampling_ratio/max": 1.4337971210479736, "sampling/importance_sampling_ratio/mean": 0.6113978624343872, "sampling/importance_sampling_ratio/min": 9.81551711447537e-05, "sampling/sampling_logp_difference/max": 1.5084974765777588, "sampling/sampling_logp_difference/mean": 0.30212223529815674, "step": 321, "step_time": 20.682065078999585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5769501328468323, "epoch": 0.00322, "grad_norm": 0.08449824154376984, "kl": 0.26884496957063675, "learning_rate": 7.999992430295777e-06, "loss": -0.0664, "step": 322, "step_time": 4.4462619650003035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.90625, "completions/mean_terminated_length": 5.166666507720947, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4824748635292053, "epoch": 0.00323, "frac_reward_zero_std": 0.0, "grad_norm": 0.1600836217403412, "kl": 0.13555404916405678, "learning_rate": 7.99999237726826e-06, "loss": -0.0937, "num_tokens": 8996243.0, "reward": -21.08321189880371, "reward_std": 19.297866821289062, "rewards/rollout_reward_func/mean": -21.08321189880371, "rewards/rollout_reward_func/std": 19.297866821289062, "sampling/importance_sampling_ratio/max": 1.5423580408096313, "sampling/importance_sampling_ratio/mean": 0.5572726130485535, "sampling/importance_sampling_ratio/min": 1.389326598655316e-06, "sampling/sampling_logp_difference/max": 1.7848811149597168, "sampling/sampling_logp_difference/mean": 0.3631199598312378, "step": 323, "step_time": 20.462649720999252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4938772916793823, "epoch": 0.00324, "grad_norm": 0.16167224943637848, "kl": 0.13145863451063633, "learning_rate": 7.999992324055659e-06, "loss": -0.0939, "step": 324, "step_time": 4.351032430000487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 4.7727274894714355, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.307255506515503, "epoch": 0.00325, "frac_reward_zero_std": 0.0, "grad_norm": 0.12578928470611572, "kl": 0.2023339346051216, "learning_rate": 7.99999227065797e-06, "loss": -0.068, "num_tokens": 9060658.0, "reward": -14.723221778869629, "reward_std": 13.014717102050781, "rewards/rollout_reward_func/mean": -14.723221778869629, "rewards/rollout_reward_func/std": 13.014717102050781, "sampling/importance_sampling_ratio/max": 1.379841685295105, "sampling/importance_sampling_ratio/mean": 0.560204267501831, "sampling/importance_sampling_ratio/min": 1.4305098772204872e-10, "sampling/sampling_logp_difference/max": 2.3037209510803223, "sampling/sampling_logp_difference/mean": 0.4109978675842285, "step": 325, "step_time": 20.758197012000892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3121678233146667, "epoch": 0.00326, "grad_norm": 0.12469126284122467, "kl": 0.1937619335949421, "learning_rate": 7.999992217075192e-06, "loss": -0.0681, "step": 326, "step_time": 4.428951476002112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 4.559999942779541, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6699815392494202, "epoch": 0.00327, "frac_reward_zero_std": 0.0, "grad_norm": 0.1874384731054306, "kl": 0.27143729105591774, "learning_rate": 7.999992163307328e-06, "loss": -0.0688, "num_tokens": 9123258.0, "reward": -10.257308006286621, "reward_std": 17.367488861083984, "rewards/rollout_reward_func/mean": -10.257308006286621, "rewards/rollout_reward_func/std": 17.367488861083984, "sampling/importance_sampling_ratio/max": 1.5698515176773071, "sampling/importance_sampling_ratio/mean": 0.6791461110115051, "sampling/importance_sampling_ratio/min": 1.8533637557993643e-06, "sampling/sampling_logp_difference/max": 1.7023123502731323, "sampling/sampling_logp_difference/mean": 0.3027149438858032, "step": 327, "step_time": 20.90325719499924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.682560294866562, "epoch": 0.00328, "grad_norm": 0.18677523732185364, "kl": 0.26811081543564796, "learning_rate": 7.999992109354377e-06, "loss": -0.0689, "step": 328, "step_time": 4.446879080001963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.125, "completions/mean_terminated_length": 5.600000381469727, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.9533395171165466, "epoch": 0.00329, "frac_reward_zero_std": 0.0, "grad_norm": 0.09377557784318924, "kl": 0.13541093654930592, "learning_rate": 7.999992055216339e-06, "loss": -0.0649, "num_tokens": 9185734.0, "reward": -19.775957107543945, "reward_std": 14.610590934753418, "rewards/rollout_reward_func/mean": -19.775957107543945, "rewards/rollout_reward_func/std": 14.610589981079102, "sampling/importance_sampling_ratio/max": 1.3526033163070679, "sampling/importance_sampling_ratio/mean": 0.33636143803596497, "sampling/importance_sampling_ratio/min": 4.887223781224748e-07, "sampling/sampling_logp_difference/max": 1.98807954788208, "sampling/sampling_logp_difference/mean": 0.4750409424304962, "step": 329, "step_time": 18.979129236000517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9574416875839233, "epoch": 0.0033, "grad_norm": 0.09261801093816757, "kl": 0.13277234695851803, "learning_rate": 7.999992000893214e-06, "loss": -0.065, "step": 330, "step_time": 4.837502749001942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.68165522813797, "epoch": 0.00331, "frac_reward_zero_std": 0.0, "grad_norm": 0.13964952528476715, "kl": 0.17170805297791958, "learning_rate": 7.999991946385003e-06, "loss": -0.0772, "num_tokens": 9246019.0, "reward": -20.3525333404541, "reward_std": 18.48819923400879, "rewards/rollout_reward_func/mean": -20.3525333404541, "rewards/rollout_reward_func/std": 18.48819923400879, "sampling/importance_sampling_ratio/max": 1.4014955759048462, "sampling/importance_sampling_ratio/mean": 0.378917932510376, "sampling/importance_sampling_ratio/min": 3.424230271775741e-06, "sampling/sampling_logp_difference/max": 1.8333488702774048, "sampling/sampling_logp_difference/mean": 0.40435439348220825, "step": 331, "step_time": 19.11594055200112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6903814673423767, "epoch": 0.00332, "grad_norm": 0.1426868736743927, "kl": 0.16529123485088348, "learning_rate": 7.999991891691704e-06, "loss": -0.0778, "step": 332, "step_time": 4.736373334002565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 10.21875, "completions/mean_terminated_length": 4.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7351603507995605, "epoch": 0.00333, "frac_reward_zero_std": 0.0, "grad_norm": 0.16847318410873413, "kl": 0.1327975420281291, "learning_rate": 7.999991836813319e-06, "loss": -0.1089, "num_tokens": 9309499.0, "reward": -20.057100296020508, "reward_std": 17.3825740814209, "rewards/rollout_reward_func/mean": -20.057100296020508, "rewards/rollout_reward_func/std": 17.3825740814209, "sampling/importance_sampling_ratio/max": 1.584790825843811, "sampling/importance_sampling_ratio/mean": 0.4250970482826233, "sampling/importance_sampling_ratio/min": 6.9354391598608345e-06, "sampling/sampling_logp_difference/max": 2.0595741271972656, "sampling/sampling_logp_difference/mean": 0.424723356962204, "step": 333, "step_time": 19.878083334999246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.736969470977783, "epoch": 0.00334, "grad_norm": 0.1692853420972824, "kl": 0.13036607205867767, "learning_rate": 7.999991781749846e-06, "loss": -0.1097, "step": 334, "step_time": 4.431961944001159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.875, "completions/mean_terminated_length": 5.066667079925537, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.8293248414993286, "epoch": 0.00335, "frac_reward_zero_std": 0.0, "grad_norm": 0.15973564982414246, "kl": 0.1043900279328227, "learning_rate": 7.999991726501287e-06, "loss": -0.0764, "num_tokens": 9370463.0, "reward": -20.712936401367188, "reward_std": 19.794740676879883, "rewards/rollout_reward_func/mean": -20.712936401367188, "rewards/rollout_reward_func/std": 19.79473876953125, "sampling/importance_sampling_ratio/max": 1.2836577892303467, "sampling/importance_sampling_ratio/mean": 0.3867494463920593, "sampling/importance_sampling_ratio/min": 1.577081860659746e-07, "sampling/sampling_logp_difference/max": 2.2846171855926514, "sampling/sampling_logp_difference/mean": 0.44719234108924866, "step": 335, "step_time": 19.073751795000135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.83605033159256, "epoch": 0.00336, "grad_norm": 0.15918906033039093, "kl": 0.10472761001437902, "learning_rate": 7.99999167106764e-06, "loss": -0.0774, "step": 336, "step_time": 4.355698982999456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.59375, "completions/mean_terminated_length": 5.823529243469238, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7484697103500366, "epoch": 0.00337, "frac_reward_zero_std": 0.0, "grad_norm": 0.11224502325057983, "kl": 0.12071361392736435, "learning_rate": 7.999991615448907e-06, "loss": -0.0913, "num_tokens": 9429364.0, "reward": -20.579330444335938, "reward_std": 16.926515579223633, "rewards/rollout_reward_func/mean": -20.579330444335938, "rewards/rollout_reward_func/std": 16.926515579223633, "sampling/importance_sampling_ratio/max": 1.4275308847427368, "sampling/importance_sampling_ratio/mean": 0.3912113308906555, "sampling/importance_sampling_ratio/min": 7.978558187460294e-07, "sampling/sampling_logp_difference/max": 1.7781636714935303, "sampling/sampling_logp_difference/mean": 0.3970470726490021, "step": 337, "step_time": 19.831610636998448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.747835874557495, "epoch": 0.00338, "grad_norm": 0.1148754209280014, "kl": 0.1210651695728302, "learning_rate": 7.999991559645087e-06, "loss": -0.0915, "step": 338, "step_time": 4.257421615002386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.15625, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8452428579330444, "epoch": 0.00339, "frac_reward_zero_std": 0.0, "grad_norm": 0.178130105137825, "kl": 0.16216195840388536, "learning_rate": 7.999991503656178e-06, "loss": -0.0945, "num_tokens": 9488042.0, "reward": -20.12209129333496, "reward_std": 17.127416610717773, "rewards/rollout_reward_func/mean": -20.12209129333496, "rewards/rollout_reward_func/std": 17.127418518066406, "sampling/importance_sampling_ratio/max": 1.4083789587020874, "sampling/importance_sampling_ratio/mean": 0.3959985375404358, "sampling/importance_sampling_ratio/min": 2.4931057396315737e-06, "sampling/sampling_logp_difference/max": 2.125865936279297, "sampling/sampling_logp_difference/mean": 0.45859256386756897, "step": 339, "step_time": 18.52382348500032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8408011198043823, "epoch": 0.0034, "grad_norm": 0.17635096609592438, "kl": 0.16291361581534147, "learning_rate": 7.999991447482183e-06, "loss": -0.0956, "step": 340, "step_time": 4.190799441999843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.65625, "completions/mean_terminated_length": 5.941176414489746, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7058470845222473, "epoch": 0.00341, "frac_reward_zero_std": 0.0, "grad_norm": 0.14843906462192535, "kl": 0.11365974508225918, "learning_rate": 7.999991391123103e-06, "loss": -0.0878, "num_tokens": 9545647.0, "reward": -20.978458404541016, "reward_std": 16.669965744018555, "rewards/rollout_reward_func/mean": -20.978458404541016, "rewards/rollout_reward_func/std": 16.669963836669922, "sampling/importance_sampling_ratio/max": 1.35588800907135, "sampling/importance_sampling_ratio/mean": 0.41046297550201416, "sampling/importance_sampling_ratio/min": 3.18280626743217e-06, "sampling/sampling_logp_difference/max": 1.9711271524429321, "sampling/sampling_logp_difference/mean": 0.41562581062316895, "step": 341, "step_time": 20.341281260996766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.700623095035553, "epoch": 0.00342, "grad_norm": 0.14433589577674866, "kl": 0.11739380285143852, "learning_rate": 7.999991334578934e-06, "loss": -0.0881, "step": 342, "step_time": 5.204344201001732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 4.588235378265381, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6904514133930206, "epoch": 0.00343, "frac_reward_zero_std": 0.0, "grad_norm": 0.17082862555980682, "kl": 0.19887815788388252, "learning_rate": 7.99999127784968e-06, "loss": -0.0699, "num_tokens": 9604315.0, "reward": -21.364574432373047, "reward_std": 16.46986198425293, "rewards/rollout_reward_func/mean": -21.364574432373047, "rewards/rollout_reward_func/std": 16.469860076904297, "sampling/importance_sampling_ratio/max": 1.4167263507843018, "sampling/importance_sampling_ratio/mean": 0.3988547623157501, "sampling/importance_sampling_ratio/min": 1.00886181826354e-05, "sampling/sampling_logp_difference/max": 2.034180164337158, "sampling/sampling_logp_difference/mean": 0.40356796979904175, "step": 343, "step_time": 19.569180713000605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6818910241127014, "epoch": 0.00344, "grad_norm": 0.16961565613746643, "kl": 0.2098087454214692, "learning_rate": 7.999991220935337e-06, "loss": -0.0699, "step": 344, "step_time": 4.376836343000832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 4.090909004211426, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.904115229845047, "epoch": 0.00345, "frac_reward_zero_std": 0.0, "grad_norm": 0.1010427176952362, "kl": 0.19519776664674282, "learning_rate": 7.999991163835908e-06, "loss": -0.0736, "num_tokens": 9668376.0, "reward": -13.10269546508789, "reward_std": 18.657318115234375, "rewards/rollout_reward_func/mean": -13.10269546508789, "rewards/rollout_reward_func/std": 18.657318115234375, "sampling/importance_sampling_ratio/max": 1.3377258777618408, "sampling/importance_sampling_ratio/mean": 0.6607434749603271, "sampling/importance_sampling_ratio/min": 1.9833342435049417e-07, "sampling/sampling_logp_difference/max": 2.090665102005005, "sampling/sampling_logp_difference/mean": 0.3537062406539917, "step": 345, "step_time": 21.306959608999023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8974000066518784, "epoch": 0.00346, "grad_norm": 0.10348206758499146, "kl": 0.20476100593805313, "learning_rate": 7.999991106551393e-06, "loss": -0.0737, "step": 346, "step_time": 4.406535044998236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.03125, "completions/mean_terminated_length": 5.38095235824585, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2735961377620697, "epoch": 0.00347, "frac_reward_zero_std": 0.0, "grad_norm": 0.17961768805980682, "kl": 0.19152284041047096, "learning_rate": 7.99999104908179e-06, "loss": -0.0864, "num_tokens": 9732164.0, "reward": -16.087778091430664, "reward_std": 16.858089447021484, "rewards/rollout_reward_func/mean": -16.087778091430664, "rewards/rollout_reward_func/std": 16.858089447021484, "sampling/importance_sampling_ratio/max": 1.5028153657913208, "sampling/importance_sampling_ratio/mean": 0.5759260058403015, "sampling/importance_sampling_ratio/min": 2.276269151479937e-05, "sampling/sampling_logp_difference/max": 1.9239017963409424, "sampling/sampling_logp_difference/mean": 0.3529711067676544, "step": 347, "step_time": 20.241178279999076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2619168758392334, "epoch": 0.00348, "grad_norm": 0.18166005611419678, "kl": 0.1996128112077713, "learning_rate": 7.9999909914271e-06, "loss": -0.0866, "step": 348, "step_time": 4.423712271996919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.392857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2711416482925415, "epoch": 0.00349, "frac_reward_zero_std": 0.0, "grad_norm": 0.09775342047214508, "kl": 0.32437561452388763, "learning_rate": 7.999990933587323e-06, "loss": -0.0406, "num_tokens": 9797283.0, "reward": -9.341991424560547, "reward_std": 17.029605865478516, "rewards/rollout_reward_func/mean": -9.341991424560547, "rewards/rollout_reward_func/std": 17.029605865478516, "sampling/importance_sampling_ratio/max": 1.675200343132019, "sampling/importance_sampling_ratio/mean": 0.8115675449371338, "sampling/importance_sampling_ratio/min": 8.743836588109843e-06, "sampling/sampling_logp_difference/max": 2.4715733528137207, "sampling/sampling_logp_difference/mean": 0.22088806331157684, "step": 349, "step_time": 22.359631743000136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2606180906295776, "epoch": 0.0035, "grad_norm": 0.09363541752099991, "kl": 0.3337710462510586, "learning_rate": 7.99999087556246e-06, "loss": -0.041, "step": 350, "step_time": 4.444798831000298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.03125, "completions/mean_terminated_length": 4.849999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1738731265068054, "epoch": 0.00351, "frac_reward_zero_std": 0.0, "grad_norm": 0.13345617055892944, "kl": 0.26260989159345627, "learning_rate": 7.999990817352509e-06, "loss": -0.0943, "num_tokens": 9860554.0, "reward": -15.838479995727539, "reward_std": 18.417701721191406, "rewards/rollout_reward_func/mean": -15.838479995727539, "rewards/rollout_reward_func/std": 18.417699813842773, "sampling/importance_sampling_ratio/max": 1.4349629878997803, "sampling/importance_sampling_ratio/mean": 0.575464129447937, "sampling/importance_sampling_ratio/min": 0.0001005628946586512, "sampling/sampling_logp_difference/max": 1.7059826850891113, "sampling/sampling_logp_difference/mean": 0.3668200969696045, "step": 351, "step_time": 19.696289278997938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.170170336961746, "epoch": 0.00352, "grad_norm": 0.13541829586029053, "kl": 0.28173672780394554, "learning_rate": 7.999990758957471e-06, "loss": -0.0944, "step": 352, "step_time": 5.3094355249977525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4891417026519775, "epoch": 0.00353, "frac_reward_zero_std": 0.0, "grad_norm": 0.11282234638929367, "kl": 0.31259268522262573, "learning_rate": 7.999990700377347e-06, "loss": -0.0837, "num_tokens": 9924293.0, "reward": -8.485885620117188, "reward_std": 18.7723331451416, "rewards/rollout_reward_func/mean": -8.485885620117188, "rewards/rollout_reward_func/std": 18.7723331451416, "sampling/importance_sampling_ratio/max": 1.5155166387557983, "sampling/importance_sampling_ratio/mean": 0.775639533996582, "sampling/importance_sampling_ratio/min": 5.755115125793964e-05, "sampling/sampling_logp_difference/max": 1.5493531227111816, "sampling/sampling_logp_difference/mean": 0.2779691815376282, "step": 353, "step_time": 20.8525584479994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4838953912258148, "epoch": 0.00354, "grad_norm": 0.11188396066427231, "kl": 0.31552113592624664, "learning_rate": 7.999990641612135e-06, "loss": -0.0842, "step": 354, "step_time": 4.420677800000703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 4.809524059295654, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1477552354335785, "epoch": 0.00355, "frac_reward_zero_std": 0.0, "grad_norm": 0.16447797417640686, "kl": 0.25453049317002296, "learning_rate": 7.999990582661837e-06, "loss": -0.0784, "num_tokens": 9986170.0, "reward": -12.433097839355469, "reward_std": 14.955541610717773, "rewards/rollout_reward_func/mean": -12.433097839355469, "rewards/rollout_reward_func/std": 14.955541610717773, "sampling/importance_sampling_ratio/max": 1.3322525024414062, "sampling/importance_sampling_ratio/mean": 0.5704469084739685, "sampling/importance_sampling_ratio/min": 2.2312704004434636e-06, "sampling/sampling_logp_difference/max": 1.7934640645980835, "sampling/sampling_logp_difference/mean": 0.36953091621398926, "step": 355, "step_time": 20.50603376699837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.151263415813446, "epoch": 0.00356, "grad_norm": 0.1695026010274887, "kl": 0.2565551660954952, "learning_rate": 7.999990523526452e-06, "loss": -0.0791, "step": 356, "step_time": 4.476248572998884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.239999771118164, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6469803154468536, "epoch": 0.00357, "frac_reward_zero_std": 0.0, "grad_norm": 0.15887145698070526, "kl": 0.31106769293546677, "learning_rate": 7.99999046420598e-06, "loss": -0.0508, "num_tokens": 10047849.0, "reward": -9.101951599121094, "reward_std": 18.493242263793945, "rewards/rollout_reward_func/mean": -9.101951599121094, "rewards/rollout_reward_func/std": 18.493242263793945, "sampling/importance_sampling_ratio/max": 1.6390498876571655, "sampling/importance_sampling_ratio/mean": 0.6805405616760254, "sampling/importance_sampling_ratio/min": 4.8594502004561946e-05, "sampling/sampling_logp_difference/max": 1.6193550825119019, "sampling/sampling_logp_difference/mean": 0.2936152219772339, "step": 357, "step_time": 20.76861158400061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.651151716709137, "epoch": 0.00358, "grad_norm": 0.15983015298843384, "kl": 0.3182787224650383, "learning_rate": 7.999990404700422e-06, "loss": -0.0508, "step": 358, "step_time": 4.371176945998741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 4.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8830914497375488, "epoch": 0.00359, "frac_reward_zero_std": 0.0, "grad_norm": 0.0712892934679985, "kl": 0.313829705119133, "learning_rate": 7.999990345009776e-06, "loss": -0.0883, "num_tokens": 10111100.0, "reward": -11.626697540283203, "reward_std": 17.447799682617188, "rewards/rollout_reward_func/mean": -11.626697540283203, "rewards/rollout_reward_func/std": 17.447797775268555, "sampling/importance_sampling_ratio/max": 1.3792297840118408, "sampling/importance_sampling_ratio/mean": 0.6036829948425293, "sampling/importance_sampling_ratio/min": 9.481995220994577e-05, "sampling/sampling_logp_difference/max": 1.5772182941436768, "sampling/sampling_logp_difference/mean": 0.3286229968070984, "step": 359, "step_time": 21.379982210999515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8872295022010803, "epoch": 0.0036, "grad_norm": 0.07121790945529938, "kl": 0.31160611286759377, "learning_rate": 7.999990285134044e-06, "loss": -0.0883, "step": 360, "step_time": 4.95559157900243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.409090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.063738167285919, "epoch": 0.00361, "frac_reward_zero_std": 0.0, "grad_norm": 0.19364933669567108, "kl": 0.31438189186155796, "learning_rate": 7.999990225073224e-06, "loss": -0.0783, "num_tokens": 10172216.0, "reward": -14.9752779006958, "reward_std": 17.8548641204834, "rewards/rollout_reward_func/mean": -14.9752779006958, "rewards/rollout_reward_func/std": 17.8548641204834, "sampling/importance_sampling_ratio/max": 1.3960484266281128, "sampling/importance_sampling_ratio/mean": 0.6017388105392456, "sampling/importance_sampling_ratio/min": 1.675098974374123e-05, "sampling/sampling_logp_difference/max": 1.8651375770568848, "sampling/sampling_logp_difference/mean": 0.3393416404724121, "step": 361, "step_time": 20.02456319599878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.072593629360199, "epoch": 0.00362, "grad_norm": 0.19769921898841858, "kl": 0.2997250482439995, "learning_rate": 7.999990164827318e-06, "loss": -0.0791, "step": 362, "step_time": 4.893124290001651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 5.4782609939575195, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.062592327594757, "epoch": 0.00363, "frac_reward_zero_std": 0.0, "grad_norm": 0.10537216067314148, "kl": 0.23116137459874153, "learning_rate": 7.999990104396325e-06, "loss": -0.0619, "num_tokens": 10233236.0, "reward": -14.072673797607422, "reward_std": 16.682003021240234, "rewards/rollout_reward_func/mean": -14.072673797607422, "rewards/rollout_reward_func/std": 16.682004928588867, "sampling/importance_sampling_ratio/max": 1.338181972503662, "sampling/importance_sampling_ratio/mean": 0.5227475762367249, "sampling/importance_sampling_ratio/min": 3.542625199770555e-05, "sampling/sampling_logp_difference/max": 1.714349627494812, "sampling/sampling_logp_difference/mean": 0.3262529969215393, "step": 363, "step_time": 20.422534048999296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.072212815284729, "epoch": 0.00364, "grad_norm": 0.10353267937898636, "kl": 0.22041823901236057, "learning_rate": 7.999990043780244e-06, "loss": -0.062, "step": 364, "step_time": 4.37037785999928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.09375, "completions/mean_terminated_length": 3.952381134033203, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.019817739725113, "epoch": 0.00365, "frac_reward_zero_std": 0.0, "grad_norm": 0.08663073182106018, "kl": 0.20054103061556816, "learning_rate": 7.999989982979077e-06, "loss": -0.0982, "num_tokens": 10294309.0, "reward": -17.08005714416504, "reward_std": 19.12976837158203, "rewards/rollout_reward_func/mean": -17.08005714416504, "rewards/rollout_reward_func/std": 19.1297664642334, "sampling/importance_sampling_ratio/max": 1.3621282577514648, "sampling/importance_sampling_ratio/mean": 0.6057921051979065, "sampling/importance_sampling_ratio/min": 3.888367849924634e-09, "sampling/sampling_logp_difference/max": 2.246751308441162, "sampling/sampling_logp_difference/mean": 0.41486310958862305, "step": 365, "step_time": 18.686280166999495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.030121684074402, "epoch": 0.00366, "grad_norm": 0.08864846080541611, "kl": 0.1968095675110817, "learning_rate": 7.999989921992825e-06, "loss": -0.0977, "step": 366, "step_time": 4.259359114999825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 5.583333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.270918071269989, "epoch": 0.00367, "frac_reward_zero_std": 0.0, "grad_norm": 0.1179298684000969, "kl": 0.22297992184758186, "learning_rate": 7.999989860821483e-06, "loss": -0.076, "num_tokens": 10355731.0, "reward": -16.95846939086914, "reward_std": 15.591874122619629, "rewards/rollout_reward_func/mean": -16.95846939086914, "rewards/rollout_reward_func/std": 15.591874122619629, "sampling/importance_sampling_ratio/max": 1.4480396509170532, "sampling/importance_sampling_ratio/mean": 0.5347795486450195, "sampling/importance_sampling_ratio/min": 1.908935445271709e-07, "sampling/sampling_logp_difference/max": 1.8716940879821777, "sampling/sampling_logp_difference/mean": 0.40748390555381775, "step": 367, "step_time": 19.23202080699957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2736973762512207, "epoch": 0.00368, "grad_norm": 0.11673391610383987, "kl": 0.22573072835803032, "learning_rate": 7.999989799465057e-06, "loss": -0.0762, "step": 368, "step_time": 4.382268196999576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.782608985900879, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.251573830842972, "epoch": 0.00369, "frac_reward_zero_std": 0.0, "grad_norm": 0.13334698975086212, "kl": 0.28711726143956184, "learning_rate": 7.999989737923541e-06, "loss": -0.0735, "num_tokens": 10418956.0, "reward": -13.903892517089844, "reward_std": 16.870269775390625, "rewards/rollout_reward_func/mean": -13.903892517089844, "rewards/rollout_reward_func/std": 16.870267868041992, "sampling/importance_sampling_ratio/max": 1.760131597518921, "sampling/importance_sampling_ratio/mean": 0.6286762952804565, "sampling/importance_sampling_ratio/min": 1.9258175143477274e-06, "sampling/sampling_logp_difference/max": 1.9204556941986084, "sampling/sampling_logp_difference/mean": 0.3975125551223755, "step": 369, "step_time": 21.32416643599936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2553289234638214, "epoch": 0.0037, "grad_norm": 0.13633406162261963, "kl": 0.2731751948595047, "learning_rate": 7.999989676196942e-06, "loss": -0.0741, "step": 370, "step_time": 4.863971802997185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.91304349899292, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9269355535507202, "epoch": 0.00371, "frac_reward_zero_std": 0.0, "grad_norm": 0.1556246280670166, "kl": 0.2236421313136816, "learning_rate": 7.999989614285252e-06, "loss": -0.0919, "num_tokens": 10482347.0, "reward": -13.191543579101562, "reward_std": 15.802254676818848, "rewards/rollout_reward_func/mean": -13.191543579101562, "rewards/rollout_reward_func/std": 15.802254676818848, "sampling/importance_sampling_ratio/max": 1.4907896518707275, "sampling/importance_sampling_ratio/mean": 0.6877716779708862, "sampling/importance_sampling_ratio/min": 6.614683911720931e-07, "sampling/sampling_logp_difference/max": 1.9480068683624268, "sampling/sampling_logp_difference/mean": 0.3541649580001831, "step": 371, "step_time": 21.28940558100112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9292424321174622, "epoch": 0.00372, "grad_norm": 0.1539558321237564, "kl": 0.21814223192632198, "learning_rate": 7.999989552188477e-06, "loss": -0.0921, "step": 372, "step_time": 4.881719087001329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.03125, "completions/mean_terminated_length": 4.263157844543457, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4109014868736267, "epoch": 0.00373, "frac_reward_zero_std": 0.0, "grad_norm": 0.09082964062690735, "kl": 0.19877872616052628, "learning_rate": 7.999989489906616e-06, "loss": -0.0867, "num_tokens": 10542207.0, "reward": -17.88926124572754, "reward_std": 19.179054260253906, "rewards/rollout_reward_func/mean": -17.88926124572754, "rewards/rollout_reward_func/std": 19.179054260253906, "sampling/importance_sampling_ratio/max": 1.2841769456863403, "sampling/importance_sampling_ratio/mean": 0.5386413335800171, "sampling/importance_sampling_ratio/min": 6.067460617487086e-06, "sampling/sampling_logp_difference/max": 1.7268084287643433, "sampling/sampling_logp_difference/mean": 0.3818080425262451, "step": 373, "step_time": 19.701872499001183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4104710519313812, "epoch": 0.00374, "grad_norm": 0.09016740322113037, "kl": 0.19336807914078236, "learning_rate": 7.999989427439667e-06, "loss": -0.0868, "step": 374, "step_time": 4.35266494300231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 4.6086955070495605, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1465433537960052, "epoch": 0.00375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18765011429786682, "kl": 0.1840294823050499, "learning_rate": 7.999989364787634e-06, "loss": -0.0712, "num_tokens": 10605419.0, "reward": -13.295080184936523, "reward_std": 14.464310646057129, "rewards/rollout_reward_func/mean": -13.295080184936523, "rewards/rollout_reward_func/std": 14.464310646057129, "sampling/importance_sampling_ratio/max": 1.5410363674163818, "sampling/importance_sampling_ratio/mean": 0.665681779384613, "sampling/importance_sampling_ratio/min": 7.83773373314034e-07, "sampling/sampling_logp_difference/max": 1.6616930961608887, "sampling/sampling_logp_difference/mean": 0.3529362678527832, "step": 375, "step_time": 21.64472434000345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.146469831466675, "epoch": 0.00376, "grad_norm": 0.1886661797761917, "kl": 0.17933252453804016, "learning_rate": 7.999989301950511e-06, "loss": -0.072, "step": 376, "step_time": 4.500253833999523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.409090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1267584562301636, "epoch": 0.00377, "frac_reward_zero_std": 0.0, "grad_norm": 0.11424869298934937, "kl": 0.18581148609519005, "learning_rate": 7.999989238928302e-06, "loss": -0.0636, "num_tokens": 10667311.0, "reward": -14.412802696228027, "reward_std": 18.628095626831055, "rewards/rollout_reward_func/mean": -14.412802696228027, "rewards/rollout_reward_func/std": 18.628095626831055, "sampling/importance_sampling_ratio/max": 1.2058768272399902, "sampling/importance_sampling_ratio/mean": 0.5691664218902588, "sampling/importance_sampling_ratio/min": 1.488568614149699e-05, "sampling/sampling_logp_difference/max": 1.669194221496582, "sampling/sampling_logp_difference/mean": 0.3316815495491028, "step": 377, "step_time": 20.45750093100105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.127110034227371, "epoch": 0.00378, "grad_norm": 0.11366841197013855, "kl": 0.17899305373430252, "learning_rate": 7.999989175721006e-06, "loss": -0.0636, "step": 378, "step_time": 4.431261316001837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5432680249214172, "epoch": 0.00379, "frac_reward_zero_std": 0.0, "grad_norm": 0.14550095796585083, "kl": 0.21652721613645554, "learning_rate": 7.999989112328623e-06, "loss": -0.087, "num_tokens": 10728792.0, "reward": -16.183704376220703, "reward_std": 16.162038803100586, "rewards/rollout_reward_func/mean": -16.183704376220703, "rewards/rollout_reward_func/std": 16.162038803100586, "sampling/importance_sampling_ratio/max": 1.4572288990020752, "sampling/importance_sampling_ratio/mean": 0.5891520380973816, "sampling/importance_sampling_ratio/min": 4.666746988846171e-08, "sampling/sampling_logp_difference/max": 2.292678117752075, "sampling/sampling_logp_difference/mean": 0.4319155216217041, "step": 379, "step_time": 20.268237797001348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5411313474178314, "epoch": 0.0038, "grad_norm": 0.15190280973911285, "kl": 0.21131745167076588, "learning_rate": 7.999989048751154e-06, "loss": -0.0881, "step": 380, "step_time": 4.75860568600001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 5.0416669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.823617845773697, "epoch": 0.00381, "frac_reward_zero_std": 0.0, "grad_norm": 0.08235768228769302, "kl": 0.3481912389397621, "learning_rate": 7.999988984988599e-06, "loss": -0.0745, "num_tokens": 10789980.0, "reward": -12.479476928710938, "reward_std": 16.595369338989258, "rewards/rollout_reward_func/mean": -12.479476928710938, "rewards/rollout_reward_func/std": 16.595369338989258, "sampling/importance_sampling_ratio/max": 1.294473648071289, "sampling/importance_sampling_ratio/mean": 0.625195324420929, "sampling/importance_sampling_ratio/min": 1.620408397684514e-06, "sampling/sampling_logp_difference/max": 1.7605048418045044, "sampling/sampling_logp_difference/mean": 0.3162117004394531, "step": 381, "step_time": 22.225767907000773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.820438712835312, "epoch": 0.00382, "grad_norm": 0.08032235503196716, "kl": 0.3210037983953953, "learning_rate": 7.999988921040955e-06, "loss": -0.0748, "step": 382, "step_time": 4.851529397003105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.159999847412109, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4606178998947144, "epoch": 0.00383, "frac_reward_zero_std": 0.0, "grad_norm": 0.06285436451435089, "kl": 0.22881244122982025, "learning_rate": 7.999988856908225e-06, "loss": -0.0809, "num_tokens": 10854049.0, "reward": -5.191091537475586, "reward_std": 17.519428253173828, "rewards/rollout_reward_func/mean": -5.191091537475586, "rewards/rollout_reward_func/std": 17.519426345825195, "sampling/importance_sampling_ratio/max": 1.305767297744751, "sampling/importance_sampling_ratio/mean": 0.7458208203315735, "sampling/importance_sampling_ratio/min": 6.444132304750383e-05, "sampling/sampling_logp_difference/max": 1.3728444576263428, "sampling/sampling_logp_difference/mean": 0.2481037974357605, "step": 383, "step_time": 21.202210274997924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4606351852416992, "epoch": 0.00384, "grad_norm": 0.06136472895741463, "kl": 0.22982259839773178, "learning_rate": 7.99998879259041e-06, "loss": -0.0812, "step": 384, "step_time": 4.386232103999646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 3.904762029647827, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8899413049221039, "epoch": 0.00385, "frac_reward_zero_std": 0.0, "grad_norm": 0.14737187325954437, "kl": 0.14863556437194347, "learning_rate": 7.999988728087506e-06, "loss": -0.0778, "num_tokens": 10923360.0, "reward": -16.40321159362793, "reward_std": 21.067535400390625, "rewards/rollout_reward_func/mean": -16.40321159362793, "rewards/rollout_reward_func/std": 21.067535400390625, "sampling/importance_sampling_ratio/max": 1.2755910158157349, "sampling/importance_sampling_ratio/mean": 0.5669761300086975, "sampling/importance_sampling_ratio/min": 0.0005105970776639879, "sampling/sampling_logp_difference/max": 1.5037474632263184, "sampling/sampling_logp_difference/mean": 0.26567643880844116, "step": 385, "step_time": 23.889610973001254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8863386809825897, "epoch": 0.00386, "grad_norm": 0.1502939611673355, "kl": 0.14858738332986832, "learning_rate": 7.999988663399516e-06, "loss": -0.0787, "step": 386, "step_time": 4.912357765000706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.507812350988388, "epoch": 0.00387, "frac_reward_zero_std": 0.0, "grad_norm": 0.12779290974140167, "kl": 0.4590204730629921, "learning_rate": 7.999988598526439e-06, "loss": -0.0811, "num_tokens": 10996436.0, "reward": -17.53731346130371, "reward_std": 17.07171630859375, "rewards/rollout_reward_func/mean": -17.53731346130371, "rewards/rollout_reward_func/std": 17.07171630859375, "sampling/importance_sampling_ratio/max": 1.4247846603393555, "sampling/importance_sampling_ratio/mean": 0.719838559627533, "sampling/importance_sampling_ratio/min": 9.134612628258765e-05, "sampling/sampling_logp_difference/max": 1.6087355613708496, "sampling/sampling_logp_difference/mean": 0.24763642251491547, "step": 387, "step_time": 23.796030594001422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.509123831987381, "epoch": 0.00388, "grad_norm": 0.12776294350624084, "kl": 0.45343340933322906, "learning_rate": 7.999988533468276e-06, "loss": -0.0809, "step": 388, "step_time": 5.058265727000617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 5.190476417541504, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4586730003356934, "epoch": 0.00389, "frac_reward_zero_std": 0.0, "grad_norm": 0.12783457338809967, "kl": 0.10636992566287518, "learning_rate": 7.999988468225024e-06, "loss": -0.076, "num_tokens": 11064988.0, "reward": -22.13848114013672, "reward_std": 20.9965763092041, "rewards/rollout_reward_func/mean": -22.13848114013672, "rewards/rollout_reward_func/std": 20.99657440185547, "sampling/importance_sampling_ratio/max": 1.2018171548843384, "sampling/importance_sampling_ratio/mean": 0.4956505298614502, "sampling/importance_sampling_ratio/min": 4.289688888547971e-07, "sampling/sampling_logp_difference/max": 2.0067925453186035, "sampling/sampling_logp_difference/mean": 0.3755154013633728, "step": 389, "step_time": 24.035213423998357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4550511240959167, "epoch": 0.0039, "grad_norm": 0.12746568024158478, "kl": 0.10385009460151196, "learning_rate": 7.999988402796688e-06, "loss": -0.0761, "step": 390, "step_time": 5.30280606100132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 3.799999952316284, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3683049082756042, "epoch": 0.00391, "frac_reward_zero_std": 0.0, "grad_norm": 0.14872650802135468, "kl": 0.1330466903746128, "learning_rate": 7.999988337183264e-06, "loss": -0.0527, "num_tokens": 11134037.0, "reward": -15.814861297607422, "reward_std": 22.44647979736328, "rewards/rollout_reward_func/mean": -15.814861297607422, "rewards/rollout_reward_func/std": 22.44647979736328, "sampling/importance_sampling_ratio/max": 1.2457469701766968, "sampling/importance_sampling_ratio/mean": 0.5628234148025513, "sampling/importance_sampling_ratio/min": 0.00011095221270807087, "sampling/sampling_logp_difference/max": 1.6685212850570679, "sampling/sampling_logp_difference/mean": 0.3382567763328552, "step": 391, "step_time": 23.361217134999606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.366751968860626, "epoch": 0.00392, "grad_norm": 0.15237672626972198, "kl": 0.13265256956219673, "learning_rate": 7.999988271384754e-06, "loss": -0.0528, "step": 392, "step_time": 5.253909962000762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.78125, "completions/mean_terminated_length": 4.294117450714111, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7413147687911987, "epoch": 0.00393, "frac_reward_zero_std": 0.0, "grad_norm": 0.1718716323375702, "kl": 0.09873543679714203, "learning_rate": 7.999988205401156e-06, "loss": -0.0865, "num_tokens": 11202438.0, "reward": -27.19873809814453, "reward_std": 20.62618637084961, "rewards/rollout_reward_func/mean": -27.19873809814453, "rewards/rollout_reward_func/std": 20.626188278198242, "sampling/importance_sampling_ratio/max": 1.3142772912979126, "sampling/importance_sampling_ratio/mean": 0.5083352327346802, "sampling/importance_sampling_ratio/min": 3.854373886724716e-08, "sampling/sampling_logp_difference/max": 1.9766430854797363, "sampling/sampling_logp_difference/mean": 0.42920389771461487, "step": 393, "step_time": 22.64483671599919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.740393042564392, "epoch": 0.00394, "grad_norm": 0.17032863199710846, "kl": 0.09880896471440792, "learning_rate": 7.999988139232472e-06, "loss": -0.0872, "step": 394, "step_time": 4.969306657998459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 4.700000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2024519443511963, "epoch": 0.00395, "frac_reward_zero_std": 0.0, "grad_norm": 0.12646199762821198, "kl": 0.1881019677966833, "learning_rate": 7.999988072878701e-06, "loss": -0.0933, "num_tokens": 11272589.0, "reward": -21.441986083984375, "reward_std": 18.26578140258789, "rewards/rollout_reward_func/mean": -21.441986083984375, "rewards/rollout_reward_func/std": 18.26578140258789, "sampling/importance_sampling_ratio/max": 1.2538341283798218, "sampling/importance_sampling_ratio/mean": 0.526660680770874, "sampling/importance_sampling_ratio/min": 7.669856216807602e-08, "sampling/sampling_logp_difference/max": 2.014613628387451, "sampling/sampling_logp_difference/mean": 0.3794005215167999, "step": 395, "step_time": 23.258143127002768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.197594463825226, "epoch": 0.00396, "grad_norm": 0.1292232722043991, "kl": 0.18690859712660313, "learning_rate": 7.999988006339844e-06, "loss": -0.0929, "step": 396, "step_time": 4.914109054996516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 4.727272987365723, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8257098197937012, "epoch": 0.00397, "frac_reward_zero_std": 0.0, "grad_norm": 0.3451618552207947, "kl": 0.19448808208107948, "learning_rate": 7.999987939615899e-06, "loss": -0.1043, "num_tokens": 11339055.0, "reward": -21.906661987304688, "reward_std": 17.93731689453125, "rewards/rollout_reward_func/mean": -21.906661987304688, "rewards/rollout_reward_func/std": 17.93731689453125, "sampling/importance_sampling_ratio/max": 2.1453144550323486, "sampling/importance_sampling_ratio/mean": 0.6316545605659485, "sampling/importance_sampling_ratio/min": 0.00019620057719293982, "sampling/sampling_logp_difference/max": 1.636358618736267, "sampling/sampling_logp_difference/mean": 0.28443509340286255, "step": 397, "step_time": 22.205469005999475 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.8235366344451904, "epoch": 0.00398, "grad_norm": 0.21132637560367584, "kl": 0.1825256310403347, "learning_rate": 7.999987872706869e-06, "loss": -0.1064, "step": 398, "step_time": 4.932164768000803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4149444699287415, "epoch": 0.00399, "frac_reward_zero_std": 0.0, "grad_norm": 0.11853428930044174, "kl": 0.14230358880013227, "learning_rate": 7.99998780561275e-06, "loss": -0.0834, "num_tokens": 11411582.0, "reward": -15.776670455932617, "reward_std": 18.706594467163086, "rewards/rollout_reward_func/mean": -15.776670455932617, "rewards/rollout_reward_func/std": 18.706594467163086, "sampling/importance_sampling_ratio/max": 1.350835919380188, "sampling/importance_sampling_ratio/mean": 0.6698627471923828, "sampling/importance_sampling_ratio/min": 2.2098531893277595e-09, "sampling/sampling_logp_difference/max": 2.3486135005950928, "sampling/sampling_logp_difference/mean": 0.41312453150749207, "step": 399, "step_time": 24.995499908000056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4121129512786865, "epoch": 0.004, "grad_norm": 0.11501214653253555, "kl": 0.14106206223368645, "learning_rate": 7.999987738333547e-06, "loss": -0.0838, "step": 400, "step_time": 5.375690367998686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6749929785728455, "epoch": 0.00401, "frac_reward_zero_std": 0.0, "grad_norm": 0.08192003518342972, "kl": 0.1377236805856228, "learning_rate": 7.999987670869257e-06, "loss": -0.0778, "num_tokens": 11481377.0, "reward": -15.968611717224121, "reward_std": 18.062576293945312, "rewards/rollout_reward_func/mean": -15.968611717224121, "rewards/rollout_reward_func/std": 18.06257438659668, "sampling/importance_sampling_ratio/max": 1.3246278762817383, "sampling/importance_sampling_ratio/mean": 0.7264420390129089, "sampling/importance_sampling_ratio/min": 5.5433424677175935e-06, "sampling/sampling_logp_difference/max": 1.7294108867645264, "sampling/sampling_logp_difference/mean": 0.3050351142883301, "step": 401, "step_time": 24.870984076998866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6716772615909576, "epoch": 0.00402, "grad_norm": 0.08262369781732559, "kl": 0.13848375901579857, "learning_rate": 7.999987603219878e-06, "loss": -0.0774, "step": 402, "step_time": 5.370759816998543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.818181991577148, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2832884192466736, "epoch": 0.00403, "frac_reward_zero_std": 0.0, "grad_norm": 0.16965042054653168, "kl": 0.18569424096494913, "learning_rate": 7.999987535385413e-06, "loss": -0.0869, "num_tokens": 11549735.0, "reward": -19.21438980102539, "reward_std": 18.339099884033203, "rewards/rollout_reward_func/mean": -19.21438980102539, "rewards/rollout_reward_func/std": 18.339099884033203, "sampling/importance_sampling_ratio/max": 1.4782214164733887, "sampling/importance_sampling_ratio/mean": 0.6174702644348145, "sampling/importance_sampling_ratio/min": 1.148378725446264e-08, "sampling/sampling_logp_difference/max": 2.279078483581543, "sampling/sampling_logp_difference/mean": 0.40513139963150024, "step": 403, "step_time": 24.88945602200147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.280127912759781, "epoch": 0.00404, "grad_norm": 0.16868285834789276, "kl": 0.1846149880439043, "learning_rate": 7.99998746736586e-06, "loss": -0.087, "step": 404, "step_time": 4.807200139000997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.03776416182518, "epoch": 0.00405, "frac_reward_zero_std": 0.0, "grad_norm": 0.08082429319620132, "kl": 0.16050725802779198, "learning_rate": 7.999987399161224e-06, "loss": -0.099, "num_tokens": 11619617.0, "reward": -17.577289581298828, "reward_std": 20.908571243286133, "rewards/rollout_reward_func/mean": -17.577289581298828, "rewards/rollout_reward_func/std": 20.908571243286133, "sampling/importance_sampling_ratio/max": 1.3217918872833252, "sampling/importance_sampling_ratio/mean": 0.683681845664978, "sampling/importance_sampling_ratio/min": 1.3520308483894183e-10, "sampling/sampling_logp_difference/max": 2.8591372966766357, "sampling/sampling_logp_difference/mean": 0.35542404651641846, "step": 405, "step_time": 23.20061139900099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0361477732658386, "epoch": 0.00406, "grad_norm": 0.08193764090538025, "kl": 0.1594894789159298, "learning_rate": 7.999987330771498e-06, "loss": -0.099, "step": 406, "step_time": 4.887428810001438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 3.7142858505249023, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2521375715732574, "epoch": 0.00407, "frac_reward_zero_std": 0.0, "grad_norm": 0.1512966752052307, "kl": 0.1800726167857647, "learning_rate": 7.999987262196688e-06, "loss": -0.0759, "num_tokens": 11686709.0, "reward": -19.345022201538086, "reward_std": 18.31917381286621, "rewards/rollout_reward_func/mean": -19.345022201538086, "rewards/rollout_reward_func/std": 18.31917381286621, "sampling/importance_sampling_ratio/max": 1.506687045097351, "sampling/importance_sampling_ratio/mean": 0.6941012144088745, "sampling/importance_sampling_ratio/min": 1.1350328321668712e-07, "sampling/sampling_logp_difference/max": 2.0083045959472656, "sampling/sampling_logp_difference/mean": 0.3478458821773529, "step": 407, "step_time": 24.183123021002757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2526106536388397, "epoch": 0.00408, "grad_norm": 0.15205413103103638, "kl": 0.18022336065769196, "learning_rate": 7.99998719343679e-06, "loss": -0.0761, "step": 408, "step_time": 4.852680155001508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 4.7727274894714355, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.246074467897415, "epoch": 0.00409, "frac_reward_zero_std": 0.0, "grad_norm": 0.22695514559745789, "kl": 0.2653898447751999, "learning_rate": 7.999987124491804e-06, "loss": -0.0837, "num_tokens": 11758443.0, "reward": -22.234386444091797, "reward_std": 20.18427276611328, "rewards/rollout_reward_func/mean": -22.234386444091797, "rewards/rollout_reward_func/std": 20.18427276611328, "sampling/importance_sampling_ratio/max": 2.2114243507385254, "sampling/importance_sampling_ratio/mean": 0.5933752059936523, "sampling/importance_sampling_ratio/min": 7.016278686933219e-07, "sampling/sampling_logp_difference/max": 1.965620517730713, "sampling/sampling_logp_difference/mean": 0.3646233081817627, "step": 409, "step_time": 23.60833918300159 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.2420388758182526, "epoch": 0.0041, "grad_norm": 0.12369436025619507, "kl": 0.2681120168417692, "learning_rate": 7.999987055361734e-06, "loss": -0.0848, "step": 410, "step_time": 5.527888366001207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 5.851851940155029, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7371663451194763, "epoch": 0.00411, "frac_reward_zero_std": 0.0, "grad_norm": 0.26029855012893677, "kl": 0.2938603423535824, "learning_rate": 7.999986986046575e-06, "loss": -0.0662, "num_tokens": 11829239.0, "reward": -15.97643756866455, "reward_std": 22.07244300842285, "rewards/rollout_reward_func/mean": -15.97643756866455, "rewards/rollout_reward_func/std": 22.07244300842285, "sampling/importance_sampling_ratio/max": 1.9420528411865234, "sampling/importance_sampling_ratio/mean": 0.6441619992256165, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4713801145553589, "sampling/sampling_logp_difference/mean": 0.27826374769210815, "step": 411, "step_time": 25.33056541899714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7378458976745605, "epoch": 0.00412, "grad_norm": 0.27121326327323914, "kl": 0.2904682159423828, "learning_rate": 7.99998691654633e-06, "loss": -0.0678, "step": 412, "step_time": 5.443945198998335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 5.192307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7461543679237366, "epoch": 0.00413, "frac_reward_zero_std": 0.0, "grad_norm": 0.1374795287847519, "kl": 0.14284630492329597, "learning_rate": 7.999986846861e-06, "loss": -0.0771, "num_tokens": 11899968.0, "reward": -17.12442398071289, "reward_std": 17.377168655395508, "rewards/rollout_reward_func/mean": -17.12442398071289, "rewards/rollout_reward_func/std": 17.37717056274414, "sampling/importance_sampling_ratio/max": 1.2812151908874512, "sampling/importance_sampling_ratio/mean": 0.6845134496688843, "sampling/importance_sampling_ratio/min": 0.00017594938981346786, "sampling/sampling_logp_difference/max": 1.6943589448928833, "sampling/sampling_logp_difference/mean": 0.24140658974647522, "step": 413, "step_time": 24.063588864999474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7553327977657318, "epoch": 0.00414, "grad_norm": 0.13826413452625275, "kl": 0.14323938451707363, "learning_rate": 7.999986776990581e-06, "loss": -0.0775, "step": 414, "step_time": 4.821072484999604 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.285714149475098, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.406057685613632, "epoch": 0.00415, "frac_reward_zero_std": 0.0, "grad_norm": 0.12365926057100296, "kl": 0.12053357437252998, "learning_rate": 7.999986706935076e-06, "loss": -0.0843, "num_tokens": 11965388.0, "reward": -19.309886932373047, "reward_std": 16.810279846191406, "rewards/rollout_reward_func/mean": -19.309886932373047, "rewards/rollout_reward_func/std": 16.810279846191406, "sampling/importance_sampling_ratio/max": 1.397375226020813, "sampling/importance_sampling_ratio/mean": 0.5633900165557861, "sampling/importance_sampling_ratio/min": 2.322728960280074e-06, "sampling/sampling_logp_difference/max": 1.8991833925247192, "sampling/sampling_logp_difference/mean": 0.3856299817562103, "step": 415, "step_time": 22.99637851999978 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.4164111018180847, "epoch": 0.00416, "grad_norm": 0.12613196671009064, "kl": 0.11995460838079453, "learning_rate": 7.999986636694485e-06, "loss": -0.0848, "step": 416, "step_time": 4.678021612999146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 4.3684210777282715, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.467962086200714, "epoch": 0.00417, "frac_reward_zero_std": 0.0, "grad_norm": 0.158212348818779, "kl": 0.16269364207983017, "learning_rate": 7.999986566268808e-06, "loss": -0.0679, "num_tokens": 12035721.0, "reward": -19.609237670898438, "reward_std": 17.861738204956055, "rewards/rollout_reward_func/mean": -19.609237670898438, "rewards/rollout_reward_func/std": 17.861738204956055, "sampling/importance_sampling_ratio/max": 1.3133705854415894, "sampling/importance_sampling_ratio/mean": 0.5552077889442444, "sampling/importance_sampling_ratio/min": 1.529324722415737e-10, "sampling/sampling_logp_difference/max": 2.6089370250701904, "sampling/sampling_logp_difference/mean": 0.4125804305076599, "step": 417, "step_time": 23.40163016400038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4687264263629913, "epoch": 0.00418, "grad_norm": 0.1599097102880478, "kl": 0.1627120077610016, "learning_rate": 7.999986495658042e-06, "loss": -0.0681, "step": 418, "step_time": 4.9608926130003965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.319999694824219, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8704628646373749, "epoch": 0.00419, "frac_reward_zero_std": 0.0, "grad_norm": 0.15976686775684357, "kl": 0.19290710240602493, "learning_rate": 7.999986424862192e-06, "loss": -0.0838, "num_tokens": 12103574.0, "reward": -20.676406860351562, "reward_std": 17.245866775512695, "rewards/rollout_reward_func/mean": -20.676406860351562, "rewards/rollout_reward_func/std": 17.245866775512695, "sampling/importance_sampling_ratio/max": 1.7641595602035522, "sampling/importance_sampling_ratio/mean": 0.7344970107078552, "sampling/importance_sampling_ratio/min": 3.417682819417678e-06, "sampling/sampling_logp_difference/max": 2.080631971359253, "sampling/sampling_logp_difference/mean": 0.28284966945648193, "step": 419, "step_time": 23.300775453000824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8677460849285126, "epoch": 0.0042, "grad_norm": 0.16036483645439148, "kl": 0.1910356730222702, "learning_rate": 7.999986353881253e-06, "loss": -0.0841, "step": 420, "step_time": 5.4814033190014015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 4.117647171020508, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.856747031211853, "epoch": 0.00421, "frac_reward_zero_std": 0.0, "grad_norm": 0.1736154854297638, "kl": 0.10536591801792383, "learning_rate": 7.999986282715228e-06, "loss": -0.0838, "num_tokens": 12171629.0, "reward": -20.920551300048828, "reward_std": 23.590749740600586, "rewards/rollout_reward_func/mean": -20.920551300048828, "rewards/rollout_reward_func/std": 23.590749740600586, "sampling/importance_sampling_ratio/max": 1.14710533618927, "sampling/importance_sampling_ratio/mean": 0.46703964471817017, "sampling/importance_sampling_ratio/min": 1.390197326145426e-06, "sampling/sampling_logp_difference/max": 2.1569130420684814, "sampling/sampling_logp_difference/mean": 0.41957157850265503, "step": 421, "step_time": 23.636637613999483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.857547879219055, "epoch": 0.00422, "grad_norm": 0.1706530749797821, "kl": 0.1090911254286766, "learning_rate": 7.999986211364117e-06, "loss": -0.084, "step": 422, "step_time": 5.490889275000882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.09375, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1514181196689606, "epoch": 0.00423, "frac_reward_zero_std": 0.0, "grad_norm": 0.13608135282993317, "kl": 0.12577690742909908, "learning_rate": 7.999986139827919e-06, "loss": -0.0805, "num_tokens": 12243792.0, "reward": -15.743684768676758, "reward_std": 17.77641487121582, "rewards/rollout_reward_func/mean": -15.743684768676758, "rewards/rollout_reward_func/std": 17.77641487121582, "sampling/importance_sampling_ratio/max": 1.313286542892456, "sampling/importance_sampling_ratio/mean": 0.6385180950164795, "sampling/importance_sampling_ratio/min": 1.6234491528877015e-10, "sampling/sampling_logp_difference/max": 2.384215831756592, "sampling/sampling_logp_difference/mean": 0.3936271369457245, "step": 423, "step_time": 24.29402342600042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1504937410354614, "epoch": 0.00424, "grad_norm": 0.13489188253879547, "kl": 0.12578318268060684, "learning_rate": 7.999986068106634e-06, "loss": -0.081, "step": 424, "step_time": 5.16087215799962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7096275985240936, "epoch": 0.00425, "frac_reward_zero_std": 0.0, "grad_norm": 0.14347679913043976, "kl": 0.2134578414261341, "learning_rate": 7.999985996200264e-06, "loss": -0.0611, "num_tokens": 12318366.0, "reward": -8.627449989318848, "reward_std": 19.03934097290039, "rewards/rollout_reward_func/mean": -8.627449989318848, "rewards/rollout_reward_func/std": 19.03934097290039, "sampling/importance_sampling_ratio/max": 1.3016846179962158, "sampling/importance_sampling_ratio/mean": 0.7204763889312744, "sampling/importance_sampling_ratio/min": 1.8738726794254035e-05, "sampling/sampling_logp_difference/max": 1.5027121305465698, "sampling/sampling_logp_difference/mean": 0.2669302523136139, "step": 425, "step_time": 24.626093335000405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.706552892923355, "epoch": 0.00426, "grad_norm": 0.14087249338626862, "kl": 0.21157598495483398, "learning_rate": 7.999985924108805e-06, "loss": -0.0613, "step": 426, "step_time": 5.1520495700005995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7281419038772583, "epoch": 0.00427, "frac_reward_zero_std": 0.0, "grad_norm": 0.2301022708415985, "kl": 0.795043870806694, "learning_rate": 7.999985851832262e-06, "loss": -0.0435, "num_tokens": 12391702.0, "reward": -12.369114875793457, "reward_std": 18.544981002807617, "rewards/rollout_reward_func/mean": -12.369114875793457, "rewards/rollout_reward_func/std": 18.54498291015625, "sampling/importance_sampling_ratio/max": 1.2730846405029297, "sampling/importance_sampling_ratio/mean": 0.6916781067848206, "sampling/importance_sampling_ratio/min": 2.1931390392637695e-07, "sampling/sampling_logp_difference/max": 2.2283129692077637, "sampling/sampling_logp_difference/mean": 0.3262553811073303, "step": 427, "step_time": 24.67607381000016 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.730577439069748, "epoch": 0.00428, "grad_norm": 0.18582285940647125, "kl": 0.7265377677977085, "learning_rate": 7.999985779370631e-06, "loss": -0.0449, "step": 428, "step_time": 5.044945449999432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 4.34782600402832, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9467171430587769, "epoch": 0.00429, "frac_reward_zero_std": 0.0, "grad_norm": 0.1528516709804535, "kl": 0.16816844791173935, "learning_rate": 7.999985706723913e-06, "loss": -0.0834, "num_tokens": 12460929.0, "reward": -16.948108673095703, "reward_std": 21.32571792602539, "rewards/rollout_reward_func/mean": -16.948108673095703, "rewards/rollout_reward_func/std": 21.32571792602539, "sampling/importance_sampling_ratio/max": 1.4084985256195068, "sampling/importance_sampling_ratio/mean": 0.6517961025238037, "sampling/importance_sampling_ratio/min": 5.585540748143103e-06, "sampling/sampling_logp_difference/max": 1.8894538879394531, "sampling/sampling_logp_difference/mean": 0.3677991032600403, "step": 429, "step_time": 24.53568930199981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9568681716918945, "epoch": 0.0043, "grad_norm": 0.1583365648984909, "kl": 0.16473903879523277, "learning_rate": 7.999985633892109e-06, "loss": -0.0838, "step": 430, "step_time": 5.4017177220030135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 6.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.689051389694214, "epoch": 0.00431, "frac_reward_zero_std": 0.0, "grad_norm": 0.15088535845279694, "kl": 0.09631961397826672, "learning_rate": 7.999985560875218e-06, "loss": -0.0894, "num_tokens": 12525577.0, "reward": -25.907997131347656, "reward_std": 18.307758331298828, "rewards/rollout_reward_func/mean": -25.907997131347656, "rewards/rollout_reward_func/std": 18.307758331298828, "sampling/importance_sampling_ratio/max": 1.2113523483276367, "sampling/importance_sampling_ratio/mean": 0.4359222650527954, "sampling/importance_sampling_ratio/min": 6.35882588539971e-06, "sampling/sampling_logp_difference/max": 1.7563745975494385, "sampling/sampling_logp_difference/mean": 0.40087780356407166, "step": 431, "step_time": 22.50617331599824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.695679187774658, "epoch": 0.00432, "grad_norm": 0.15744943916797638, "kl": 0.09339455887675285, "learning_rate": 7.999985487673243e-06, "loss": -0.0899, "step": 432, "step_time": 4.832338048998281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.519999980926514, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.739987552165985, "epoch": 0.00433, "frac_reward_zero_std": 0.0, "grad_norm": 0.1081443578004837, "kl": 0.13940790481865406, "learning_rate": 7.999985414286176e-06, "loss": -0.0816, "num_tokens": 12595206.0, "reward": -15.827386856079102, "reward_std": 17.45933723449707, "rewards/rollout_reward_func/mean": -15.827386856079102, "rewards/rollout_reward_func/std": 17.45933723449707, "sampling/importance_sampling_ratio/max": 1.362300992012024, "sampling/importance_sampling_ratio/mean": 0.704897940158844, "sampling/importance_sampling_ratio/min": 4.462753713596612e-05, "sampling/sampling_logp_difference/max": 1.9434281587600708, "sampling/sampling_logp_difference/mean": 0.279979944229126, "step": 433, "step_time": 23.551079765000395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7463849782943726, "epoch": 0.00434, "grad_norm": 0.10690201818943024, "kl": 0.13704814016819, "learning_rate": 7.999985340714028e-06, "loss": -0.0817, "step": 434, "step_time": 4.83376208899972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 4.4782609939575195, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.833889126777649, "epoch": 0.00435, "frac_reward_zero_std": 0.0, "grad_norm": 0.10047597438097, "kl": 0.11279113590717316, "learning_rate": 7.99998526695679e-06, "loss": -0.0733, "num_tokens": 12664683.0, "reward": -17.665672302246094, "reward_std": 19.238582611083984, "rewards/rollout_reward_func/mean": -17.665672302246094, "rewards/rollout_reward_func/std": 19.238582611083984, "sampling/importance_sampling_ratio/max": 1.3418409824371338, "sampling/importance_sampling_ratio/mean": 0.6026196479797363, "sampling/importance_sampling_ratio/min": 6.813926302129403e-05, "sampling/sampling_logp_difference/max": 1.5670372247695923, "sampling/sampling_logp_difference/mean": 0.27202847599983215, "step": 435, "step_time": 22.6419178979977 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.8422831892967224, "epoch": 0.00436, "grad_norm": 0.0997045710682869, "kl": 0.10951905697584152, "learning_rate": 7.999985193014467e-06, "loss": -0.0737, "step": 436, "step_time": 4.946773866999138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.46875, "completions/mean_terminated_length": 5.5217390060424805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.175841122865677, "epoch": 0.00437, "frac_reward_zero_std": 0.0, "grad_norm": 0.10729791969060898, "kl": 0.3084981944411993, "learning_rate": 7.999985118887056e-06, "loss": -0.0837, "num_tokens": 12732804.0, "reward": -20.4417724609375, "reward_std": 19.506816864013672, "rewards/rollout_reward_func/mean": -20.4417724609375, "rewards/rollout_reward_func/std": 19.506816864013672, "sampling/importance_sampling_ratio/max": 1.3589588403701782, "sampling/importance_sampling_ratio/mean": 0.5676668882369995, "sampling/importance_sampling_ratio/min": 5.235635480715928e-09, "sampling/sampling_logp_difference/max": 2.422943592071533, "sampling/sampling_logp_difference/mean": 0.3563418388366699, "step": 437, "step_time": 24.06270770000083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.182437241077423, "epoch": 0.00438, "grad_norm": 0.11158772557973862, "kl": 0.29419360868632793, "learning_rate": 7.99998504457456e-06, "loss": -0.0839, "step": 438, "step_time": 5.51556365900251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 6.090909481048584, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.391415596008301, "epoch": 0.00439, "frac_reward_zero_std": 0.0, "grad_norm": 0.12710043787956238, "kl": 0.1557745262980461, "learning_rate": 7.999984970076977e-06, "loss": -0.0986, "num_tokens": 12803215.0, "reward": -17.26525115966797, "reward_std": 21.566410064697266, "rewards/rollout_reward_func/mean": -17.26525115966797, "rewards/rollout_reward_func/std": 21.566410064697266, "sampling/importance_sampling_ratio/max": 1.7904646396636963, "sampling/importance_sampling_ratio/mean": 0.5313239097595215, "sampling/importance_sampling_ratio/min": 2.353921502162848e-08, "sampling/sampling_logp_difference/max": 1.9455653429031372, "sampling/sampling_logp_difference/mean": 0.32274898886680603, "step": 439, "step_time": 23.243521873995633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3940120935440063, "epoch": 0.0044, "grad_norm": 0.13096299767494202, "kl": 0.15345947816967964, "learning_rate": 7.999984895394308e-06, "loss": -0.099, "step": 440, "step_time": 5.590590493997297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.46875, "completions/mean_terminated_length": 5.550000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.650649666786194, "epoch": 0.00441, "frac_reward_zero_std": 0.0, "grad_norm": 0.15484300255775452, "kl": 0.09097659774124622, "learning_rate": 7.999984820526551e-06, "loss": -0.0837, "num_tokens": 12871477.0, "reward": -20.90436553955078, "reward_std": 22.91033935546875, "rewards/rollout_reward_func/mean": -20.90436553955078, "rewards/rollout_reward_func/std": 22.91033935546875, "sampling/importance_sampling_ratio/max": 1.212924838066101, "sampling/importance_sampling_ratio/mean": 0.45933809876441956, "sampling/importance_sampling_ratio/min": 8.081264581960568e-07, "sampling/sampling_logp_difference/max": 1.777116060256958, "sampling/sampling_logp_difference/mean": 0.40218544006347656, "step": 441, "step_time": 23.34575797599973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.65450918674469, "epoch": 0.00442, "grad_norm": 0.15939871966838837, "kl": 0.09035529289394617, "learning_rate": 7.999984745473708e-06, "loss": -0.0844, "step": 442, "step_time": 4.825434611000674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.46875, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4428164064884186, "epoch": 0.00443, "frac_reward_zero_std": 0.0, "grad_norm": 0.15658771991729736, "kl": 0.11530089564621449, "learning_rate": 7.99998467023578e-06, "loss": -0.1156, "num_tokens": 12938641.0, "reward": -22.292728424072266, "reward_std": 21.745027542114258, "rewards/rollout_reward_func/mean": -22.292728424072266, "rewards/rollout_reward_func/std": 21.745027542114258, "sampling/importance_sampling_ratio/max": 1.4437917470932007, "sampling/importance_sampling_ratio/mean": 0.5821301937103271, "sampling/importance_sampling_ratio/min": 1.0639031643222552e-05, "sampling/sampling_logp_difference/max": 1.9546699523925781, "sampling/sampling_logp_difference/mean": 0.3648707866668701, "step": 443, "step_time": 23.549418253000113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.444568455219269, "epoch": 0.00444, "grad_norm": 0.15864241123199463, "kl": 0.11495952866971493, "learning_rate": 7.999984594812764e-06, "loss": -0.1158, "step": 444, "step_time": 4.9678405260001455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2238839268684387, "epoch": 0.00445, "frac_reward_zero_std": 0.0, "grad_norm": 0.14913718402385712, "kl": 0.12142389267683029, "learning_rate": 7.999984519204662e-06, "loss": -0.0975, "num_tokens": 13009403.0, "reward": -18.743553161621094, "reward_std": 20.436100006103516, "rewards/rollout_reward_func/mean": -18.743553161621094, "rewards/rollout_reward_func/std": 20.436100006103516, "sampling/importance_sampling_ratio/max": 1.4558296203613281, "sampling/importance_sampling_ratio/mean": 0.6619424819946289, "sampling/importance_sampling_ratio/min": 4.1559292185411323e-07, "sampling/sampling_logp_difference/max": 1.8274741172790527, "sampling/sampling_logp_difference/mean": 0.3678933084011078, "step": 445, "step_time": 23.751702789000774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2206037044525146, "epoch": 0.00446, "grad_norm": 0.14981521666049957, "kl": 0.12334268540143967, "learning_rate": 7.999984443411473e-06, "loss": -0.0979, "step": 446, "step_time": 4.955082925998795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.90625, "completions/mean_terminated_length": 4.227272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.609289586544037, "epoch": 0.00447, "frac_reward_zero_std": 0.0, "grad_norm": 0.12813587486743927, "kl": 0.13391583412885666, "learning_rate": 7.999984367433198e-06, "loss": -0.0795, "num_tokens": 13079680.0, "reward": -20.651782989501953, "reward_std": 18.389265060424805, "rewards/rollout_reward_func/mean": -20.651782989501953, "rewards/rollout_reward_func/std": 18.389265060424805, "sampling/importance_sampling_ratio/max": 1.2619919776916504, "sampling/importance_sampling_ratio/mean": 0.619870662689209, "sampling/importance_sampling_ratio/min": 1.7268141777604384e-11, "sampling/sampling_logp_difference/max": 2.9432425498962402, "sampling/sampling_logp_difference/mean": 0.4593168795108795, "step": 447, "step_time": 24.180513256998893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.603493392467499, "epoch": 0.00448, "grad_norm": 0.12191624939441681, "kl": 0.134449640288949, "learning_rate": 7.999984291269835e-06, "loss": -0.0799, "step": 448, "step_time": 5.444580407998728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 4.950000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4105977416038513, "epoch": 0.00449, "frac_reward_zero_std": 0.0, "grad_norm": 0.19337055087089539, "kl": 0.09631808754056692, "learning_rate": 7.999984214921387e-06, "loss": -0.0973, "num_tokens": 13148148.0, "reward": -22.308134078979492, "reward_std": 17.92047119140625, "rewards/rollout_reward_func/mean": -22.308134078979492, "rewards/rollout_reward_func/std": 17.92047119140625, "sampling/importance_sampling_ratio/max": 1.3917006254196167, "sampling/importance_sampling_ratio/mean": 0.5782403349876404, "sampling/importance_sampling_ratio/min": 7.283317972905934e-05, "sampling/sampling_logp_difference/max": 1.6641919612884521, "sampling/sampling_logp_difference/mean": 0.34027212858200073, "step": 449, "step_time": 23.002608289001728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.401617646217346, "epoch": 0.0045, "grad_norm": 0.1979413628578186, "kl": 0.09812691435217857, "learning_rate": 7.99998413838785e-06, "loss": -0.0982, "step": 450, "step_time": 4.894644358997539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.409851551055908, "epoch": 0.00451, "frac_reward_zero_std": 0.0, "grad_norm": 0.1114494577050209, "kl": 0.22585982643067837, "learning_rate": 7.99998406166923e-06, "loss": -0.0786, "num_tokens": 13216752.0, "reward": -22.6497859954834, "reward_std": 18.21583366394043, "rewards/rollout_reward_func/mean": -22.6497859954834, "rewards/rollout_reward_func/std": 18.21583366394043, "sampling/importance_sampling_ratio/max": 1.2643169164657593, "sampling/importance_sampling_ratio/mean": 0.48056328296661377, "sampling/importance_sampling_ratio/min": 4.682754024543101e-07, "sampling/sampling_logp_difference/max": 2.4329326152801514, "sampling/sampling_logp_difference/mean": 0.377627432346344, "step": 451, "step_time": 24.641116646002047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4028943181037903, "epoch": 0.00452, "grad_norm": 0.11519069224596024, "kl": 0.22104527801275253, "learning_rate": 7.999983984765522e-06, "loss": -0.0786, "step": 452, "step_time": 4.835679496000012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.03125, "completions/mean_terminated_length": 5.388888835906982, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.539392113685608, "epoch": 0.00453, "frac_reward_zero_std": 0.0, "grad_norm": 0.09738200902938843, "kl": 0.09890759736299515, "learning_rate": 7.999983907676728e-06, "loss": -0.0732, "num_tokens": 13285017.0, "reward": -24.313142776489258, "reward_std": 17.885250091552734, "rewards/rollout_reward_func/mean": -24.313142776489258, "rewards/rollout_reward_func/std": 17.885250091552734, "sampling/importance_sampling_ratio/max": 1.2002710103988647, "sampling/importance_sampling_ratio/mean": 0.45948994159698486, "sampling/importance_sampling_ratio/min": 1.730371695884969e-05, "sampling/sampling_logp_difference/max": 2.0034425258636475, "sampling/sampling_logp_difference/mean": 0.35454651713371277, "step": 453, "step_time": 21.130092850000437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.532060146331787, "epoch": 0.00454, "grad_norm": 0.09928874671459198, "kl": 0.10209068190306425, "learning_rate": 7.999983830402848e-06, "loss": -0.0741, "step": 454, "step_time": 4.83509691600193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.807692527770996, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7609526962041855, "epoch": 0.00455, "frac_reward_zero_std": 0.0, "grad_norm": 0.16635385155677795, "kl": 0.41507900319993496, "learning_rate": 7.99998375294388e-06, "loss": -0.0751, "num_tokens": 13354848.0, "reward": -15.187000274658203, "reward_std": 19.553485870361328, "rewards/rollout_reward_func/mean": -15.187000274658203, "rewards/rollout_reward_func/std": 19.553485870361328, "sampling/importance_sampling_ratio/max": 1.3135050535202026, "sampling/importance_sampling_ratio/mean": 0.6937004923820496, "sampling/importance_sampling_ratio/min": 0.00022564570826943964, "sampling/sampling_logp_difference/max": 1.7088253498077393, "sampling/sampling_logp_difference/mean": 0.2613747715950012, "step": 455, "step_time": 24.43487491199994 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.7520268559455872, "epoch": 0.00456, "grad_norm": 0.16457119584083557, "kl": 0.3992520831525326, "learning_rate": 7.999983675299827e-06, "loss": -0.0765, "step": 456, "step_time": 4.80342733100224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.125, "completions/mean_terminated_length": 7.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6985251903533936, "epoch": 0.00457, "frac_reward_zero_std": 0.0, "grad_norm": 0.11147643625736237, "kl": 0.10085460543632507, "learning_rate": 7.999983597470686e-06, "loss": -0.0711, "num_tokens": 13424667.0, "reward": -15.757342338562012, "reward_std": 18.03775978088379, "rewards/rollout_reward_func/mean": -15.757342338562012, "rewards/rollout_reward_func/std": 18.03775978088379, "sampling/importance_sampling_ratio/max": 1.3562370538711548, "sampling/importance_sampling_ratio/mean": 0.46951016783714294, "sampling/importance_sampling_ratio/min": 2.683265620362363e-06, "sampling/sampling_logp_difference/max": 1.4780898094177246, "sampling/sampling_logp_difference/mean": 0.38634923100471497, "step": 457, "step_time": 22.873697833001643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7031753063201904, "epoch": 0.00458, "grad_norm": 0.11287412792444229, "kl": 0.10466177575290203, "learning_rate": 7.99998351945646e-06, "loss": -0.0716, "step": 458, "step_time": 5.372270448999188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.409090995788574, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.88693767786026, "epoch": 0.00459, "frac_reward_zero_std": 0.0, "grad_norm": 0.32305917143821716, "kl": 0.8924185186624527, "learning_rate": 7.999983441257147e-06, "loss": -0.0641, "num_tokens": 13495010.0, "reward": -17.267457962036133, "reward_std": 19.17966651916504, "rewards/rollout_reward_func/mean": -17.267457962036133, "rewards/rollout_reward_func/std": 19.17966651916504, "sampling/importance_sampling_ratio/max": 2.0094292163848877, "sampling/importance_sampling_ratio/mean": 0.6262249946594238, "sampling/importance_sampling_ratio/min": 1.1692230827975436e-06, "sampling/sampling_logp_difference/max": 2.3813042640686035, "sampling/sampling_logp_difference/mean": 0.37187397480010986, "step": 459, "step_time": 25.58113031299945 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 1.8792857825756073, "epoch": 0.0046, "grad_norm": 0.30079618096351624, "kl": 0.8993608728051186, "learning_rate": 7.999983362872746e-06, "loss": -0.0646, "step": 460, "step_time": 4.891852184999152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 5.782608985900879, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5012333393096924, "epoch": 0.00461, "frac_reward_zero_std": 0.0, "grad_norm": 0.12849141657352448, "kl": 0.1957287471741438, "learning_rate": 7.999983284303261e-06, "loss": -0.0907, "num_tokens": 13567712.0, "reward": -20.429489135742188, "reward_std": 15.905797958374023, "rewards/rollout_reward_func/mean": -20.429489135742188, "rewards/rollout_reward_func/std": 15.905797004699707, "sampling/importance_sampling_ratio/max": 1.445024847984314, "sampling/importance_sampling_ratio/mean": 0.49036335945129395, "sampling/importance_sampling_ratio/min": 5.187794727135042e-08, "sampling/sampling_logp_difference/max": 2.480384349822998, "sampling/sampling_logp_difference/mean": 0.4096663296222687, "step": 461, "step_time": 24.89522989499892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5064748525619507, "epoch": 0.00462, "grad_norm": 0.1271943300962448, "kl": 0.20611068978905678, "learning_rate": 7.999983205548689e-06, "loss": -0.0914, "step": 462, "step_time": 5.067577584000901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.144755184650421, "epoch": 0.00463, "frac_reward_zero_std": 0.0, "grad_norm": 0.17634016275405884, "kl": 0.4394150897860527, "learning_rate": 7.99998312660903e-06, "loss": -0.0702, "num_tokens": 13639243.0, "reward": -17.657068252563477, "reward_std": 18.932065963745117, "rewards/rollout_reward_func/mean": -17.657068252563477, "rewards/rollout_reward_func/std": 18.932064056396484, "sampling/importance_sampling_ratio/max": 1.3324079513549805, "sampling/importance_sampling_ratio/mean": 0.6417543888092041, "sampling/importance_sampling_ratio/min": 8.05757736088708e-07, "sampling/sampling_logp_difference/max": 1.6408665180206299, "sampling/sampling_logp_difference/mean": 0.3906155228614807, "step": 463, "step_time": 24.753898807999576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.158180683851242, "epoch": 0.00464, "grad_norm": 0.1792343705892563, "kl": 0.4155322387814522, "learning_rate": 7.999983047484286e-06, "loss": -0.071, "step": 464, "step_time": 5.011319346001983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 4.047619342803955, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7741120159626007, "epoch": 0.00465, "frac_reward_zero_std": 0.0, "grad_norm": 0.14170028269290924, "kl": 0.2576921321451664, "learning_rate": 7.999982968174453e-06, "loss": -0.0776, "num_tokens": 13711660.0, "reward": -18.7150936126709, "reward_std": 19.890445709228516, "rewards/rollout_reward_func/mean": -18.7150936126709, "rewards/rollout_reward_func/std": 19.890443801879883, "sampling/importance_sampling_ratio/max": 1.467609167098999, "sampling/importance_sampling_ratio/mean": 0.6600280404090881, "sampling/importance_sampling_ratio/min": 1.6485961168655194e-05, "sampling/sampling_logp_difference/max": 1.9389703273773193, "sampling/sampling_logp_difference/mean": 0.31008410453796387, "step": 465, "step_time": 23.880688445002306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7853277623653412, "epoch": 0.00466, "grad_norm": 0.14229901134967804, "kl": 0.24608038738369942, "learning_rate": 7.999982888679535e-06, "loss": -0.078, "step": 466, "step_time": 5.066473664001023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.285714149475098, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3971756398677826, "epoch": 0.00467, "frac_reward_zero_std": 0.0, "grad_norm": 0.12393780052661896, "kl": 0.2511787060648203, "learning_rate": 7.99998280899953e-06, "loss": -0.1084, "num_tokens": 13784297.0, "reward": -18.07855987548828, "reward_std": 20.387256622314453, "rewards/rollout_reward_func/mean": -18.07855987548828, "rewards/rollout_reward_func/std": 20.38725471496582, "sampling/importance_sampling_ratio/max": 1.5368783473968506, "sampling/importance_sampling_ratio/mean": 0.6270623207092285, "sampling/importance_sampling_ratio/min": 7.33571027922153e-07, "sampling/sampling_logp_difference/max": 2.270580291748047, "sampling/sampling_logp_difference/mean": 0.43602123856544495, "step": 467, "step_time": 24.490006853999148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4094964265823364, "epoch": 0.00468, "grad_norm": 0.12318355590105057, "kl": 0.24312501400709152, "learning_rate": 7.999982729134439e-06, "loss": -0.1085, "step": 468, "step_time": 5.149679607999133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 5.300000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2238778173923492, "epoch": 0.00469, "frac_reward_zero_std": 0.0, "grad_norm": 0.0884491577744484, "kl": 0.1667272411286831, "learning_rate": 7.99998264908426e-06, "loss": -0.0834, "num_tokens": 13851648.0, "reward": -21.18474769592285, "reward_std": 17.450742721557617, "rewards/rollout_reward_func/mean": -21.18474769592285, "rewards/rollout_reward_func/std": 17.450742721557617, "sampling/importance_sampling_ratio/max": 1.5062971115112305, "sampling/importance_sampling_ratio/mean": 0.5365365743637085, "sampling/importance_sampling_ratio/min": 1.5621779994035023e-06, "sampling/sampling_logp_difference/max": 2.0282058715820312, "sampling/sampling_logp_difference/mean": 0.35857850313186646, "step": 469, "step_time": 23.823659015000885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2319892048835754, "epoch": 0.0047, "grad_norm": 0.08903330564498901, "kl": 0.16537946462631226, "learning_rate": 7.999982568848996e-06, "loss": -0.0834, "step": 470, "step_time": 5.125006416003089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9761815667152405, "epoch": 0.00471, "frac_reward_zero_std": 0.0, "grad_norm": 0.16452357172966003, "kl": 0.25390565022826195, "learning_rate": 7.999982488428647e-06, "loss": -0.0914, "num_tokens": 13921927.0, "reward": -15.437353134155273, "reward_std": 21.222515106201172, "rewards/rollout_reward_func/mean": -15.437353134155273, "rewards/rollout_reward_func/std": 21.22251319885254, "sampling/importance_sampling_ratio/max": 1.6942410469055176, "sampling/importance_sampling_ratio/mean": 0.6845800876617432, "sampling/importance_sampling_ratio/min": 2.091933856718242e-05, "sampling/sampling_logp_difference/max": 1.8865700960159302, "sampling/sampling_logp_difference/mean": 0.3709739148616791, "step": 471, "step_time": 25.760237328000585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9783662557601929, "epoch": 0.00472, "grad_norm": 0.16651582717895508, "kl": 0.2691880539059639, "learning_rate": 7.99998240782321e-06, "loss": -0.0918, "step": 472, "step_time": 4.85879562199807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.84375, "completions/mean_terminated_length": 4.550000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4554120302200317, "epoch": 0.00473, "frac_reward_zero_std": 0.0, "grad_norm": 0.17190231382846832, "kl": 0.2385566383600235, "learning_rate": 7.999982327032687e-06, "loss": -0.0833, "num_tokens": 13991425.0, "reward": -18.92363739013672, "reward_std": 18.730924606323242, "rewards/rollout_reward_func/mean": -18.92363739013672, "rewards/rollout_reward_func/std": 18.730924606323242, "sampling/importance_sampling_ratio/max": 1.4678200483322144, "sampling/importance_sampling_ratio/mean": 0.5338313579559326, "sampling/importance_sampling_ratio/min": 2.1446704134842065e-10, "sampling/sampling_logp_difference/max": 2.3687543869018555, "sampling/sampling_logp_difference/mean": 0.47444167733192444, "step": 473, "step_time": 22.78291440599969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.452897012233734, "epoch": 0.00474, "grad_norm": 0.1742689460515976, "kl": 0.24164031818509102, "learning_rate": 7.999982246057078e-06, "loss": -0.0843, "step": 474, "step_time": 4.9523998929998925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.59375, "completions/mean_terminated_length": 5.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7661134600639343, "epoch": 0.00475, "frac_reward_zero_std": 0.0, "grad_norm": 0.18091538548469543, "kl": 0.24852003529667854, "learning_rate": 7.999982164896383e-06, "loss": -0.1078, "num_tokens": 14060772.0, "reward": -22.483924865722656, "reward_std": 17.270160675048828, "rewards/rollout_reward_func/mean": -22.483924865722656, "rewards/rollout_reward_func/std": 17.270160675048828, "sampling/importance_sampling_ratio/max": 1.4833248853683472, "sampling/importance_sampling_ratio/mean": 0.4993225634098053, "sampling/importance_sampling_ratio/min": 1.406827779271147e-11, "sampling/sampling_logp_difference/max": 2.5150306224823, "sampling/sampling_logp_difference/mean": 0.5434218645095825, "step": 475, "step_time": 24.038429121001172 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.011904762126505375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011904762126505375, "entropy": 2.769302546977997, "epoch": 0.00476, "grad_norm": 0.1548774093389511, "kl": 0.23626531660556793, "learning_rate": 7.9999820835506e-06, "loss": -0.1088, "step": 476, "step_time": 4.863332951998018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.09375, "completions/mean_terminated_length": 4.882352828979492, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.765878736972809, "epoch": 0.00477, "frac_reward_zero_std": 0.0, "grad_norm": 0.15278182923793793, "kl": 0.15891127288341522, "learning_rate": 7.999982002019732e-06, "loss": -0.101, "num_tokens": 14129334.0, "reward": -19.123443603515625, "reward_std": 22.426645278930664, "rewards/rollout_reward_func/mean": -19.123443603515625, "rewards/rollout_reward_func/std": 22.426645278930664, "sampling/importance_sampling_ratio/max": 1.4496948719024658, "sampling/importance_sampling_ratio/mean": 0.5647895336151123, "sampling/importance_sampling_ratio/min": 2.2583437342404977e-09, "sampling/sampling_logp_difference/max": 2.2918009757995605, "sampling/sampling_logp_difference/mean": 0.4923846423625946, "step": 477, "step_time": 22.648210540002765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7617345452308655, "epoch": 0.00478, "grad_norm": 0.15195630490779877, "kl": 0.16341817751526833, "learning_rate": 7.999981920303777e-06, "loss": -0.1016, "step": 478, "step_time": 4.9054911469993385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.96875, "completions/mean_terminated_length": 5.285714149475098, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3302236795425415, "epoch": 0.00479, "frac_reward_zero_std": 0.0, "grad_norm": 0.20164696872234344, "kl": 0.3145768903195858, "learning_rate": 7.999981838402736e-06, "loss": -0.0869, "num_tokens": 14200754.0, "reward": -16.303070068359375, "reward_std": 19.293596267700195, "rewards/rollout_reward_func/mean": -16.303070068359375, "rewards/rollout_reward_func/std": 19.293594360351562, "sampling/importance_sampling_ratio/max": 1.4637154340744019, "sampling/importance_sampling_ratio/mean": 0.5283418893814087, "sampling/importance_sampling_ratio/min": 1.5867628462729044e-05, "sampling/sampling_logp_difference/max": 1.622253179550171, "sampling/sampling_logp_difference/mean": 0.3858588933944702, "step": 479, "step_time": 24.30372077800348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3320031464099884, "epoch": 0.0048, "grad_norm": 0.20680350065231323, "kl": 0.3253371100872755, "learning_rate": 7.999981756316607e-06, "loss": -0.0878, "step": 480, "step_time": 5.38770305199796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 3.954545497894287, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.102576494216919, "epoch": 0.00481, "frac_reward_zero_std": 0.0, "grad_norm": 0.10995849967002869, "kl": 0.6100419759750366, "learning_rate": 7.999981674045395e-06, "loss": -0.0558, "num_tokens": 14272814.0, "reward": -14.629746437072754, "reward_std": 15.802964210510254, "rewards/rollout_reward_func/mean": -14.629746437072754, "rewards/rollout_reward_func/std": 15.802964210510254, "sampling/importance_sampling_ratio/max": 1.5422216653823853, "sampling/importance_sampling_ratio/mean": 0.6478136777877808, "sampling/importance_sampling_ratio/min": 4.486533271119697e-06, "sampling/sampling_logp_difference/max": 1.9467393159866333, "sampling/sampling_logp_difference/mean": 0.38967978954315186, "step": 481, "step_time": 24.681438364003043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1040723621845245, "epoch": 0.00482, "grad_norm": 0.11269649863243103, "kl": 0.669157937169075, "learning_rate": 7.999981591589094e-06, "loss": -0.0556, "step": 482, "step_time": 4.989728557997296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 5.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8619046807289124, "epoch": 0.00483, "frac_reward_zero_std": 0.0, "grad_norm": 0.22720550000667572, "kl": 0.3538065478205681, "learning_rate": 7.999981508947708e-06, "loss": -0.0916, "num_tokens": 14341194.0, "reward": -17.266674041748047, "reward_std": 17.808765411376953, "rewards/rollout_reward_func/mean": -17.266674041748047, "rewards/rollout_reward_func/std": 17.80876350402832, "sampling/importance_sampling_ratio/max": 1.8972443342208862, "sampling/importance_sampling_ratio/mean": 0.6356805562973022, "sampling/importance_sampling_ratio/min": 1.2033610801154282e-06, "sampling/sampling_logp_difference/max": 1.7171447277069092, "sampling/sampling_logp_difference/mean": 0.34239232540130615, "step": 483, "step_time": 23.93686269600039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8660557568073273, "epoch": 0.00484, "grad_norm": 0.22635510563850403, "kl": 0.35554688796401024, "learning_rate": 7.999981426121235e-06, "loss": -0.0928, "step": 484, "step_time": 4.9121973069995875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.869565486907959, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.954788625240326, "epoch": 0.00485, "frac_reward_zero_std": 0.0, "grad_norm": 0.10807942599058151, "kl": 0.37654323130846024, "learning_rate": 7.999981343109675e-06, "loss": -0.0929, "num_tokens": 14412793.0, "reward": -18.24112892150879, "reward_std": 20.905046463012695, "rewards/rollout_reward_func/mean": -18.24112892150879, "rewards/rollout_reward_func/std": 20.905046463012695, "sampling/importance_sampling_ratio/max": 1.545974850654602, "sampling/importance_sampling_ratio/mean": 0.615673840045929, "sampling/importance_sampling_ratio/min": 4.485705721890554e-05, "sampling/sampling_logp_difference/max": 1.5792133808135986, "sampling/sampling_logp_difference/mean": 0.3477139472961426, "step": 485, "step_time": 25.167028576999655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9562273621559143, "epoch": 0.00486, "grad_norm": 0.10843812674283981, "kl": 0.3706241361796856, "learning_rate": 7.99998125991303e-06, "loss": -0.0926, "step": 486, "step_time": 5.102566645999104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.71875, "completions/mean_terminated_length": 4.349999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3034667372703552, "epoch": 0.00487, "frac_reward_zero_std": 0.0, "grad_norm": 0.1551945060491562, "kl": 0.8248259127140045, "learning_rate": 7.999981176531298e-06, "loss": -0.0854, "num_tokens": 14480603.0, "reward": -22.135881423950195, "reward_std": 20.715036392211914, "rewards/rollout_reward_func/mean": -22.135881423950195, "rewards/rollout_reward_func/std": 20.715036392211914, "sampling/importance_sampling_ratio/max": 1.3995540142059326, "sampling/importance_sampling_ratio/mean": 0.46646153926849365, "sampling/importance_sampling_ratio/min": 4.113118393433979e-06, "sampling/sampling_logp_difference/max": 2.423207998275757, "sampling/sampling_logp_difference/mean": 0.4635903239250183, "step": 487, "step_time": 22.88290219800001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3053462505340576, "epoch": 0.00488, "grad_norm": 0.15767955780029297, "kl": 0.7664887681603432, "learning_rate": 7.999981092964481e-06, "loss": -0.0867, "step": 488, "step_time": 4.8676047310018475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 9.28125, "completions/mean_terminated_length": 4.055555820465088, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.646563410758972, "epoch": 0.00489, "frac_reward_zero_std": 0.0, "grad_norm": 0.12437555193901062, "kl": 0.5347709506750107, "learning_rate": 7.999981009212575e-06, "loss": -0.083, "num_tokens": 14548301.0, "reward": -20.995445251464844, "reward_std": 18.724903106689453, "rewards/rollout_reward_func/mean": -20.995445251464844, "rewards/rollout_reward_func/std": 18.724905014038086, "sampling/importance_sampling_ratio/max": 1.4202111959457397, "sampling/importance_sampling_ratio/mean": 0.45821765065193176, "sampling/importance_sampling_ratio/min": 8.241729965163813e-09, "sampling/sampling_logp_difference/max": 2.4767823219299316, "sampling/sampling_logp_difference/mean": 0.4561159610748291, "step": 489, "step_time": 23.853560622997975 }, { "clip_ratio/high_max": 0.043154762126505375, "clip_ratio/high_mean": 0.010788690531626344, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010788690531626344, "entropy": 2.650606632232666, "epoch": 0.0049, "grad_norm": 0.10928322374820709, "kl": 0.4580114372074604, "learning_rate": 7.999980925275585e-06, "loss": -0.0832, "step": 490, "step_time": 5.406613083001503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 9.65625, "completions/mean_terminated_length": 4.058823585510254, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1833924055099487, "epoch": 0.00491, "frac_reward_zero_std": 0.0, "grad_norm": 0.14983606338500977, "kl": 0.34758199378848076, "learning_rate": 7.999980841153509e-06, "loss": -0.099, "num_tokens": 14616495.0, "reward": -21.96834945678711, "reward_std": 20.151134490966797, "rewards/rollout_reward_func/mean": -21.96834945678711, "rewards/rollout_reward_func/std": 20.151132583618164, "sampling/importance_sampling_ratio/max": 1.577589511871338, "sampling/importance_sampling_ratio/mean": 0.5748796463012695, "sampling/importance_sampling_ratio/min": 2.545327333791647e-06, "sampling/sampling_logp_difference/max": 1.9050142765045166, "sampling/sampling_logp_difference/mean": 0.39918428659439087, "step": 491, "step_time": 24.255110889000207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1834853291511536, "epoch": 0.00492, "grad_norm": 0.15023919939994812, "kl": 0.3400149755179882, "learning_rate": 7.999980756846344e-06, "loss": -0.0991, "step": 492, "step_time": 5.068287847998363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.363636493682861, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.974174439907074, "epoch": 0.00493, "frac_reward_zero_std": 0.0, "grad_norm": 0.1389477252960205, "kl": 0.3782857432961464, "learning_rate": 7.999980672354094e-06, "loss": -0.0804, "num_tokens": 14690645.0, "reward": -16.126100540161133, "reward_std": 20.87386703491211, "rewards/rollout_reward_func/mean": -16.126100540161133, "rewards/rollout_reward_func/std": 20.87386703491211, "sampling/importance_sampling_ratio/max": 1.3682857751846313, "sampling/importance_sampling_ratio/mean": 0.5957636833190918, "sampling/importance_sampling_ratio/min": 3.3083956623158883e-06, "sampling/sampling_logp_difference/max": 2.275355339050293, "sampling/sampling_logp_difference/mean": 0.37000828981399536, "step": 493, "step_time": 24.734983175001616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.978302240371704, "epoch": 0.00494, "grad_norm": 0.13513009250164032, "kl": 0.360340578481555, "learning_rate": 7.99998058767676e-06, "loss": -0.0801, "step": 494, "step_time": 5.102995043998817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 4.117647171020508, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5223376154899597, "epoch": 0.00495, "frac_reward_zero_std": 0.0, "grad_norm": 0.13421611487865448, "kl": 0.22253388911485672, "learning_rate": 7.999980502814336e-06, "loss": -0.0842, "num_tokens": 14761572.0, "reward": -22.935823440551758, "reward_std": 18.24248695373535, "rewards/rollout_reward_func/mean": -22.935823440551758, "rewards/rollout_reward_func/std": 18.24248695373535, "sampling/importance_sampling_ratio/max": 1.3423089981079102, "sampling/importance_sampling_ratio/mean": 0.41088682413101196, "sampling/importance_sampling_ratio/min": 2.3730752218398266e-05, "sampling/sampling_logp_difference/max": 1.7484036684036255, "sampling/sampling_logp_difference/mean": 0.4200107753276825, "step": 495, "step_time": 22.769775234999543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5225143432617188, "epoch": 0.00496, "grad_norm": 0.13208302855491638, "kl": 0.2119848746806383, "learning_rate": 7.999980417766827e-06, "loss": -0.0843, "step": 496, "step_time": 4.998490790998403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.84375, "completions/mean_terminated_length": 4.411764621734619, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.440274327993393, "epoch": 0.00497, "frac_reward_zero_std": 0.0, "grad_norm": 0.1358642429113388, "kl": 0.1893120352178812, "learning_rate": 7.999980332534234e-06, "loss": -0.0584, "num_tokens": 14833811.0, "reward": -19.315654754638672, "reward_std": 16.742454528808594, "rewards/rollout_reward_func/mean": -19.315654754638672, "rewards/rollout_reward_func/std": 16.742454528808594, "sampling/importance_sampling_ratio/max": 1.3555660247802734, "sampling/importance_sampling_ratio/mean": 0.4688986837863922, "sampling/importance_sampling_ratio/min": 8.130371497827582e-06, "sampling/sampling_logp_difference/max": 1.5674673318862915, "sampling/sampling_logp_difference/mean": 0.38107433915138245, "step": 497, "step_time": 24.035278310002468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.446887969970703, "epoch": 0.00498, "grad_norm": 0.1389978677034378, "kl": 0.18457390926778316, "learning_rate": 7.999980247116551e-06, "loss": -0.0584, "step": 498, "step_time": 4.951193460001377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 4.3684210777282715, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4018044769763947, "epoch": 0.00499, "frac_reward_zero_std": 0.0, "grad_norm": 0.19411654770374298, "kl": 0.1772363968193531, "learning_rate": 7.999980161513784e-06, "loss": -0.0686, "num_tokens": 14903347.0, "reward": -21.163070678710938, "reward_std": 16.02184295654297, "rewards/rollout_reward_func/mean": -21.163070678710938, "rewards/rollout_reward_func/std": 16.021841049194336, "sampling/importance_sampling_ratio/max": 1.3939342498779297, "sampling/importance_sampling_ratio/mean": 0.5620460510253906, "sampling/importance_sampling_ratio/min": 4.128478394704871e-07, "sampling/sampling_logp_difference/max": 2.1232423782348633, "sampling/sampling_logp_difference/mean": 0.35820528864860535, "step": 499, "step_time": 23.941877537998153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.399316966533661, "epoch": 0.005, "grad_norm": 0.20594853162765503, "kl": 0.17596573010087013, "learning_rate": 7.999980075725931e-06, "loss": -0.0688, "step": 500, "step_time": 5.281601324002622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 4.526315689086914, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.791247248649597, "epoch": 0.00501, "frac_reward_zero_std": 0.0, "grad_norm": 0.13271191716194153, "kl": 0.1520271971821785, "learning_rate": 7.999979989752992e-06, "loss": -0.0913, "num_tokens": 14971783.0, "reward": -22.421531677246094, "reward_std": 15.577054977416992, "rewards/rollout_reward_func/mean": -22.421531677246094, "rewards/rollout_reward_func/std": 15.577054977416992, "sampling/importance_sampling_ratio/max": 1.4488033056259155, "sampling/importance_sampling_ratio/mean": 0.5161742568016052, "sampling/importance_sampling_ratio/min": 2.1953017181886025e-08, "sampling/sampling_logp_difference/max": 2.06265926361084, "sampling/sampling_logp_difference/mean": 0.44650915265083313, "step": 501, "step_time": 22.416990233001343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7895249128341675, "epoch": 0.00502, "grad_norm": 0.13493138551712036, "kl": 0.15373935922980309, "learning_rate": 7.999979903594967e-06, "loss": -0.0918, "step": 502, "step_time": 4.836391084998468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.384615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9301931858062744, "epoch": 0.00503, "frac_reward_zero_std": 0.0, "grad_norm": 0.18135777115821838, "kl": 0.26021165773272514, "learning_rate": 7.999979817251854e-06, "loss": -0.0846, "num_tokens": 15041148.0, "reward": -14.786160469055176, "reward_std": 19.54051971435547, "rewards/rollout_reward_func/mean": -14.786160469055176, "rewards/rollout_reward_func/std": 19.54051971435547, "sampling/importance_sampling_ratio/max": 1.476239800453186, "sampling/importance_sampling_ratio/mean": 0.6744074821472168, "sampling/importance_sampling_ratio/min": 0.0001151088290498592, "sampling/sampling_logp_difference/max": 1.4291642904281616, "sampling/sampling_logp_difference/mean": 0.31869328022003174, "step": 503, "step_time": 23.78969867199885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9293220341205597, "epoch": 0.00504, "grad_norm": 0.19135065376758575, "kl": 0.251952912658453, "learning_rate": 7.999979730723655e-06, "loss": -0.0854, "step": 504, "step_time": 4.921056891000262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.46875, "completions/mean_terminated_length": 5.588235378265381, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.8982162475585938, "epoch": 0.00505, "frac_reward_zero_std": 0.0, "grad_norm": 0.15251515805721283, "kl": 0.10943751409649849, "learning_rate": 7.99997964401037e-06, "loss": -0.101, "num_tokens": 15107648.0, "reward": -25.01186752319336, "reward_std": 22.028100967407227, "rewards/rollout_reward_func/mean": -25.01186752319336, "rewards/rollout_reward_func/std": 22.02810287475586, "sampling/importance_sampling_ratio/max": 1.4776523113250732, "sampling/importance_sampling_ratio/mean": 0.4303729832172394, "sampling/importance_sampling_ratio/min": 3.108796136075398e-07, "sampling/sampling_logp_difference/max": 2.414944648742676, "sampling/sampling_logp_difference/mean": 0.4763700067996979, "step": 505, "step_time": 20.97922166799981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.897136092185974, "epoch": 0.00506, "grad_norm": 0.14679530262947083, "kl": 0.11125560291111469, "learning_rate": 7.999979557112e-06, "loss": -0.1016, "step": 506, "step_time": 5.469214398999611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 4.636363983154297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.26311057806015, "epoch": 0.00507, "frac_reward_zero_std": 0.0, "grad_norm": 0.18783023953437805, "kl": 0.2561366334557533, "learning_rate": 7.999979470028542e-06, "loss": -0.0693, "num_tokens": 15175185.0, "reward": -17.510101318359375, "reward_std": 19.969514846801758, "rewards/rollout_reward_func/mean": -17.510101318359375, "rewards/rollout_reward_func/std": 19.969514846801758, "sampling/importance_sampling_ratio/max": 1.2057515382766724, "sampling/importance_sampling_ratio/mean": 0.502650260925293, "sampling/importance_sampling_ratio/min": 1.2661064197061478e-08, "sampling/sampling_logp_difference/max": 2.2887141704559326, "sampling/sampling_logp_difference/mean": 0.405100017786026, "step": 507, "step_time": 24.19272692999948 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.2620075941085815, "epoch": 0.00508, "grad_norm": 0.1812216192483902, "kl": 0.2560460492968559, "learning_rate": 7.99997938276e-06, "loss": -0.0694, "step": 508, "step_time": 5.4763821919987095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.458333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.741052433848381, "epoch": 0.00509, "frac_reward_zero_std": 0.0, "grad_norm": 0.12168125063180923, "kl": 0.231058482080698, "learning_rate": 7.999979295306369e-06, "loss": -0.0936, "num_tokens": 15247125.0, "reward": -11.583511352539062, "reward_std": 19.768598556518555, "rewards/rollout_reward_func/mean": -11.583511352539062, "rewards/rollout_reward_func/std": 19.768596649169922, "sampling/importance_sampling_ratio/max": 1.7143174409866333, "sampling/importance_sampling_ratio/mean": 0.7079207897186279, "sampling/importance_sampling_ratio/min": 8.335004508808197e-08, "sampling/sampling_logp_difference/max": 2.0893070697784424, "sampling/sampling_logp_difference/mean": 0.3051382303237915, "step": 509, "step_time": 24.95626230400012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7446381449699402, "epoch": 0.0051, "grad_norm": 0.12006252259016037, "kl": 0.2216223180294037, "learning_rate": 7.999979207667654e-06, "loss": -0.0934, "step": 510, "step_time": 5.561620020000191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.387643337249756, "epoch": 0.00511, "frac_reward_zero_std": 0.0, "grad_norm": 0.14623931050300598, "kl": 0.1996527910232544, "learning_rate": 7.999979119843853e-06, "loss": -0.1069, "num_tokens": 15321702.0, "reward": -19.08687973022461, "reward_std": 18.829059600830078, "rewards/rollout_reward_func/mean": -19.08687973022461, "rewards/rollout_reward_func/std": 18.829059600830078, "sampling/importance_sampling_ratio/max": 1.5666577816009521, "sampling/importance_sampling_ratio/mean": 0.5744220614433289, "sampling/importance_sampling_ratio/min": 6.741191782566602e-07, "sampling/sampling_logp_difference/max": 2.0639140605926514, "sampling/sampling_logp_difference/mean": 0.40878576040267944, "step": 511, "step_time": 24.342081448998215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3937729001045227, "epoch": 0.00512, "grad_norm": 0.14478646218776703, "kl": 0.19011089578270912, "learning_rate": 7.999979031834965e-06, "loss": -0.1071, "step": 512, "step_time": 5.050928856002429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 5.18181848526001, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4113646745681763, "epoch": 0.00513, "frac_reward_zero_std": 0.0, "grad_norm": 0.15538457036018372, "kl": 0.1487669600173831, "learning_rate": 7.99997894364099e-06, "loss": -0.0883, "num_tokens": 15397994.0, "reward": -22.932497024536133, "reward_std": 20.193836212158203, "rewards/rollout_reward_func/mean": -22.932497024536133, "rewards/rollout_reward_func/std": 20.193836212158203, "sampling/importance_sampling_ratio/max": 1.2902387380599976, "sampling/importance_sampling_ratio/mean": 0.507900595664978, "sampling/importance_sampling_ratio/min": 3.0356576985468564e-07, "sampling/sampling_logp_difference/max": 2.2184362411499023, "sampling/sampling_logp_difference/mean": 0.3807198107242584, "step": 513, "step_time": 25.555862460001663 }, { "clip_ratio/high_max": 0.040277778171002865, "clip_ratio/high_mean": 0.010069444542750716, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010069444542750716, "entropy": 2.4091996550559998, "epoch": 0.00514, "grad_norm": 0.12447121739387512, "kl": 0.14173655398190022, "learning_rate": 7.99997885526193e-06, "loss": -0.0895, "step": 514, "step_time": 5.322010605998003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 5.782608985900879, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1298939287662506, "epoch": 0.00515, "frac_reward_zero_std": 0.0, "grad_norm": 0.09805925190448761, "kl": 0.1750204749405384, "learning_rate": 7.999978766697782e-06, "loss": -0.0918, "num_tokens": 15476720.0, "reward": -21.9211483001709, "reward_std": 22.463886260986328, "rewards/rollout_reward_func/mean": -21.9211483001709, "rewards/rollout_reward_func/std": 22.463884353637695, "sampling/importance_sampling_ratio/max": 1.675930380821228, "sampling/importance_sampling_ratio/mean": 0.5280312299728394, "sampling/importance_sampling_ratio/min": 4.1599065298214555e-05, "sampling/sampling_logp_difference/max": 1.6939330101013184, "sampling/sampling_logp_difference/mean": 0.3495376706123352, "step": 515, "step_time": 26.385464587998285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.133001983165741, "epoch": 0.00516, "grad_norm": 0.09725520759820938, "kl": 0.17437801137566566, "learning_rate": 7.99997867794855e-06, "loss": -0.0915, "step": 516, "step_time": 5.9798627119998855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.9198020696640015, "epoch": 0.00517, "frac_reward_zero_std": 0.0, "grad_norm": 0.15334562957286835, "kl": 0.1181060029193759, "learning_rate": 7.99997858901423e-06, "loss": -0.0881, "num_tokens": 15552682.0, "reward": -24.892366409301758, "reward_std": 24.490673065185547, "rewards/rollout_reward_func/mean": -24.892366409301758, "rewards/rollout_reward_func/std": 24.490673065185547, "sampling/importance_sampling_ratio/max": 1.2403879165649414, "sampling/importance_sampling_ratio/mean": 0.38255763053894043, "sampling/importance_sampling_ratio/min": 7.635565957286872e-09, "sampling/sampling_logp_difference/max": 2.4428317546844482, "sampling/sampling_logp_difference/mean": 0.47919777035713196, "step": 517, "step_time": 25.141423070999736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.916719675064087, "epoch": 0.00518, "grad_norm": 0.1566312462091446, "kl": 0.11347010172903538, "learning_rate": 7.999978499894826e-06, "loss": -0.0893, "step": 518, "step_time": 5.659394375001284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.84375, "completions/mean_terminated_length": 4.550000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3425883650779724, "epoch": 0.00519, "frac_reward_zero_std": 0.0, "grad_norm": 0.12020807713270187, "kl": 0.1415723329409957, "learning_rate": 7.999978410590335e-06, "loss": -0.1034, "num_tokens": 15630225.0, "reward": -23.615699768066406, "reward_std": 22.718847274780273, "rewards/rollout_reward_func/mean": -23.615699768066406, "rewards/rollout_reward_func/std": 22.718847274780273, "sampling/importance_sampling_ratio/max": 1.353162169456482, "sampling/importance_sampling_ratio/mean": 0.5644866228103638, "sampling/importance_sampling_ratio/min": 9.715652140585007e-07, "sampling/sampling_logp_difference/max": 1.7080813646316528, "sampling/sampling_logp_difference/mean": 0.34415340423583984, "step": 519, "step_time": 28.072190154000054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3347459733486176, "epoch": 0.0052, "grad_norm": 0.11245807260274887, "kl": 0.13711841963231564, "learning_rate": 7.999978321100757e-06, "loss": -0.1037, "step": 520, "step_time": 5.872004879000087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.21875, "completions/mean_terminated_length": 6.4375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.539493888616562, "epoch": 0.00521, "frac_reward_zero_std": 0.0, "grad_norm": 0.13637486100196838, "kl": 0.10225660353899002, "learning_rate": 7.999978231426094e-06, "loss": -0.0809, "num_tokens": 15700851.0, "reward": -29.82391357421875, "reward_std": 22.936473846435547, "rewards/rollout_reward_func/mean": -29.82391357421875, "rewards/rollout_reward_func/std": 22.936473846435547, "sampling/importance_sampling_ratio/max": 1.2923505306243896, "sampling/importance_sampling_ratio/mean": 0.4065125286579132, "sampling/importance_sampling_ratio/min": 8.27761414257111e-06, "sampling/sampling_logp_difference/max": 1.7098233699798584, "sampling/sampling_logp_difference/mean": 0.33771073818206787, "step": 521, "step_time": 25.2239331890014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5342701077461243, "epoch": 0.00522, "grad_norm": 0.1354452520608902, "kl": 0.10140529368072748, "learning_rate": 7.999978141566344e-06, "loss": -0.082, "step": 522, "step_time": 5.259847268000158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 6.117647171020508, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.8075463473796844, "epoch": 0.00523, "frac_reward_zero_std": 0.0, "grad_norm": 0.1340557485818863, "kl": 0.08409828506410122, "learning_rate": 7.999978051521509e-06, "loss": -0.1133, "num_tokens": 15777291.0, "reward": -26.391359329223633, "reward_std": 26.44964599609375, "rewards/rollout_reward_func/mean": -26.391359329223633, "rewards/rollout_reward_func/std": 26.44964599609375, "sampling/importance_sampling_ratio/max": 1.2205100059509277, "sampling/importance_sampling_ratio/mean": 0.46506309509277344, "sampling/importance_sampling_ratio/min": 1.705734007373394e-06, "sampling/sampling_logp_difference/max": 2.08674955368042, "sampling/sampling_logp_difference/mean": 0.40235939621925354, "step": 523, "step_time": 24.185476358001324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.802259087562561, "epoch": 0.00524, "grad_norm": 0.1305496096611023, "kl": 0.08550981618463993, "learning_rate": 7.999977961291588e-06, "loss": -0.1135, "step": 524, "step_time": 5.417971866001608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.03125, "completions/mean_terminated_length": 4.764706134796143, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.712225377559662, "epoch": 0.00525, "frac_reward_zero_std": 0.0, "grad_norm": 0.1920086294412613, "kl": 0.142098237760365, "learning_rate": 7.99997787087658e-06, "loss": -0.1041, "num_tokens": 15852066.0, "reward": -28.091453552246094, "reward_std": 23.433151245117188, "rewards/rollout_reward_func/mean": -28.091453552246094, "rewards/rollout_reward_func/std": 23.433151245117188, "sampling/importance_sampling_ratio/max": 1.345760464668274, "sampling/importance_sampling_ratio/mean": 0.4739055335521698, "sampling/importance_sampling_ratio/min": 3.3186862147260854e-09, "sampling/sampling_logp_difference/max": 2.2777891159057617, "sampling/sampling_logp_difference/mean": 0.44799360632896423, "step": 525, "step_time": 24.64388704599878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.713220626115799, "epoch": 0.00526, "grad_norm": 0.19358064234256744, "kl": 0.14576633367687464, "learning_rate": 7.999977780276485e-06, "loss": -0.1045, "step": 526, "step_time": 5.887201245002871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 9.75, "completions/mean_terminated_length": 4.235294342041016, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.177923172712326, "epoch": 0.00527, "frac_reward_zero_std": 0.0, "grad_norm": 0.10257299244403839, "kl": 0.17780345864593983, "learning_rate": 7.999977689491306e-06, "loss": -0.0894, "num_tokens": 15929833.0, "reward": -25.57181167602539, "reward_std": 21.57073211669922, "rewards/rollout_reward_func/mean": -25.57181167602539, "rewards/rollout_reward_func/std": 21.57073211669922, "sampling/importance_sampling_ratio/max": 1.277652382850647, "sampling/importance_sampling_ratio/mean": 0.4948100447654724, "sampling/importance_sampling_ratio/min": 3.36851881002076e-05, "sampling/sampling_logp_difference/max": 1.8235667943954468, "sampling/sampling_logp_difference/mean": 0.3696085512638092, "step": 527, "step_time": 24.936313184001847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1722015142440796, "epoch": 0.00528, "grad_norm": 0.10180305689573288, "kl": 0.1802537813782692, "learning_rate": 7.99997759852104e-06, "loss": -0.0895, "step": 528, "step_time": 5.986162035002053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 4.6315789222717285, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1999351382255554, "epoch": 0.00529, "frac_reward_zero_std": 0.0, "grad_norm": 0.14343413710594177, "kl": 0.16693082079291344, "learning_rate": 7.999977507365686e-06, "loss": -0.0643, "num_tokens": 16006039.0, "reward": -23.069826126098633, "reward_std": 22.665613174438477, "rewards/rollout_reward_func/mean": -23.069826126098633, "rewards/rollout_reward_func/std": 22.665611267089844, "sampling/importance_sampling_ratio/max": 1.4287937879562378, "sampling/importance_sampling_ratio/mean": 0.531559944152832, "sampling/importance_sampling_ratio/min": 1.1222149623790756e-05, "sampling/sampling_logp_difference/max": 1.6697955131530762, "sampling/sampling_logp_difference/mean": 0.31233417987823486, "step": 529, "step_time": 26.299894438001502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1944556534290314, "epoch": 0.0053, "grad_norm": 0.1480443924665451, "kl": 0.17201079428195953, "learning_rate": 7.999977416025248e-06, "loss": -0.0643, "step": 530, "step_time": 6.379406658001244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 4.090909004211426, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0629856884479523, "epoch": 0.00531, "frac_reward_zero_std": 0.0, "grad_norm": 0.17903093993663788, "kl": 0.18685079738497734, "learning_rate": 7.999977324499723e-06, "loss": -0.0966, "num_tokens": 16085273.0, "reward": -15.724159240722656, "reward_std": 21.772390365600586, "rewards/rollout_reward_func/mean": -15.724159240722656, "rewards/rollout_reward_func/std": 21.772390365600586, "sampling/importance_sampling_ratio/max": 1.4325486421585083, "sampling/importance_sampling_ratio/mean": 0.696200966835022, "sampling/importance_sampling_ratio/min": 1.4780019228055608e-05, "sampling/sampling_logp_difference/max": 1.5068309307098389, "sampling/sampling_logp_difference/mean": 0.3233673572540283, "step": 531, "step_time": 27.715813098999206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.060730278491974, "epoch": 0.00532, "grad_norm": 0.18986189365386963, "kl": 0.18512185662984848, "learning_rate": 7.999977232789113e-06, "loss": -0.0972, "step": 532, "step_time": 5.559234422998998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.21875, "completions/mean_terminated_length": 5.150000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1554220616817474, "epoch": 0.00533, "frac_reward_zero_std": 0.0, "grad_norm": 0.12542110681533813, "kl": 0.118995176628232, "learning_rate": 7.999977140893417e-06, "loss": -0.0976, "num_tokens": 16165378.0, "reward": -22.818309783935547, "reward_std": 24.463298797607422, "rewards/rollout_reward_func/mean": -22.818309783935547, "rewards/rollout_reward_func/std": 24.463298797607422, "sampling/importance_sampling_ratio/max": 1.4122623205184937, "sampling/importance_sampling_ratio/mean": 0.5970444679260254, "sampling/importance_sampling_ratio/min": 2.133140696969349e-05, "sampling/sampling_logp_difference/max": 1.8614797592163086, "sampling/sampling_logp_difference/mean": 0.33888357877731323, "step": 533, "step_time": 26.100097606999043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1557550132274628, "epoch": 0.00534, "grad_norm": 0.12954452633857727, "kl": 0.11946611292660236, "learning_rate": 7.999977048812635e-06, "loss": -0.0979, "step": 534, "step_time": 5.645127452000452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.78125, "completions/mean_terminated_length": 4.94444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.410335600376129, "epoch": 0.00535, "frac_reward_zero_std": 0.0, "grad_norm": 0.09916669130325317, "kl": 0.11003082618117332, "learning_rate": 7.999976956546766e-06, "loss": -0.0898, "num_tokens": 16242370.0, "reward": -24.71609115600586, "reward_std": 22.237680435180664, "rewards/rollout_reward_func/mean": -24.71609115600586, "rewards/rollout_reward_func/std": 22.237680435180664, "sampling/importance_sampling_ratio/max": 1.6109317541122437, "sampling/importance_sampling_ratio/mean": 0.5057700872421265, "sampling/importance_sampling_ratio/min": 8.664116649015341e-06, "sampling/sampling_logp_difference/max": 1.892593264579773, "sampling/sampling_logp_difference/mean": 0.3593211770057678, "step": 535, "step_time": 26.126600351997695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4075496792793274, "epoch": 0.00536, "grad_norm": 0.09969095885753632, "kl": 0.1106190700083971, "learning_rate": 7.99997686409581e-06, "loss": -0.09, "step": 536, "step_time": 5.999921808999716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.564828038215637, "epoch": 0.00537, "frac_reward_zero_std": 0.0, "grad_norm": 0.149555504322052, "kl": 0.11902590282261372, "learning_rate": 7.99997677145977e-06, "loss": -0.0935, "num_tokens": 16320591.0, "reward": -29.53679084777832, "reward_std": 19.17433738708496, "rewards/rollout_reward_func/mean": -29.53679084777832, "rewards/rollout_reward_func/std": 19.17433738708496, "sampling/importance_sampling_ratio/max": 1.2939295768737793, "sampling/importance_sampling_ratio/mean": 0.4705699682235718, "sampling/importance_sampling_ratio/min": 2.8116533545841094e-09, "sampling/sampling_logp_difference/max": 2.0620131492614746, "sampling/sampling_logp_difference/mean": 0.44038134813308716, "step": 537, "step_time": 25.291952167001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.55855530500412, "epoch": 0.00538, "grad_norm": 0.14716777205467224, "kl": 0.1209065392613411, "learning_rate": 7.999976678638642e-06, "loss": -0.0939, "step": 538, "step_time": 5.99558506700123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 10.46875, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.596274435520172, "epoch": 0.00539, "frac_reward_zero_std": 0.0, "grad_norm": 0.11741464585065842, "kl": 0.08289640676230192, "learning_rate": 7.99997658563243e-06, "loss": -0.0915, "num_tokens": 16397854.0, "reward": -26.11037826538086, "reward_std": 23.741037368774414, "rewards/rollout_reward_func/mean": -26.11037826538086, "rewards/rollout_reward_func/std": 23.741037368774414, "sampling/importance_sampling_ratio/max": 1.5399551391601562, "sampling/importance_sampling_ratio/mean": 0.4548865556716919, "sampling/importance_sampling_ratio/min": 2.5769411877263337e-05, "sampling/sampling_logp_difference/max": 1.4729382991790771, "sampling/sampling_logp_difference/mean": 0.37351176142692566, "step": 539, "step_time": 25.90506659800303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5909209847450256, "epoch": 0.0054, "grad_norm": 0.11758916825056076, "kl": 0.08427499141544104, "learning_rate": 7.999976492441131e-06, "loss": -0.0919, "step": 540, "step_time": 5.946117338999102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.5, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5553722977638245, "epoch": 0.00541, "frac_reward_zero_std": 0.0, "grad_norm": 0.14876726269721985, "kl": 0.23285307921469212, "learning_rate": 7.999976399064746e-06, "loss": -0.0717, "num_tokens": 16471515.0, "reward": -28.362049102783203, "reward_std": 21.743778228759766, "rewards/rollout_reward_func/mean": -28.362049102783203, "rewards/rollout_reward_func/std": 21.743776321411133, "sampling/importance_sampling_ratio/max": 1.1934047937393188, "sampling/importance_sampling_ratio/mean": 0.42544299364089966, "sampling/importance_sampling_ratio/min": 1.3440967228106615e-09, "sampling/sampling_logp_difference/max": 2.4079339504241943, "sampling/sampling_logp_difference/mean": 0.42236191034317017, "step": 541, "step_time": 25.230676576002224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5553473830223083, "epoch": 0.00542, "grad_norm": 0.15139101445674896, "kl": 0.22035399824380875, "learning_rate": 7.999976305503275e-06, "loss": -0.0723, "step": 542, "step_time": 5.193109043000732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7188656628131866, "epoch": 0.00543, "frac_reward_zero_std": 0.0, "grad_norm": 0.1864631325006485, "kl": 0.18687132745981216, "learning_rate": 7.999976211756718e-06, "loss": -0.0853, "num_tokens": 16550622.0, "reward": -18.575424194335938, "reward_std": 23.492177963256836, "rewards/rollout_reward_func/mean": -18.575424194335938, "rewards/rollout_reward_func/std": 23.492177963256836, "sampling/importance_sampling_ratio/max": 1.4858733415603638, "sampling/importance_sampling_ratio/mean": 0.6830133199691772, "sampling/importance_sampling_ratio/min": 2.2834527044324204e-05, "sampling/sampling_logp_difference/max": 1.640305757522583, "sampling/sampling_logp_difference/mean": 0.28623637557029724, "step": 543, "step_time": 27.331112581001435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.719136267900467, "epoch": 0.00544, "grad_norm": 0.1889543980360031, "kl": 0.18649466149508953, "learning_rate": 7.999976117825075e-06, "loss": -0.0863, "step": 544, "step_time": 5.52142000999811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 9.78125, "completions/mean_terminated_length": 4.294117450714111, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5197007060050964, "epoch": 0.00545, "frac_reward_zero_std": 0.0, "grad_norm": 0.17202720046043396, "kl": 0.206658024340868, "learning_rate": 7.999976023708346e-06, "loss": -0.1044, "num_tokens": 16631301.0, "reward": -24.369531631469727, "reward_std": 24.52963638305664, "rewards/rollout_reward_func/mean": -24.369531631469727, "rewards/rollout_reward_func/std": 24.529638290405273, "sampling/importance_sampling_ratio/max": 1.4525433778762817, "sampling/importance_sampling_ratio/mean": 0.5371415019035339, "sampling/importance_sampling_ratio/min": 3.608126020182567e-10, "sampling/sampling_logp_difference/max": 2.2147152423858643, "sampling/sampling_logp_difference/mean": 0.43296611309051514, "step": 545, "step_time": 26.721156801997495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5198997855186462, "epoch": 0.00546, "grad_norm": 0.17779448628425598, "kl": 0.2126951776444912, "learning_rate": 7.999975929406532e-06, "loss": -0.1045, "step": 546, "step_time": 6.168747163997978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 4.1052632331848145, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1209547221660614, "epoch": 0.00547, "frac_reward_zero_std": 0.0, "grad_norm": 0.2003205567598343, "kl": 0.17905844748020172, "learning_rate": 7.99997583491963e-06, "loss": -0.0913, "num_tokens": 16707841.0, "reward": -22.308826446533203, "reward_std": 23.705646514892578, "rewards/rollout_reward_func/mean": -22.308826446533203, "rewards/rollout_reward_func/std": 23.705646514892578, "sampling/importance_sampling_ratio/max": 1.3140183687210083, "sampling/importance_sampling_ratio/mean": 0.5301704406738281, "sampling/importance_sampling_ratio/min": 6.138191565696616e-06, "sampling/sampling_logp_difference/max": 2.4566445350646973, "sampling/sampling_logp_difference/mean": 0.3771078586578369, "step": 547, "step_time": 26.55518393499915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.117727428674698, "epoch": 0.00548, "grad_norm": 0.19885067641735077, "kl": 0.18897400423884392, "learning_rate": 7.999975740247644e-06, "loss": -0.0923, "step": 548, "step_time": 5.984130312999696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4883806705474854, "epoch": 0.00549, "frac_reward_zero_std": 0.0, "grad_norm": 0.11974375694990158, "kl": 0.0860560741275549, "learning_rate": 7.99997564539057e-06, "loss": -0.09, "num_tokens": 16782688.0, "reward": -23.385263442993164, "reward_std": 23.144140243530273, "rewards/rollout_reward_func/mean": -23.385263442993164, "rewards/rollout_reward_func/std": 23.144140243530273, "sampling/importance_sampling_ratio/max": 1.2790801525115967, "sampling/importance_sampling_ratio/mean": 0.5123468637466431, "sampling/importance_sampling_ratio/min": 1.4810933635089896e-07, "sampling/sampling_logp_difference/max": 1.587803840637207, "sampling/sampling_logp_difference/mean": 0.3471105396747589, "step": 549, "step_time": 25.522917169997527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.479794889688492, "epoch": 0.0055, "grad_norm": 0.11316294968128204, "kl": 0.08956722542643547, "learning_rate": 7.999975550348412e-06, "loss": -0.0905, "step": 550, "step_time": 6.2516953989998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.59375, "completions/mean_terminated_length": 5.210526466369629, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1581971347332, "epoch": 0.00551, "frac_reward_zero_std": 0.0, "grad_norm": 0.10277154296636581, "kl": 0.14161051623523235, "learning_rate": 7.999975455121166e-06, "loss": -0.0945, "num_tokens": 16860261.0, "reward": -23.921010971069336, "reward_std": 25.414583206176758, "rewards/rollout_reward_func/mean": -23.921010971069336, "rewards/rollout_reward_func/std": 25.414583206176758, "sampling/importance_sampling_ratio/max": 1.7644093036651611, "sampling/importance_sampling_ratio/mean": 0.5791536569595337, "sampling/importance_sampling_ratio/min": 2.209894319094019e-06, "sampling/sampling_logp_difference/max": 1.9209437370300293, "sampling/sampling_logp_difference/mean": 0.3401491045951843, "step": 551, "step_time": 25.63482321599986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.151269018650055, "epoch": 0.00552, "grad_norm": 0.10118355602025986, "kl": 0.14594027027487755, "learning_rate": 7.999975359708836e-06, "loss": -0.0949, "step": 552, "step_time": 5.543325546001142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.625, "completions/mean_terminated_length": 5.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6372366547584534, "epoch": 0.00553, "frac_reward_zero_std": 0.0, "grad_norm": 0.1308639645576477, "kl": 0.6785032842308283, "learning_rate": 7.99997526411142e-06, "loss": -0.1008, "num_tokens": 16932872.0, "reward": -31.738248825073242, "reward_std": 23.152326583862305, "rewards/rollout_reward_func/mean": -31.738248825073242, "rewards/rollout_reward_func/std": 23.152328491210938, "sampling/importance_sampling_ratio/max": 1.5490413904190063, "sampling/importance_sampling_ratio/mean": 0.4374718964099884, "sampling/importance_sampling_ratio/min": 7.601446583294091e-09, "sampling/sampling_logp_difference/max": 2.659304618835449, "sampling/sampling_logp_difference/mean": 0.47898373007774353, "step": 553, "step_time": 23.8870557299997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.633584588766098, "epoch": 0.00554, "grad_norm": 0.12915655970573425, "kl": 0.6846471689641476, "learning_rate": 7.999975168328916e-06, "loss": -0.1004, "step": 554, "step_time": 5.312749688000622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.96875, "completions/mean_terminated_length": 4.157894611358643, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.38057279586792, "epoch": 0.00555, "frac_reward_zero_std": 0.0, "grad_norm": 0.7887076139450073, "kl": 2.422310523688793, "learning_rate": 7.999975072361328e-06, "loss": -0.0868, "num_tokens": 17005546.0, "reward": -23.8175048828125, "reward_std": 25.815265655517578, "rewards/rollout_reward_func/mean": -23.8175048828125, "rewards/rollout_reward_func/std": 25.815265655517578, "sampling/importance_sampling_ratio/max": 1.6269035339355469, "sampling/importance_sampling_ratio/mean": 0.5266069769859314, "sampling/importance_sampling_ratio/min": 1.238702918016088e-09, "sampling/sampling_logp_difference/max": 4.103202819824219, "sampling/sampling_logp_difference/mean": 0.4228965640068054, "step": 555, "step_time": 25.177169690001392 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 2.3888664841651917, "epoch": 0.00556, "grad_norm": 0.14841076731681824, "kl": 0.48591240495443344, "learning_rate": 7.999974976208653e-06, "loss": -0.0911, "step": 556, "step_time": 5.823648911000419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2230101227760315, "epoch": 0.00557, "frac_reward_zero_std": 0.0, "grad_norm": 0.11699850857257843, "kl": 0.2928675375878811, "learning_rate": 7.999974879870894e-06, "loss": -0.0834, "num_tokens": 17081155.0, "reward": -24.446109771728516, "reward_std": 24.569332122802734, "rewards/rollout_reward_func/mean": -24.446109771728516, "rewards/rollout_reward_func/std": 24.569332122802734, "sampling/importance_sampling_ratio/max": 1.893079161643982, "sampling/importance_sampling_ratio/mean": 0.5585829615592957, "sampling/importance_sampling_ratio/min": 7.80284850065982e-08, "sampling/sampling_logp_difference/max": 2.2045459747314453, "sampling/sampling_logp_difference/mean": 0.40534162521362305, "step": 557, "step_time": 25.79702844699932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2312398850917816, "epoch": 0.00558, "grad_norm": 0.11182236671447754, "kl": 0.2764459550380707, "learning_rate": 7.999974783348047e-06, "loss": -0.0842, "step": 558, "step_time": 5.917503005000981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 4.695652484893799, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.086265444755554, "epoch": 0.00559, "frac_reward_zero_std": 0.0, "grad_norm": 0.1573217213153839, "kl": 0.1897767223417759, "learning_rate": 7.999974686640113e-06, "loss": -0.0945, "num_tokens": 17156934.0, "reward": -21.894916534423828, "reward_std": 24.624902725219727, "rewards/rollout_reward_func/mean": -21.894916534423828, "rewards/rollout_reward_func/std": 24.624900817871094, "sampling/importance_sampling_ratio/max": 1.435731053352356, "sampling/importance_sampling_ratio/mean": 0.6341923475265503, "sampling/importance_sampling_ratio/min": 4.213630745653063e-06, "sampling/sampling_logp_difference/max": 2.2639200687408447, "sampling/sampling_logp_difference/mean": 0.3393963575363159, "step": 559, "step_time": 26.132923932998892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0950700640678406, "epoch": 0.0056, "grad_norm": 0.15485930442810059, "kl": 0.19070490822196007, "learning_rate": 7.999974589747096e-06, "loss": -0.0946, "step": 560, "step_time": 5.659296226000151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.393818497657776, "epoch": 0.00561, "frac_reward_zero_std": 0.0, "grad_norm": 0.13047485053539276, "kl": 0.1775109488517046, "learning_rate": 7.99997449266899e-06, "loss": -0.1007, "num_tokens": 17238157.0, "reward": -19.410118103027344, "reward_std": 22.30792999267578, "rewards/rollout_reward_func/mean": -19.410118103027344, "rewards/rollout_reward_func/std": 22.30792808532715, "sampling/importance_sampling_ratio/max": 1.6245144605636597, "sampling/importance_sampling_ratio/mean": 0.6020606756210327, "sampling/importance_sampling_ratio/min": 8.089742209449469e-08, "sampling/sampling_logp_difference/max": 2.3918590545654297, "sampling/sampling_logp_difference/mean": 0.37629812955856323, "step": 561, "step_time": 26.348125154998343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.396536946296692, "epoch": 0.00562, "grad_norm": 0.12840530276298523, "kl": 0.1808551400899887, "learning_rate": 7.999974395405802e-06, "loss": -0.1006, "step": 562, "step_time": 5.464785603000564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.75, "completions/mean_terminated_length": 4.952381134033203, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2933159470558167, "epoch": 0.00563, "frac_reward_zero_std": 0.0, "grad_norm": 0.12365119159221649, "kl": 0.1734551638364792, "learning_rate": 7.999974297957524e-06, "loss": -0.0994, "num_tokens": 17312882.0, "reward": -24.258901596069336, "reward_std": 22.89052963256836, "rewards/rollout_reward_func/mean": -24.258901596069336, "rewards/rollout_reward_func/std": 22.89052963256836, "sampling/importance_sampling_ratio/max": 1.470572590827942, "sampling/importance_sampling_ratio/mean": 0.5879641771316528, "sampling/importance_sampling_ratio/min": 2.6937934762827354e-06, "sampling/sampling_logp_difference/max": 2.044806480407715, "sampling/sampling_logp_difference/mean": 0.3606460392475128, "step": 563, "step_time": 26.097393684001872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2982912063598633, "epoch": 0.00564, "grad_norm": 0.12343523651361465, "kl": 0.17507510259747505, "learning_rate": 7.999974200324165e-06, "loss": -0.099, "step": 564, "step_time": 5.489015259001462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.90625, "completions/mean_terminated_length": 4.529411792755127, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.63772714138031, "epoch": 0.00565, "frac_reward_zero_std": 0.0, "grad_norm": 0.14676235616207123, "kl": 0.19336039945483208, "learning_rate": 7.999974102505715e-06, "loss": -0.0903, "num_tokens": 17389168.0, "reward": -24.060649871826172, "reward_std": 25.412322998046875, "rewards/rollout_reward_func/mean": -24.060649871826172, "rewards/rollout_reward_func/std": 25.412322998046875, "sampling/importance_sampling_ratio/max": 1.4654171466827393, "sampling/importance_sampling_ratio/mean": 0.47684159874916077, "sampling/importance_sampling_ratio/min": 2.188171066563882e-07, "sampling/sampling_logp_difference/max": 2.7256486415863037, "sampling/sampling_logp_difference/mean": 0.4763616919517517, "step": 565, "step_time": 25.894432557000982 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.6437286734580994, "epoch": 0.00566, "grad_norm": 0.13761775195598602, "kl": 0.17301850859075785, "learning_rate": 7.999974004502184e-06, "loss": -0.0909, "step": 566, "step_time": 5.869971421998343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 5.142857074737549, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.260606348514557, "epoch": 0.00567, "frac_reward_zero_std": 0.0, "grad_norm": 0.1703071892261505, "kl": 0.17955193854868412, "learning_rate": 7.999973906313564e-06, "loss": -0.1136, "num_tokens": 17467462.0, "reward": -19.87325668334961, "reward_std": 27.394861221313477, "rewards/rollout_reward_func/mean": -19.87325668334961, "rewards/rollout_reward_func/std": 27.394861221313477, "sampling/importance_sampling_ratio/max": 1.4643863439559937, "sampling/importance_sampling_ratio/mean": 0.6017451882362366, "sampling/importance_sampling_ratio/min": 2.6214139747793297e-09, "sampling/sampling_logp_difference/max": 2.3127670288085938, "sampling/sampling_logp_difference/mean": 0.38744473457336426, "step": 567, "step_time": 25.152780082999016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2624127864837646, "epoch": 0.00568, "grad_norm": 0.17361600697040558, "kl": 0.18220999650657177, "learning_rate": 7.999973807939857e-06, "loss": -0.114, "step": 568, "step_time": 5.887957605002157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 4.052631378173828, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.125989109277725, "epoch": 0.00569, "frac_reward_zero_std": 0.0, "grad_norm": 0.09247458726167679, "kl": 0.374212846159935, "learning_rate": 7.999973709381066e-06, "loss": -0.0764, "num_tokens": 17543349.0, "reward": -23.919689178466797, "reward_std": 25.054183959960938, "rewards/rollout_reward_func/mean": -23.919689178466797, "rewards/rollout_reward_func/std": 25.054183959960938, "sampling/importance_sampling_ratio/max": 1.4065347909927368, "sampling/importance_sampling_ratio/mean": 0.5184735655784607, "sampling/importance_sampling_ratio/min": 8.678435392539541e-07, "sampling/sampling_logp_difference/max": 2.3099756240844727, "sampling/sampling_logp_difference/mean": 0.3641379773616791, "step": 569, "step_time": 27.232033780001075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1289976835250854, "epoch": 0.0057, "grad_norm": 0.09918776899576187, "kl": 0.38661376386880875, "learning_rate": 7.99997361063719e-06, "loss": -0.0766, "step": 570, "step_time": 6.140815748000023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2988617718219757, "epoch": 0.00571, "frac_reward_zero_std": 0.0, "grad_norm": 0.12645134329795837, "kl": 0.1821651253849268, "learning_rate": 7.999973511708227e-06, "loss": -0.1043, "num_tokens": 17618787.0, "reward": -21.023557662963867, "reward_std": 23.802715301513672, "rewards/rollout_reward_func/mean": -21.023557662963867, "rewards/rollout_reward_func/std": 23.802715301513672, "sampling/importance_sampling_ratio/max": 1.370056390762329, "sampling/importance_sampling_ratio/mean": 0.5902690887451172, "sampling/importance_sampling_ratio/min": 4.607185928762192e-06, "sampling/sampling_logp_difference/max": 2.177809715270996, "sampling/sampling_logp_difference/mean": 0.35919201374053955, "step": 571, "step_time": 27.021855582999706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3022541999816895, "epoch": 0.00572, "grad_norm": 0.12754033505916595, "kl": 0.18168794736266136, "learning_rate": 7.999973412594178e-06, "loss": -0.1045, "step": 572, "step_time": 5.463401461000103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0471481680870056, "epoch": 0.00573, "frac_reward_zero_std": 0.0, "grad_norm": 0.10905621200799942, "kl": 0.23928244784474373, "learning_rate": 7.999973313295043e-06, "loss": -0.1127, "num_tokens": 17696554.0, "reward": -21.13314437866211, "reward_std": 24.96681022644043, "rewards/rollout_reward_func/mean": -21.13314437866211, "rewards/rollout_reward_func/std": 24.966808319091797, "sampling/importance_sampling_ratio/max": 1.5044020414352417, "sampling/importance_sampling_ratio/mean": 0.6336081027984619, "sampling/importance_sampling_ratio/min": 0.00011806995462393388, "sampling/sampling_logp_difference/max": 1.726633071899414, "sampling/sampling_logp_difference/mean": 0.3679848611354828, "step": 573, "step_time": 25.912712296998507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0401797890663147, "epoch": 0.00574, "grad_norm": 0.10611597448587418, "kl": 0.23884256184101105, "learning_rate": 7.999973213810824e-06, "loss": -0.1133, "step": 574, "step_time": 5.50517316100013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.786251187324524, "epoch": 0.00575, "frac_reward_zero_std": 0.0, "grad_norm": 0.14092141389846802, "kl": 0.18202773481607437, "learning_rate": 7.999973114141517e-06, "loss": -0.1028, "num_tokens": 17774172.0, "reward": -23.814910888671875, "reward_std": 24.0299129486084, "rewards/rollout_reward_func/mean": -23.814910888671875, "rewards/rollout_reward_func/std": 24.029911041259766, "sampling/importance_sampling_ratio/max": 1.5574904680252075, "sampling/importance_sampling_ratio/mean": 0.5569055676460266, "sampling/importance_sampling_ratio/min": 7.03115121680753e-10, "sampling/sampling_logp_difference/max": 2.405724287033081, "sampling/sampling_logp_difference/mean": 0.5291261672973633, "step": 575, "step_time": 27.175702803000604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.776757061481476, "epoch": 0.00576, "grad_norm": 0.1369185745716095, "kl": 0.18965675868093967, "learning_rate": 7.999973014287126e-06, "loss": -0.103, "step": 576, "step_time": 5.468430052000258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.84375, "completions/mean_terminated_length": 3.9473683834075928, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3678009808063507, "epoch": 0.00577, "frac_reward_zero_std": 0.0, "grad_norm": 0.11175768822431564, "kl": 0.22026219218969345, "learning_rate": 7.999972914247647e-06, "loss": -0.0943, "num_tokens": 17852161.0, "reward": -23.475971221923828, "reward_std": 24.473546981811523, "rewards/rollout_reward_func/mean": -23.475971221923828, "rewards/rollout_reward_func/std": 24.47354507446289, "sampling/importance_sampling_ratio/max": 1.506197452545166, "sampling/importance_sampling_ratio/mean": 0.5480304956436157, "sampling/importance_sampling_ratio/min": 8.6040694213807e-07, "sampling/sampling_logp_difference/max": 1.7142177820205688, "sampling/sampling_logp_difference/mean": 0.3804410696029663, "step": 577, "step_time": 26.594357046998994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.358855366706848, "epoch": 0.00578, "grad_norm": 0.1122027188539505, "kl": 0.2264053300023079, "learning_rate": 7.999972814023084e-06, "loss": -0.0946, "step": 578, "step_time": 6.078791412999635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.58358371257782, "epoch": 0.00579, "frac_reward_zero_std": 0.0, "grad_norm": 0.11672450602054596, "kl": 0.1653351727873087, "learning_rate": 7.999972713613435e-06, "loss": -0.1103, "num_tokens": 17923970.0, "reward": -27.125381469726562, "reward_std": 24.8447265625, "rewards/rollout_reward_func/mean": -27.125381469726562, "rewards/rollout_reward_func/std": 24.8447265625, "sampling/importance_sampling_ratio/max": 1.4622597694396973, "sampling/importance_sampling_ratio/mean": 0.5161304473876953, "sampling/importance_sampling_ratio/min": 9.322626937091627e-08, "sampling/sampling_logp_difference/max": 2.0054640769958496, "sampling/sampling_logp_difference/mean": 0.43708252906799316, "step": 579, "step_time": 24.30891622299896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5785827338695526, "epoch": 0.0058, "grad_norm": 0.11850117892026901, "kl": 0.168983556330204, "learning_rate": 7.9999726130187e-06, "loss": -0.1103, "step": 580, "step_time": 5.963120758000514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 4.636363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9535876512527466, "epoch": 0.00581, "frac_reward_zero_std": 0.0, "grad_norm": 0.12539921700954437, "kl": 0.22375567257404327, "learning_rate": 7.999972512238878e-06, "loss": -0.0752, "num_tokens": 17998151.0, "reward": -23.153602600097656, "reward_std": 19.973478317260742, "rewards/rollout_reward_func/mean": -23.153602600097656, "rewards/rollout_reward_func/std": 19.973478317260742, "sampling/importance_sampling_ratio/max": 1.5462632179260254, "sampling/importance_sampling_ratio/mean": 0.6262373924255371, "sampling/importance_sampling_ratio/min": 0.00015578190505038947, "sampling/sampling_logp_difference/max": 1.8694097995758057, "sampling/sampling_logp_difference/mean": 0.3124827444553375, "step": 581, "step_time": 25.432894482999473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9456985294818878, "epoch": 0.00582, "grad_norm": 0.12456416338682175, "kl": 0.23081500828266144, "learning_rate": 7.999972411273972e-06, "loss": -0.0749, "step": 582, "step_time": 5.376379878003718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.038461685180664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2109449058771133, "epoch": 0.00583, "frac_reward_zero_std": 0.0, "grad_norm": 0.13830474019050598, "kl": 0.4095449782907963, "learning_rate": 7.99997231012398e-06, "loss": -0.0593, "num_tokens": 18077132.0, "reward": -15.430373191833496, "reward_std": 21.15486717224121, "rewards/rollout_reward_func/mean": -15.430373191833496, "rewards/rollout_reward_func/std": 21.15486717224121, "sampling/importance_sampling_ratio/max": 1.7134652137756348, "sampling/importance_sampling_ratio/mean": 0.7353492975234985, "sampling/importance_sampling_ratio/min": 0.00023062188120093197, "sampling/sampling_logp_difference/max": 1.3909317255020142, "sampling/sampling_logp_difference/mean": 0.23414723575115204, "step": 583, "step_time": 28.151859164001507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.205574169754982, "epoch": 0.00584, "grad_norm": 0.13755807280540466, "kl": 0.4149189367890358, "learning_rate": 7.9999722087889e-06, "loss": -0.0586, "step": 584, "step_time": 5.621081900997524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 4.590909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7943709641695023, "epoch": 0.00585, "frac_reward_zero_std": 0.0, "grad_norm": 0.13655835390090942, "kl": 0.2862204946577549, "learning_rate": 7.999972107268737e-06, "loss": -0.0944, "num_tokens": 18153859.0, "reward": -21.870588302612305, "reward_std": 22.411039352416992, "rewards/rollout_reward_func/mean": -21.870588302612305, "rewards/rollout_reward_func/std": 22.41103744506836, "sampling/importance_sampling_ratio/max": 1.4834450483322144, "sampling/importance_sampling_ratio/mean": 0.6537636518478394, "sampling/importance_sampling_ratio/min": 9.615116869099438e-05, "sampling/sampling_logp_difference/max": 1.8128725290298462, "sampling/sampling_logp_difference/mean": 0.30287784337997437, "step": 585, "step_time": 27.81156369600103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.792768657207489, "epoch": 0.00586, "grad_norm": 0.13573049008846283, "kl": 0.28760519437491894, "learning_rate": 7.999972005563487e-06, "loss": -0.0948, "step": 586, "step_time": 5.669932351998796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.03125, "completions/mean_terminated_length": 4.263157844543457, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.180759847164154, "epoch": 0.00587, "frac_reward_zero_std": 0.0, "grad_norm": 0.20300599932670593, "kl": 0.5547974836081266, "learning_rate": 7.999971903673153e-06, "loss": -0.111, "num_tokens": 18232652.0, "reward": -20.879905700683594, "reward_std": 23.751867294311523, "rewards/rollout_reward_func/mean": -20.879905700683594, "rewards/rollout_reward_func/std": 23.751869201660156, "sampling/importance_sampling_ratio/max": 1.4913192987442017, "sampling/importance_sampling_ratio/mean": 0.578145444393158, "sampling/importance_sampling_ratio/min": 7.098648893588688e-06, "sampling/sampling_logp_difference/max": 2.1677134037017822, "sampling/sampling_logp_difference/mean": 0.38407206535339355, "step": 587, "step_time": 27.053328535997935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.179781772196293, "epoch": 0.00588, "grad_norm": 0.21417298913002014, "kl": 0.5299619603902102, "learning_rate": 7.999971801597731e-06, "loss": -0.1115, "step": 588, "step_time": 5.970896502996766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 4.950000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2416447401046753, "epoch": 0.00589, "frac_reward_zero_std": 0.0, "grad_norm": 0.14749601483345032, "kl": 0.26688723266124725, "learning_rate": 7.999971699337224e-06, "loss": -0.1044, "num_tokens": 18308831.0, "reward": -21.920154571533203, "reward_std": 23.471553802490234, "rewards/rollout_reward_func/mean": -21.920154571533203, "rewards/rollout_reward_func/std": 23.471553802490234, "sampling/importance_sampling_ratio/max": 1.6428704261779785, "sampling/importance_sampling_ratio/mean": 0.5615371465682983, "sampling/importance_sampling_ratio/min": 1.205684293381637e-05, "sampling/sampling_logp_difference/max": 2.0172781944274902, "sampling/sampling_logp_difference/mean": 0.4019530415534973, "step": 589, "step_time": 24.293164022999918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.239801049232483, "epoch": 0.0059, "grad_norm": 0.14883780479431152, "kl": 0.27003736793994904, "learning_rate": 7.99997159689163e-06, "loss": -0.1049, "step": 590, "step_time": 5.3813139560006675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 4.61904764175415, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1231954097747803, "epoch": 0.00591, "frac_reward_zero_std": 0.0, "grad_norm": 0.19544187188148499, "kl": 0.6471785232424736, "learning_rate": 7.999971494260952e-06, "loss": -0.0864, "num_tokens": 18387556.0, "reward": -21.337356567382812, "reward_std": 25.030338287353516, "rewards/rollout_reward_func/mean": -21.337356567382812, "rewards/rollout_reward_func/std": 25.030336380004883, "sampling/importance_sampling_ratio/max": 1.6527868509292603, "sampling/importance_sampling_ratio/mean": 0.546350359916687, "sampling/importance_sampling_ratio/min": 6.293463883366712e-08, "sampling/sampling_logp_difference/max": 2.1195712089538574, "sampling/sampling_logp_difference/mean": 0.4013412296772003, "step": 591, "step_time": 27.126917532001244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.125939041376114, "epoch": 0.00592, "grad_norm": 0.19423435628414154, "kl": 0.6220084875822067, "learning_rate": 7.999971391445188e-06, "loss": -0.087, "step": 592, "step_time": 5.485525037000116 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 4.727272987365723, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.208116888999939, "epoch": 0.00593, "frac_reward_zero_std": 0.0, "grad_norm": 0.12936751544475555, "kl": 0.3108297921717167, "learning_rate": 7.999971288444338e-06, "loss": -0.1095, "num_tokens": 18466065.0, "reward": -18.232425689697266, "reward_std": 27.439237594604492, "rewards/rollout_reward_func/mean": -18.232425689697266, "rewards/rollout_reward_func/std": 27.439237594604492, "sampling/importance_sampling_ratio/max": 1.4490940570831299, "sampling/importance_sampling_ratio/mean": 0.7045608758926392, "sampling/importance_sampling_ratio/min": 7.512747401960951e-08, "sampling/sampling_logp_difference/max": 2.0205726623535156, "sampling/sampling_logp_difference/mean": 0.43482786417007446, "step": 593, "step_time": 27.502782824998576 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.2061642855405807, "epoch": 0.00594, "grad_norm": 0.1300436109304428, "kl": 0.3087522853165865, "learning_rate": 7.999971185258403e-06, "loss": -0.1097, "step": 594, "step_time": 5.937330462000318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 9.40625, "completions/mean_terminated_length": 4.277777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.336544156074524, "epoch": 0.00595, "frac_reward_zero_std": 0.0, "grad_norm": 0.19334706664085388, "kl": 0.19409929402172565, "learning_rate": 7.999971081887381e-06, "loss": -0.1289, "num_tokens": 18540141.0, "reward": -22.159080505371094, "reward_std": 23.838634490966797, "rewards/rollout_reward_func/mean": -22.159080505371094, "rewards/rollout_reward_func/std": 23.83863067626953, "sampling/importance_sampling_ratio/max": 2.0775952339172363, "sampling/importance_sampling_ratio/mean": 0.6480960845947266, "sampling/importance_sampling_ratio/min": 1.4133393051452003e-05, "sampling/sampling_logp_difference/max": 1.802141547203064, "sampling/sampling_logp_difference/mean": 0.3899206519126892, "step": 595, "step_time": 26.286742845000845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3293997943401337, "epoch": 0.00596, "grad_norm": 0.19617435336112976, "kl": 0.2047174833714962, "learning_rate": 7.999970978331275e-06, "loss": -0.1304, "step": 596, "step_time": 5.446321172001262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.71875, "completions/mean_terminated_length": 5.409090995788574, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3113940060138702, "epoch": 0.00597, "frac_reward_zero_std": 0.0, "grad_norm": 0.1639123111963272, "kl": 0.2236039713025093, "learning_rate": 7.999970874590083e-06, "loss": -0.1056, "num_tokens": 18620004.0, "reward": -20.02342414855957, "reward_std": 22.338594436645508, "rewards/rollout_reward_func/mean": -20.02342414855957, "rewards/rollout_reward_func/std": 22.338594436645508, "sampling/importance_sampling_ratio/max": 1.8396531343460083, "sampling/importance_sampling_ratio/mean": 0.6813002824783325, "sampling/importance_sampling_ratio/min": 2.1849786207894795e-05, "sampling/sampling_logp_difference/max": 1.6509895324707031, "sampling/sampling_logp_difference/mean": 0.3470439910888672, "step": 597, "step_time": 26.151008957998783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2959282100200653, "epoch": 0.00598, "grad_norm": 0.1602659523487091, "kl": 0.232785327360034, "learning_rate": 7.999970770663804e-06, "loss": -0.1065, "step": 598, "step_time": 6.016140406001796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.53125, "completions/mean_terminated_length": 5.1052632331848145, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1565805971622467, "epoch": 0.00599, "frac_reward_zero_std": 0.0, "grad_norm": 0.1359053999185562, "kl": 0.25369945727288723, "learning_rate": 7.99997066655244e-06, "loss": -0.0836, "num_tokens": 18696507.0, "reward": -25.490812301635742, "reward_std": 22.902263641357422, "rewards/rollout_reward_func/mean": -25.490812301635742, "rewards/rollout_reward_func/std": 22.902263641357422, "sampling/importance_sampling_ratio/max": 1.9866900444030762, "sampling/importance_sampling_ratio/mean": 0.5602339506149292, "sampling/importance_sampling_ratio/min": 6.58435146760894e-06, "sampling/sampling_logp_difference/max": 1.6346445083618164, "sampling/sampling_logp_difference/mean": 0.34900152683258057, "step": 599, "step_time": 24.46177923799769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1444608569145203, "epoch": 0.006, "grad_norm": 0.13442958891391754, "kl": 0.27479805424809456, "learning_rate": 7.99997056225599e-06, "loss": -0.0841, "step": 600, "step_time": 5.566940081997018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6590802073478699, "epoch": 0.00601, "frac_reward_zero_std": 0.0, "grad_norm": 0.08932670950889587, "kl": 1.0743574239313602, "learning_rate": 7.999970457774456e-06, "loss": -0.0929, "num_tokens": 18778599.0, "reward": -14.994665145874023, "reward_std": 23.928831100463867, "rewards/rollout_reward_func/mean": -14.994665145874023, "rewards/rollout_reward_func/std": 23.928829193115234, "sampling/importance_sampling_ratio/max": 1.767046570777893, "sampling/importance_sampling_ratio/mean": 0.7523579597473145, "sampling/importance_sampling_ratio/min": 1.7879585811897414e-06, "sampling/sampling_logp_difference/max": 1.780167818069458, "sampling/sampling_logp_difference/mean": 0.31371408700942993, "step": 601, "step_time": 28.620637921003436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6501307487487793, "epoch": 0.00602, "grad_norm": 0.08801165968179703, "kl": 1.1177916042506695, "learning_rate": 7.999970353107835e-06, "loss": -0.0928, "step": 602, "step_time": 5.677702193999721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 3.965517282485962, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5431994721293449, "epoch": 0.00603, "frac_reward_zero_std": 0.0, "grad_norm": 0.40382033586502075, "kl": 2.7221352979540825, "learning_rate": 7.999970248256127e-06, "loss": -0.0419, "num_tokens": 18861668.0, "reward": -5.858057022094727, "reward_std": 20.057109832763672, "rewards/rollout_reward_func/mean": -5.858057022094727, "rewards/rollout_reward_func/std": 20.057109832763672, "sampling/importance_sampling_ratio/max": 1.6753346920013428, "sampling/importance_sampling_ratio/mean": 0.9546641111373901, "sampling/importance_sampling_ratio/min": 0.00020610842329915613, "sampling/sampling_logp_difference/max": 3.081942319869995, "sampling/sampling_logp_difference/mean": 0.19819939136505127, "step": 603, "step_time": 29.561643565999475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5427552461624146, "epoch": 0.00604, "grad_norm": 0.29388636350631714, "kl": 1.9963352531194687, "learning_rate": 7.999970143219335e-06, "loss": -0.0442, "step": 604, "step_time": 5.94890444000157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8507796823978424, "epoch": 0.00605, "frac_reward_zero_std": 0.0, "grad_norm": 0.11002406477928162, "kl": 0.6621173694729805, "learning_rate": 7.999970037997458e-06, "loss": -0.0855, "num_tokens": 18938270.0, "reward": -20.50128746032715, "reward_std": 22.56723403930664, "rewards/rollout_reward_func/mean": -20.50128746032715, "rewards/rollout_reward_func/std": 22.56723403930664, "sampling/importance_sampling_ratio/max": 1.3912091255187988, "sampling/importance_sampling_ratio/mean": 0.5335522890090942, "sampling/importance_sampling_ratio/min": 2.529214270907687e-06, "sampling/sampling_logp_difference/max": 2.2140798568725586, "sampling/sampling_logp_difference/mean": 0.34215396642684937, "step": 605, "step_time": 27.060080302999268 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.8561003506183624, "epoch": 0.00606, "grad_norm": 0.1083078384399414, "kl": 0.6422064080834389, "learning_rate": 7.999969932590495e-06, "loss": -0.0858, "step": 606, "step_time": 5.594133796999813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3864914625883102, "epoch": 0.00607, "frac_reward_zero_std": 0.0, "grad_norm": 0.2715384364128113, "kl": 1.2661868184804916, "learning_rate": 7.999969826998444e-06, "loss": -0.0497, "num_tokens": 19020825.0, "reward": -10.94578742980957, "reward_std": 20.7152042388916, "rewards/rollout_reward_func/mean": -10.94578742980957, "rewards/rollout_reward_func/std": 20.71520233154297, "sampling/importance_sampling_ratio/max": 1.789035439491272, "sampling/importance_sampling_ratio/mean": 0.703238844871521, "sampling/importance_sampling_ratio/min": 2.9801296363984875e-07, "sampling/sampling_logp_difference/max": 2.2510130405426025, "sampling/sampling_logp_difference/mean": 0.31169643998146057, "step": 607, "step_time": 30.21841999299977 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 1.4009778648614883, "epoch": 0.00608, "grad_norm": 0.18527615070343018, "kl": 0.9788959622383118, "learning_rate": 7.99996972122131e-06, "loss": -0.0514, "step": 608, "step_time": 5.648707933998594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.34375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8132106065750122, "epoch": 0.00609, "frac_reward_zero_std": 0.0, "grad_norm": 0.0817028358578682, "kl": 0.5252716057002544, "learning_rate": 7.99996961525909e-06, "loss": -0.0978, "num_tokens": 19099442.0, "reward": -13.983162879943848, "reward_std": 25.15252113342285, "rewards/rollout_reward_func/mean": -13.983162879943848, "rewards/rollout_reward_func/std": 25.15252113342285, "sampling/importance_sampling_ratio/max": 1.7388771772384644, "sampling/importance_sampling_ratio/mean": 0.6509032845497131, "sampling/importance_sampling_ratio/min": 3.171300340909511e-05, "sampling/sampling_logp_difference/max": 2.2397351264953613, "sampling/sampling_logp_difference/mean": 0.3548632860183716, "step": 609, "step_time": 27.115958174999832 }, { "clip_ratio/high_max": 0.043750000186264515, "clip_ratio/high_mean": 0.010937500046566129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 1.8275059163570404, "epoch": 0.0061, "grad_norm": 0.07576092332601547, "kl": 0.4733543340116739, "learning_rate": 7.999969509111784e-06, "loss": -0.0977, "step": 610, "step_time": 5.503015271997356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.126356989145279, "epoch": 0.00611, "frac_reward_zero_std": 0.0, "grad_norm": 0.12045647948980331, "kl": 0.32144301757216454, "learning_rate": 7.999969402779394e-06, "loss": -0.0986, "num_tokens": 19173829.0, "reward": -21.535905838012695, "reward_std": 29.24544334411621, "rewards/rollout_reward_func/mean": -21.535905838012695, "rewards/rollout_reward_func/std": 29.245441436767578, "sampling/importance_sampling_ratio/max": 1.3351547718048096, "sampling/importance_sampling_ratio/mean": 0.4988451898097992, "sampling/importance_sampling_ratio/min": 3.476491110632196e-05, "sampling/sampling_logp_difference/max": 1.5430246591567993, "sampling/sampling_logp_difference/mean": 0.34489956498146057, "step": 611, "step_time": 25.642141022997748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1332616806030273, "epoch": 0.00612, "grad_norm": 0.12777458131313324, "kl": 0.3093788381665945, "learning_rate": 7.999969296261916e-06, "loss": -0.0984, "step": 612, "step_time": 5.330482817002121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 4.050000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1389512419700623, "epoch": 0.00613, "frac_reward_zero_std": 0.0, "grad_norm": 0.09928254783153534, "kl": 0.2122911475598812, "learning_rate": 7.999969189559353e-06, "loss": -0.1152, "num_tokens": 19252561.0, "reward": -17.989948272705078, "reward_std": 26.790205001831055, "rewards/rollout_reward_func/mean": -17.989948272705078, "rewards/rollout_reward_func/std": 26.790205001831055, "sampling/importance_sampling_ratio/max": 1.4658344984054565, "sampling/importance_sampling_ratio/mean": 0.7312487363815308, "sampling/importance_sampling_ratio/min": 3.0086880542512517e-06, "sampling/sampling_logp_difference/max": 1.8224390745162964, "sampling/sampling_logp_difference/mean": 0.39532041549682617, "step": 613, "step_time": 25.95003098899906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.143752008676529, "epoch": 0.00614, "grad_norm": 0.10140038281679153, "kl": 0.2091820202767849, "learning_rate": 7.999969082671705e-06, "loss": -0.1149, "step": 614, "step_time": 5.974937082995893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1452705562114716, "epoch": 0.00615, "frac_reward_zero_std": 0.0, "grad_norm": 0.15804420411586761, "kl": 0.3305419161915779, "learning_rate": 7.999968975598971e-06, "loss": -0.0848, "num_tokens": 19328891.0, "reward": -20.98773956298828, "reward_std": 26.353408813476562, "rewards/rollout_reward_func/mean": -20.98773956298828, "rewards/rollout_reward_func/std": 26.353408813476562, "sampling/importance_sampling_ratio/max": 1.512174129486084, "sampling/importance_sampling_ratio/mean": 0.5746089816093445, "sampling/importance_sampling_ratio/min": 2.4975297492346726e-06, "sampling/sampling_logp_difference/max": 1.791580080986023, "sampling/sampling_logp_difference/mean": 0.37253570556640625, "step": 615, "step_time": 26.369679616000212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1513692438602448, "epoch": 0.00616, "grad_norm": 0.1699775904417038, "kl": 0.31129007413983345, "learning_rate": 7.99996886834115e-06, "loss": -0.0852, "step": 616, "step_time": 5.547387998998602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 10.03125, "completions/mean_terminated_length": 4.0625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7436825037002563, "epoch": 0.00617, "frac_reward_zero_std": 0.0, "grad_norm": 0.15566569566726685, "kl": 0.2617926709353924, "learning_rate": 7.999968760898247e-06, "loss": -0.1026, "num_tokens": 19404241.0, "reward": -25.828657150268555, "reward_std": 25.750362396240234, "rewards/rollout_reward_func/mean": -25.828657150268555, "rewards/rollout_reward_func/std": 25.7503604888916, "sampling/importance_sampling_ratio/max": 1.408029317855835, "sampling/importance_sampling_ratio/mean": 0.48798811435699463, "sampling/importance_sampling_ratio/min": 1.5147689964578603e-06, "sampling/sampling_logp_difference/max": 2.0217397212982178, "sampling/sampling_logp_difference/mean": 0.46868056058883667, "step": 617, "step_time": 26.165613553002913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.749543696641922, "epoch": 0.00618, "grad_norm": 0.16118790209293365, "kl": 0.25567949935793877, "learning_rate": 7.999968653270258e-06, "loss": -0.1034, "step": 618, "step_time": 5.3669708389988955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1155946254730225, "epoch": 0.00619, "frac_reward_zero_std": 0.0, "grad_norm": 0.12754718959331512, "kl": 0.4990856274962425, "learning_rate": 7.99996854545718e-06, "loss": -0.104, "num_tokens": 19482532.0, "reward": -22.365764617919922, "reward_std": 21.385631561279297, "rewards/rollout_reward_func/mean": -22.365764617919922, "rewards/rollout_reward_func/std": 21.385631561279297, "sampling/importance_sampling_ratio/max": 1.6235792636871338, "sampling/importance_sampling_ratio/mean": 0.5731230974197388, "sampling/importance_sampling_ratio/min": 5.574068381974939e-06, "sampling/sampling_logp_difference/max": 2.104766845703125, "sampling/sampling_logp_difference/mean": 0.3744870722293854, "step": 619, "step_time": 27.04040009900018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1206756234169006, "epoch": 0.0062, "grad_norm": 0.12966905534267426, "kl": 0.47951839864254, "learning_rate": 7.99996843745902e-06, "loss": -0.1038, "step": 620, "step_time": 5.520055858001797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 4.958333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8797029852867126, "epoch": 0.00621, "frac_reward_zero_std": 0.0, "grad_norm": 0.16816407442092896, "kl": 0.6702649369835854, "learning_rate": 7.999968329275773e-06, "loss": -0.0721, "num_tokens": 19563382.0, "reward": -16.849027633666992, "reward_std": 23.1032657623291, "rewards/rollout_reward_func/mean": -16.849027633666992, "rewards/rollout_reward_func/std": 23.10326385498047, "sampling/importance_sampling_ratio/max": 1.5001060962677002, "sampling/importance_sampling_ratio/mean": 0.6350655555725098, "sampling/importance_sampling_ratio/min": 2.115899633281515e-06, "sampling/sampling_logp_difference/max": 1.715463399887085, "sampling/sampling_logp_difference/mean": 0.3570702373981476, "step": 621, "step_time": 28.305663245999312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8894944787025452, "epoch": 0.00622, "grad_norm": 0.15526586771011353, "kl": 0.6082751154899597, "learning_rate": 7.99996822090744e-06, "loss": -0.0727, "step": 622, "step_time": 5.587456827002825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 6.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.115492582321167, "epoch": 0.00623, "frac_reward_zero_std": 0.0, "grad_norm": 0.13198821246623993, "kl": 0.09504146873950958, "learning_rate": 7.999968112354022e-06, "loss": -0.1026, "num_tokens": 19632569.0, "reward": -33.045249938964844, "reward_std": 24.385406494140625, "rewards/rollout_reward_func/mean": -33.045249938964844, "rewards/rollout_reward_func/std": 24.385406494140625, "sampling/importance_sampling_ratio/max": 1.3585413694381714, "sampling/importance_sampling_ratio/mean": 0.3382173180580139, "sampling/importance_sampling_ratio/min": 2.0531896183229037e-08, "sampling/sampling_logp_difference/max": 2.2075390815734863, "sampling/sampling_logp_difference/mean": 0.4557814598083496, "step": 623, "step_time": 23.210882125002172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.123735010623932, "epoch": 0.00624, "grad_norm": 0.1315007209777832, "kl": 0.09227718319743872, "learning_rate": 7.999968003615517e-06, "loss": -0.103, "step": 624, "step_time": 5.731469048998406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.8125, "completions/mean_terminated_length": 4.933333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5731855034828186, "epoch": 0.00625, "frac_reward_zero_std": 0.0, "grad_norm": 0.10165689140558243, "kl": 0.22416307777166367, "learning_rate": 7.99996789469193e-06, "loss": -0.0946, "num_tokens": 19707170.0, "reward": -26.054611206054688, "reward_std": 24.064186096191406, "rewards/rollout_reward_func/mean": -26.054611206054688, "rewards/rollout_reward_func/std": 24.064184188842773, "sampling/importance_sampling_ratio/max": 1.363811731338501, "sampling/importance_sampling_ratio/mean": 0.41586363315582275, "sampling/importance_sampling_ratio/min": 6.746279541403055e-05, "sampling/sampling_logp_difference/max": 1.9065784215927124, "sampling/sampling_logp_difference/mean": 0.39969897270202637, "step": 625, "step_time": 25.469609295001646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5701730251312256, "epoch": 0.00626, "grad_norm": 0.10267152637243271, "kl": 0.21833933889865875, "learning_rate": 7.999967785583254e-06, "loss": -0.095, "step": 626, "step_time": 5.477091943999767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3203022181987762, "epoch": 0.00627, "frac_reward_zero_std": 0.0, "grad_norm": 0.09378661960363388, "kl": 0.3239961937069893, "learning_rate": 7.999967676289494e-06, "loss": -0.1035, "num_tokens": 19785222.0, "reward": -17.42394256591797, "reward_std": 23.214527130126953, "rewards/rollout_reward_func/mean": -17.42394256591797, "rewards/rollout_reward_func/std": 23.214527130126953, "sampling/importance_sampling_ratio/max": 1.4610724449157715, "sampling/importance_sampling_ratio/mean": 0.5784198045730591, "sampling/importance_sampling_ratio/min": 1.2737187660150084e-07, "sampling/sampling_logp_difference/max": 2.318121910095215, "sampling/sampling_logp_difference/mean": 0.4345320463180542, "step": 627, "step_time": 27.5897384029995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3263668417930603, "epoch": 0.00628, "grad_norm": 0.09470919519662857, "kl": 0.3148778025060892, "learning_rate": 7.99996756681065e-06, "loss": -0.1031, "step": 628, "step_time": 5.865866419999293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.307404786348343, "epoch": 0.00629, "frac_reward_zero_std": 0.0, "grad_norm": 0.1145913302898407, "kl": 0.2376404143869877, "learning_rate": 7.999967457146718e-06, "loss": -0.1106, "num_tokens": 19863262.0, "reward": -19.530181884765625, "reward_std": 24.504213333129883, "rewards/rollout_reward_func/mean": -19.530181884765625, "rewards/rollout_reward_func/std": 24.50421142578125, "sampling/importance_sampling_ratio/max": 1.4237879514694214, "sampling/importance_sampling_ratio/mean": 0.6530815362930298, "sampling/importance_sampling_ratio/min": 4.3160497398275766e-09, "sampling/sampling_logp_difference/max": 2.0064969062805176, "sampling/sampling_logp_difference/mean": 0.43631941080093384, "step": 629, "step_time": 27.577689664996797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.308304727077484, "epoch": 0.0063, "grad_norm": 0.11547376215457916, "kl": 0.23866429924964905, "learning_rate": 7.9999673472977e-06, "loss": -0.1113, "step": 630, "step_time": 5.741538720001699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 10.40625, "completions/mean_terminated_length": 4.066667079925537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.893839120864868, "epoch": 0.00631, "frac_reward_zero_std": 0.0, "grad_norm": 0.10840391367673874, "kl": 0.1557929962873459, "learning_rate": 7.999967237263601e-06, "loss": -0.1126, "num_tokens": 19938856.0, "reward": -25.070907592773438, "reward_std": 28.428180694580078, "rewards/rollout_reward_func/mean": -25.070907592773438, "rewards/rollout_reward_func/std": 28.428178787231445, "sampling/importance_sampling_ratio/max": 1.514614224433899, "sampling/importance_sampling_ratio/mean": 0.49853515625, "sampling/importance_sampling_ratio/min": 6.772615535055593e-08, "sampling/sampling_logp_difference/max": 1.7471516132354736, "sampling/sampling_logp_difference/mean": 0.437521755695343, "step": 631, "step_time": 25.283979945999818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.894266188144684, "epoch": 0.00632, "grad_norm": 0.10582112520933151, "kl": 0.1566600650548935, "learning_rate": 7.999967127044413e-06, "loss": -0.1129, "step": 632, "step_time": 5.524168603000362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.060067504644394, "epoch": 0.00633, "frac_reward_zero_std": 0.0, "grad_norm": 0.13314543664455414, "kl": 0.30550356954336166, "learning_rate": 7.99996701664014e-06, "loss": -0.1057, "num_tokens": 20015501.0, "reward": -20.65645980834961, "reward_std": 22.241308212280273, "rewards/rollout_reward_func/mean": -20.65645980834961, "rewards/rollout_reward_func/std": 22.24130630493164, "sampling/importance_sampling_ratio/max": 1.4470492601394653, "sampling/importance_sampling_ratio/mean": 0.682289183139801, "sampling/importance_sampling_ratio/min": 2.900817541728884e-08, "sampling/sampling_logp_difference/max": 2.148359775543213, "sampling/sampling_logp_difference/mean": 0.35485386848449707, "step": 633, "step_time": 26.568097399998805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.055324226617813, "epoch": 0.00634, "grad_norm": 0.13442516326904297, "kl": 0.3147093541920185, "learning_rate": 7.999966906050781e-06, "loss": -0.1058, "step": 634, "step_time": 6.373458432999541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 5.727272987365723, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4817110300064087, "epoch": 0.00635, "frac_reward_zero_std": 0.0, "grad_norm": 0.06547622382640839, "kl": 0.2924400791525841, "learning_rate": 7.999966795276339e-06, "loss": -0.0976, "num_tokens": 20094175.0, "reward": -19.28155517578125, "reward_std": 24.66374969482422, "rewards/rollout_reward_func/mean": -19.28155517578125, "rewards/rollout_reward_func/std": 24.66374969482422, "sampling/importance_sampling_ratio/max": 1.3679957389831543, "sampling/importance_sampling_ratio/mean": 0.5458931922912598, "sampling/importance_sampling_ratio/min": 6.55172016195138e-07, "sampling/sampling_logp_difference/max": 1.8327083587646484, "sampling/sampling_logp_difference/mean": 0.39978522062301636, "step": 635, "step_time": 26.142089654998927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4841871857643127, "epoch": 0.00636, "grad_norm": 0.06555991619825363, "kl": 0.2906426042318344, "learning_rate": 7.999966684316808e-06, "loss": -0.098, "step": 636, "step_time": 6.068421760999627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.09375, "completions/mean_terminated_length": 4.3684210777282715, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4818503856658936, "epoch": 0.00637, "frac_reward_zero_std": 0.0, "grad_norm": 0.13246338069438934, "kl": 1.2510506063699722, "learning_rate": 7.999966573172194e-06, "loss": -0.0851, "num_tokens": 20169684.0, "reward": -18.182668685913086, "reward_std": 25.114416122436523, "rewards/rollout_reward_func/mean": -18.182668685913086, "rewards/rollout_reward_func/std": 25.11441421508789, "sampling/importance_sampling_ratio/max": 1.5726338624954224, "sampling/importance_sampling_ratio/mean": 0.53236323595047, "sampling/importance_sampling_ratio/min": 2.4392534214712214e-06, "sampling/sampling_logp_difference/max": 1.8897786140441895, "sampling/sampling_logp_difference/mean": 0.42165422439575195, "step": 637, "step_time": 27.3623508590008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.480437457561493, "epoch": 0.00638, "grad_norm": 0.12547343969345093, "kl": 1.1374957375228405, "learning_rate": 7.999966461842494e-06, "loss": -0.0854, "step": 638, "step_time": 5.899495442999978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.571569085121155, "epoch": 0.00639, "frac_reward_zero_std": 0.0, "grad_norm": 0.13940435647964478, "kl": 0.2829723581671715, "learning_rate": 7.999966350327711e-06, "loss": -0.0798, "num_tokens": 20248539.0, "reward": -26.542625427246094, "reward_std": 27.821361541748047, "rewards/rollout_reward_func/mean": -26.542625427246094, "rewards/rollout_reward_func/std": 27.821361541748047, "sampling/importance_sampling_ratio/max": 1.3285729885101318, "sampling/importance_sampling_ratio/mean": 0.3709157109260559, "sampling/importance_sampling_ratio/min": 7.983718433024478e-07, "sampling/sampling_logp_difference/max": 2.195972442626953, "sampling/sampling_logp_difference/mean": 0.41709035634994507, "step": 639, "step_time": 25.060031322002033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5748817920684814, "epoch": 0.0064, "grad_norm": 0.14241278171539307, "kl": 0.26868366822600365, "learning_rate": 7.99996623862784e-06, "loss": -0.0803, "step": 640, "step_time": 5.9727085570011695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.40625, "completions/mean_terminated_length": 6.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.961980640888214, "epoch": 0.00641, "frac_reward_zero_std": 0.0, "grad_norm": 0.11739874631166458, "kl": 0.1562904790043831, "learning_rate": 7.999966126742882e-06, "loss": -0.0951, "num_tokens": 20328010.0, "reward": -33.749820709228516, "reward_std": 22.388980865478516, "rewards/rollout_reward_func/mean": -33.749820709228516, "rewards/rollout_reward_func/std": 22.388978958129883, "sampling/importance_sampling_ratio/max": 1.3459053039550781, "sampling/importance_sampling_ratio/mean": 0.36125674843788147, "sampling/importance_sampling_ratio/min": 1.1395909496059176e-06, "sampling/sampling_logp_difference/max": 1.7963099479675293, "sampling/sampling_logp_difference/mean": 0.4582155644893646, "step": 641, "step_time": 27.199211322000338 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 2.9662522077560425, "epoch": 0.00642, "grad_norm": 0.10819130390882492, "kl": 0.1524019604548812, "learning_rate": 7.999966014672842e-06, "loss": -0.0952, "step": 642, "step_time": 5.884242903000995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.25, "completions/mean_terminated_length": 5.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.79657518863678, "epoch": 0.00643, "frac_reward_zero_std": 0.0, "grad_norm": 0.08995018899440765, "kl": 0.30316636338829994, "learning_rate": 7.999965902417713e-06, "loss": -0.1116, "num_tokens": 20412867.0, "reward": -24.117273330688477, "reward_std": 31.093578338623047, "rewards/rollout_reward_func/mean": -24.117273330688477, "rewards/rollout_reward_func/std": 31.09358024597168, "sampling/importance_sampling_ratio/max": 1.5491750240325928, "sampling/importance_sampling_ratio/mean": 0.5280014276504517, "sampling/importance_sampling_ratio/min": 2.9908850507354146e-08, "sampling/sampling_logp_difference/max": 2.1176140308380127, "sampling/sampling_logp_difference/mean": 0.5055174827575684, "step": 643, "step_time": 30.35977593899952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.801402747631073, "epoch": 0.00644, "grad_norm": 0.08933348208665848, "kl": 0.30932917818427086, "learning_rate": 7.999965789977501e-06, "loss": -0.1114, "step": 644, "step_time": 6.290921212001194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 4.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.257305383682251, "epoch": 0.00645, "frac_reward_zero_std": 0.0, "grad_norm": 0.166189044713974, "kl": 0.21712860092520714, "learning_rate": 7.999965677352206e-06, "loss": -0.1104, "num_tokens": 20495321.0, "reward": -24.807941436767578, "reward_std": 26.955074310302734, "rewards/rollout_reward_func/mean": -24.807941436767578, "rewards/rollout_reward_func/std": 26.955074310302734, "sampling/importance_sampling_ratio/max": 1.4262021780014038, "sampling/importance_sampling_ratio/mean": 0.6168153882026672, "sampling/importance_sampling_ratio/min": 2.1959551759209717e-06, "sampling/sampling_logp_difference/max": 2.068978786468506, "sampling/sampling_logp_difference/mean": 0.3670220375061035, "step": 645, "step_time": 28.110847531997933 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.018749999813735485, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018749999813735485, "entropy": 2.2545323371887207, "epoch": 0.00646, "grad_norm": 0.14386780560016632, "kl": 0.21238235011696815, "learning_rate": 7.999965564541822e-06, "loss": -0.1108, "step": 646, "step_time": 6.545823010001186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0625, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.946557402610779, "epoch": 0.00647, "frac_reward_zero_std": 0.0, "grad_norm": 0.1390216201543808, "kl": 0.1861478742212057, "learning_rate": 7.999965451546353e-06, "loss": -0.092, "num_tokens": 20577434.0, "reward": -29.469722747802734, "reward_std": 31.10002326965332, "rewards/rollout_reward_func/mean": -29.469722747802734, "rewards/rollout_reward_func/std": 31.10002326965332, "sampling/importance_sampling_ratio/max": 1.271155595779419, "sampling/importance_sampling_ratio/mean": 0.39074647426605225, "sampling/importance_sampling_ratio/min": 1.3208692450916715e-07, "sampling/sampling_logp_difference/max": 2.052212715148926, "sampling/sampling_logp_difference/mean": 0.4752662479877472, "step": 647, "step_time": 27.444183833998977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.943458616733551, "epoch": 0.00648, "grad_norm": 0.13779345154762268, "kl": 0.18821552023291588, "learning_rate": 7.999965338365799e-06, "loss": -0.0921, "step": 648, "step_time": 6.486810698999761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 5.08695650100708, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0207452476024628, "epoch": 0.00649, "frac_reward_zero_std": 0.0, "grad_norm": 0.08130314946174622, "kl": 0.31066974997520447, "learning_rate": 7.999965225000161e-06, "loss": -0.087, "num_tokens": 20659614.0, "reward": -21.325838088989258, "reward_std": 25.76519012451172, "rewards/rollout_reward_func/mean": -21.325838088989258, "rewards/rollout_reward_func/std": 25.765188217163086, "sampling/importance_sampling_ratio/max": 1.4884597063064575, "sampling/importance_sampling_ratio/mean": 0.6055600643157959, "sampling/importance_sampling_ratio/min": 2.3673303530813428e-06, "sampling/sampling_logp_difference/max": 1.9964699745178223, "sampling/sampling_logp_difference/mean": 0.35084134340286255, "step": 649, "step_time": 30.02990013100134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0148517191410065, "epoch": 0.0065, "grad_norm": 0.080906443297863, "kl": 0.31123338639736176, "learning_rate": 7.999965111449436e-06, "loss": -0.0869, "step": 650, "step_time": 6.303547913998045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 4.052631378173828, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0055125951766968, "epoch": 0.00651, "frac_reward_zero_std": 0.0, "grad_norm": 0.13708266615867615, "kl": 0.5045887790620327, "learning_rate": 7.999964997713627e-06, "loss": -0.0998, "num_tokens": 20744801.0, "reward": -22.802833557128906, "reward_std": 24.58704376220703, "rewards/rollout_reward_func/mean": -22.802833557128906, "rewards/rollout_reward_func/std": 24.5870418548584, "sampling/importance_sampling_ratio/max": 1.4864815473556519, "sampling/importance_sampling_ratio/mean": 0.6515681743621826, "sampling/importance_sampling_ratio/min": 1.875398538686568e-06, "sampling/sampling_logp_difference/max": 2.139604091644287, "sampling/sampling_logp_difference/mean": 0.3666258454322815, "step": 651, "step_time": 29.934231950999674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0020836293697357, "epoch": 0.00652, "grad_norm": 0.13655400276184082, "kl": 0.5063628070056438, "learning_rate": 7.999964883792732e-06, "loss": -0.1, "step": 652, "step_time": 6.096558183002344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.0416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6610270142555237, "epoch": 0.00653, "frac_reward_zero_std": 0.0, "grad_norm": 0.0686761736869812, "kl": 0.4176147133111954, "learning_rate": 7.999964769686752e-06, "loss": -0.0961, "num_tokens": 20834950.0, "reward": -16.290876388549805, "reward_std": 26.863758087158203, "rewards/rollout_reward_func/mean": -16.290876388549805, "rewards/rollout_reward_func/std": 26.86375617980957, "sampling/importance_sampling_ratio/max": 1.4339600801467896, "sampling/importance_sampling_ratio/mean": 0.7988111972808838, "sampling/importance_sampling_ratio/min": 7.033463043626398e-05, "sampling/sampling_logp_difference/max": 1.3259472846984863, "sampling/sampling_logp_difference/mean": 0.2953815162181854, "step": 653, "step_time": 32.154296340000656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6556038856506348, "epoch": 0.00654, "grad_norm": 0.06974176317453384, "kl": 0.4256502762436867, "learning_rate": 7.999964655395686e-06, "loss": -0.0967, "step": 654, "step_time": 6.149085045997708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.285714149475098, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.072273850440979, "epoch": 0.00655, "frac_reward_zero_std": 0.0, "grad_norm": 0.09766478836536407, "kl": 0.27831051498651505, "learning_rate": 7.999964540919536e-06, "loss": -0.101, "num_tokens": 20923539.0, "reward": -18.391231536865234, "reward_std": 27.48537254333496, "rewards/rollout_reward_func/mean": -18.391231536865234, "rewards/rollout_reward_func/std": 27.48537254333496, "sampling/importance_sampling_ratio/max": 1.4122645854949951, "sampling/importance_sampling_ratio/mean": 0.729995608329773, "sampling/importance_sampling_ratio/min": 1.0734265032397161e-08, "sampling/sampling_logp_difference/max": 2.565030574798584, "sampling/sampling_logp_difference/mean": 0.4006303548812866, "step": 655, "step_time": 31.43962716200076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0663557648658752, "epoch": 0.00656, "grad_norm": 0.09591426700353622, "kl": 0.2833607420325279, "learning_rate": 7.9999644262583e-06, "loss": -0.1015, "step": 656, "step_time": 6.655101202000878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.319999694824219, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.723981648683548, "epoch": 0.00657, "frac_reward_zero_std": 0.0, "grad_norm": 0.11187063157558441, "kl": 0.5286018177866936, "learning_rate": 7.99996431141198e-06, "loss": -0.0671, "num_tokens": 21013269.0, "reward": -17.61727523803711, "reward_std": 22.10216522216797, "rewards/rollout_reward_func/mean": -17.61727523803711, "rewards/rollout_reward_func/std": 22.10216522216797, "sampling/importance_sampling_ratio/max": 1.3445420265197754, "sampling/importance_sampling_ratio/mean": 0.6686777472496033, "sampling/importance_sampling_ratio/min": 0.00011825114779639989, "sampling/sampling_logp_difference/max": 1.6648077964782715, "sampling/sampling_logp_difference/mean": 0.33862268924713135, "step": 657, "step_time": 32.57613126599972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.719174712896347, "epoch": 0.00658, "grad_norm": 0.11536981165409088, "kl": 0.5493535324931145, "learning_rate": 7.999964196380572e-06, "loss": -0.0675, "step": 658, "step_time": 6.614634792000288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.5217390060424805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6351659297943115, "epoch": 0.00659, "frac_reward_zero_std": 0.0, "grad_norm": 0.11263998597860336, "kl": 0.7482200860977173, "learning_rate": 7.99996408116408e-06, "loss": -0.0755, "num_tokens": 21102105.0, "reward": -17.46851921081543, "reward_std": 25.406951904296875, "rewards/rollout_reward_func/mean": -17.46851921081543, "rewards/rollout_reward_func/std": 25.406951904296875, "sampling/importance_sampling_ratio/max": 1.3234004974365234, "sampling/importance_sampling_ratio/mean": 0.575827419757843, "sampling/importance_sampling_ratio/min": 2.950788984890096e-05, "sampling/sampling_logp_difference/max": 1.4553908109664917, "sampling/sampling_logp_difference/mean": 0.31993532180786133, "step": 659, "step_time": 31.35919929999909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6294025480747223, "epoch": 0.0066, "grad_norm": 0.11023439466953278, "kl": 0.7473734468221664, "learning_rate": 7.999963965762504e-06, "loss": -0.0757, "step": 660, "step_time": 6.55778629300039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.33887829631567, "epoch": 0.00661, "frac_reward_zero_std": 0.0, "grad_norm": 0.07384033501148224, "kl": 0.6085587665438652, "learning_rate": 7.99996385017584e-06, "loss": -0.0867, "num_tokens": 21189146.0, "reward": -13.143461227416992, "reward_std": 21.791976928710938, "rewards/rollout_reward_func/mean": -13.143461227416992, "rewards/rollout_reward_func/std": 21.791976928710938, "sampling/importance_sampling_ratio/max": 1.5417901277542114, "sampling/importance_sampling_ratio/mean": 0.8479669094085693, "sampling/importance_sampling_ratio/min": 3.3112396380374776e-08, "sampling/sampling_logp_difference/max": 2.3904178142547607, "sampling/sampling_logp_difference/mean": 0.3212655186653137, "step": 661, "step_time": 32.539062099003786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3355657309293747, "epoch": 0.00662, "grad_norm": 0.07442178577184677, "kl": 0.6134162247180939, "learning_rate": 7.999963734404094e-06, "loss": -0.0868, "step": 662, "step_time": 6.104107557999669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.200173705816269, "epoch": 0.00663, "frac_reward_zero_std": 0.0, "grad_norm": 0.1612733155488968, "kl": 0.6561916619539261, "learning_rate": 7.999963618447261e-06, "loss": -0.0665, "num_tokens": 21278766.0, "reward": -26.287336349487305, "reward_std": 21.904638290405273, "rewards/rollout_reward_func/mean": -26.287336349487305, "rewards/rollout_reward_func/std": 21.904638290405273, "sampling/importance_sampling_ratio/max": 1.4737319946289062, "sampling/importance_sampling_ratio/mean": 0.41217416524887085, "sampling/importance_sampling_ratio/min": 4.5695978769799694e-06, "sampling/sampling_logp_difference/max": 1.5536222457885742, "sampling/sampling_logp_difference/mean": 0.38421159982681274, "step": 663, "step_time": 31.541647386999102 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 2.2014578580856323, "epoch": 0.00664, "grad_norm": 0.15337136387825012, "kl": 0.6307444702833891, "learning_rate": 7.999963502305342e-06, "loss": -0.0672, "step": 664, "step_time": 6.292803023998204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8561334908008575, "epoch": 0.00665, "frac_reward_zero_std": 0.0, "grad_norm": 0.1433493047952652, "kl": 0.3415103070437908, "learning_rate": 7.999963385978338e-06, "loss": -0.0942, "num_tokens": 21368573.0, "reward": -17.482463836669922, "reward_std": 24.575098037719727, "rewards/rollout_reward_func/mean": -17.482463836669922, "rewards/rollout_reward_func/std": 24.575098037719727, "sampling/importance_sampling_ratio/max": 1.527130126953125, "sampling/importance_sampling_ratio/mean": 0.7359402179718018, "sampling/importance_sampling_ratio/min": 2.095762738463236e-06, "sampling/sampling_logp_difference/max": 1.6519129276275635, "sampling/sampling_logp_difference/mean": 0.33399754762649536, "step": 665, "step_time": 31.744196063000345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.857534497976303, "epoch": 0.00666, "grad_norm": 0.14788591861724854, "kl": 0.34080328792333603, "learning_rate": 7.999963269466251e-06, "loss": -0.0948, "step": 666, "step_time": 6.16436008600067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.2727274894714355, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1132723093032837, "epoch": 0.00667, "frac_reward_zero_std": 0.0, "grad_norm": 0.13278691470623016, "kl": 0.26855411753058434, "learning_rate": 7.999963152769077e-06, "loss": -0.0993, "num_tokens": 21457298.0, "reward": -22.45656967163086, "reward_std": 21.240148544311523, "rewards/rollout_reward_func/mean": -22.45656967163086, "rewards/rollout_reward_func/std": 21.24014663696289, "sampling/importance_sampling_ratio/max": 1.4124212265014648, "sampling/importance_sampling_ratio/mean": 0.6595622301101685, "sampling/importance_sampling_ratio/min": 3.612920806972397e-07, "sampling/sampling_logp_difference/max": 1.9754395484924316, "sampling/sampling_logp_difference/mean": 0.35924455523490906, "step": 667, "step_time": 31.105397205998088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1132668554782867, "epoch": 0.00668, "grad_norm": 0.1367916613817215, "kl": 0.2663486748933792, "learning_rate": 7.999963035886818e-06, "loss": -0.0996, "step": 668, "step_time": 6.57150546900084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5348019003868103, "epoch": 0.00669, "frac_reward_zero_std": 0.0, "grad_norm": 0.2062409520149231, "kl": 0.4165898822247982, "learning_rate": 7.999962918819473e-06, "loss": -0.082, "num_tokens": 21548605.0, "reward": -15.689535140991211, "reward_std": 24.461008071899414, "rewards/rollout_reward_func/mean": -15.689535140991211, "rewards/rollout_reward_func/std": 24.46100616455078, "sampling/importance_sampling_ratio/max": 1.6456948518753052, "sampling/importance_sampling_ratio/mean": 0.80952388048172, "sampling/importance_sampling_ratio/min": 4.207759957353119e-06, "sampling/sampling_logp_difference/max": 1.9360888004302979, "sampling/sampling_logp_difference/mean": 0.3133859634399414, "step": 669, "step_time": 31.542703939001512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5300355553627014, "epoch": 0.0067, "grad_norm": 0.2101147621870041, "kl": 0.41812098026275635, "learning_rate": 7.999962801567045e-06, "loss": -0.0825, "step": 670, "step_time": 6.642526248999275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.90625, "completions/mean_terminated_length": 5.190476417541504, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.140243321657181, "epoch": 0.00671, "frac_reward_zero_std": 0.0, "grad_norm": 0.10056812316179276, "kl": 0.40297209843993187, "learning_rate": 7.99996268412953e-06, "loss": -0.1103, "num_tokens": 21633559.0, "reward": -25.09771156311035, "reward_std": 24.395618438720703, "rewards/rollout_reward_func/mean": -25.09771156311035, "rewards/rollout_reward_func/std": 24.395620346069336, "sampling/importance_sampling_ratio/max": 1.591501235961914, "sampling/importance_sampling_ratio/mean": 0.6135153770446777, "sampling/importance_sampling_ratio/min": 1.4955003280192614e-06, "sampling/sampling_logp_difference/max": 1.8732033967971802, "sampling/sampling_logp_difference/mean": 0.3683813214302063, "step": 671, "step_time": 29.643047862000458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1351345777511597, "epoch": 0.00672, "grad_norm": 0.10467872768640518, "kl": 0.40206367522478104, "learning_rate": 7.99996256650693e-06, "loss": -0.1109, "step": 672, "step_time": 7.015085442999407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.4375, "completions/mean_terminated_length": 4.476190567016602, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0601104497909546, "epoch": 0.00673, "frac_reward_zero_std": 0.0, "grad_norm": 0.16600848734378815, "kl": 0.4502246491611004, "learning_rate": 7.999962448699245e-06, "loss": -0.0897, "num_tokens": 21715994.0, "reward": -24.93099021911621, "reward_std": 24.62163543701172, "rewards/rollout_reward_func/mean": -24.93099021911621, "rewards/rollout_reward_func/std": 24.621633529663086, "sampling/importance_sampling_ratio/max": 1.4196431636810303, "sampling/importance_sampling_ratio/mean": 0.5547402501106262, "sampling/importance_sampling_ratio/min": 1.2453645581445016e-07, "sampling/sampling_logp_difference/max": 1.8517682552337646, "sampling/sampling_logp_difference/mean": 0.41018885374069214, "step": 673, "step_time": 29.76085186399905 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.055083394050598, "epoch": 0.00674, "grad_norm": 0.16880953311920166, "kl": 0.4554588347673416, "learning_rate": 7.999962330706475e-06, "loss": -0.0901, "step": 674, "step_time": 6.031466125998122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.782608985900879, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8918349146842957, "epoch": 0.00675, "frac_reward_zero_std": 0.0, "grad_norm": 0.1665429025888443, "kl": 0.46930424869060516, "learning_rate": 7.99996221252862e-06, "loss": -0.1072, "num_tokens": 21802375.0, "reward": -22.566238403320312, "reward_std": 24.618080139160156, "rewards/rollout_reward_func/mean": -22.566238403320312, "rewards/rollout_reward_func/std": 24.618078231811523, "sampling/importance_sampling_ratio/max": 1.6871020793914795, "sampling/importance_sampling_ratio/mean": 0.6961326599121094, "sampling/importance_sampling_ratio/min": 1.1739263754861895e-05, "sampling/sampling_logp_difference/max": 1.6330485343933105, "sampling/sampling_logp_difference/mean": 0.34052538871765137, "step": 675, "step_time": 30.702586208000866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8895714581012726, "epoch": 0.00676, "grad_norm": 0.16991615295410156, "kl": 0.4472098648548126, "learning_rate": 7.99996209416568e-06, "loss": -0.1082, "step": 676, "step_time": 6.762282738998692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 4.583333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.94219571352005, "epoch": 0.00677, "frac_reward_zero_std": 0.0, "grad_norm": 0.16625311970710754, "kl": 0.6730115488171577, "learning_rate": 7.999961975617654e-06, "loss": -0.0782, "num_tokens": 21893610.0, "reward": -15.901041984558105, "reward_std": 23.357282638549805, "rewards/rollout_reward_func/mean": -15.901041984558105, "rewards/rollout_reward_func/std": 23.357282638549805, "sampling/importance_sampling_ratio/max": 1.472316026687622, "sampling/importance_sampling_ratio/mean": 0.6340665817260742, "sampling/importance_sampling_ratio/min": 7.608047098983661e-07, "sampling/sampling_logp_difference/max": 1.7453367710113525, "sampling/sampling_logp_difference/mean": 0.3223382234573364, "step": 677, "step_time": 30.83159613799944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9371199309825897, "epoch": 0.00678, "grad_norm": 0.1585593819618225, "kl": 0.6769905984401703, "learning_rate": 7.999961856884545e-06, "loss": -0.0787, "step": 678, "step_time": 6.704306159002954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.480000019073486, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4473504349589348, "epoch": 0.00679, "frac_reward_zero_std": 0.0, "grad_norm": 0.14618019759655, "kl": 0.8995693922042847, "learning_rate": 7.999961737966349e-06, "loss": -0.0728, "num_tokens": 21985636.0, "reward": -10.741098403930664, "reward_std": 21.635730743408203, "rewards/rollout_reward_func/mean": -10.741098403930664, "rewards/rollout_reward_func/std": 21.635730743408203, "sampling/importance_sampling_ratio/max": 1.6447126865386963, "sampling/importance_sampling_ratio/mean": 0.7035515308380127, "sampling/importance_sampling_ratio/min": 3.907212885678746e-06, "sampling/sampling_logp_difference/max": 1.848320484161377, "sampling/sampling_logp_difference/mean": 0.31361666321754456, "step": 679, "step_time": 35.079032684001504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4498214423656464, "epoch": 0.0068, "grad_norm": 0.15845690667629242, "kl": 0.989015094935894, "learning_rate": 7.999961618863067e-06, "loss": -0.0732, "step": 680, "step_time": 6.526039854003102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 11.65625, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.776858329772949, "epoch": 0.00681, "frac_reward_zero_std": 0.0, "grad_norm": 0.24994193017482758, "kl": 1.2977657597512007, "learning_rate": 7.999961499574702e-06, "loss": -0.0653, "num_tokens": 22068045.0, "reward": -33.70984649658203, "reward_std": 26.344738006591797, "rewards/rollout_reward_func/mean": -33.70984649658203, "rewards/rollout_reward_func/std": 26.344738006591797, "sampling/importance_sampling_ratio/max": 1.4524418115615845, "sampling/importance_sampling_ratio/mean": 0.28614282608032227, "sampling/importance_sampling_ratio/min": 2.4019482225412503e-05, "sampling/sampling_logp_difference/max": 1.7372980117797852, "sampling/sampling_logp_difference/mean": 0.39038023352622986, "step": 681, "step_time": 28.77915388099973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.783377945423126, "epoch": 0.00682, "grad_norm": 0.2265658676624298, "kl": 1.03402048535645, "learning_rate": 7.99996138010125e-06, "loss": -0.0666, "step": 682, "step_time": 6.6233442690008815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 4.279999732971191, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5907342433929443, "epoch": 0.00683, "frac_reward_zero_std": 0.0, "grad_norm": 0.20443764328956604, "kl": 0.8744380474090576, "learning_rate": 7.999961260442715e-06, "loss": -0.0896, "num_tokens": 22158259.0, "reward": -14.889429092407227, "reward_std": 24.735944747924805, "rewards/rollout_reward_func/mean": -14.889429092407227, "rewards/rollout_reward_func/std": 24.735944747924805, "sampling/importance_sampling_ratio/max": 1.378827452659607, "sampling/importance_sampling_ratio/mean": 0.7520580887794495, "sampling/importance_sampling_ratio/min": 1.415263341186801e-06, "sampling/sampling_logp_difference/max": 1.6704988479614258, "sampling/sampling_logp_difference/mean": 0.3519858717918396, "step": 683, "step_time": 32.15866575899963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6029666662216187, "epoch": 0.00684, "grad_norm": 0.18574212491512299, "kl": 0.7894719764590263, "learning_rate": 7.999961140599095e-06, "loss": -0.0905, "step": 684, "step_time": 6.295068850000462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 4.090909004211426, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.815757155418396, "epoch": 0.00685, "frac_reward_zero_std": 0.0, "grad_norm": 0.09961003810167313, "kl": 0.43232467770576477, "learning_rate": 7.999961020570388e-06, "loss": -0.094, "num_tokens": 22244975.0, "reward": -17.309967041015625, "reward_std": 26.313758850097656, "rewards/rollout_reward_func/mean": -17.309967041015625, "rewards/rollout_reward_func/std": 26.313758850097656, "sampling/importance_sampling_ratio/max": 1.6332932710647583, "sampling/importance_sampling_ratio/mean": 0.6979697346687317, "sampling/importance_sampling_ratio/min": 1.320657645464962e-07, "sampling/sampling_logp_difference/max": 2.254377841949463, "sampling/sampling_logp_difference/mean": 0.3689444959163666, "step": 685, "step_time": 32.925706154999716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.818581223487854, "epoch": 0.00686, "grad_norm": 0.09694860875606537, "kl": 0.4258435368537903, "learning_rate": 7.999960900356597e-06, "loss": -0.0942, "step": 686, "step_time": 6.285014961002162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 4.150000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.369971752166748, "epoch": 0.00687, "frac_reward_zero_std": 0.0, "grad_norm": 0.13563935458660126, "kl": 0.23327383399009705, "learning_rate": 7.999960779957721e-06, "loss": -0.0961, "num_tokens": 22336146.0, "reward": -20.34718894958496, "reward_std": 28.497610092163086, "rewards/rollout_reward_func/mean": -20.34718894958496, "rewards/rollout_reward_func/std": 28.497610092163086, "sampling/importance_sampling_ratio/max": 1.3899145126342773, "sampling/importance_sampling_ratio/mean": 0.6292546391487122, "sampling/importance_sampling_ratio/min": 8.423568509385859e-10, "sampling/sampling_logp_difference/max": 2.9778385162353516, "sampling/sampling_logp_difference/mean": 0.4099883437156677, "step": 687, "step_time": 29.698508230003426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3773017823696136, "epoch": 0.00688, "grad_norm": 0.13443595170974731, "kl": 0.22779962047934532, "learning_rate": 7.999960659373759e-06, "loss": -0.096, "step": 688, "step_time": 6.603846151001562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 5.159999847412109, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9589502215385437, "epoch": 0.00689, "frac_reward_zero_std": 0.0, "grad_norm": 0.09198153018951416, "kl": 0.31003662571310997, "learning_rate": 7.999960538604712e-06, "loss": -0.0949, "num_tokens": 22424669.0, "reward": -16.933273315429688, "reward_std": 24.45439338684082, "rewards/rollout_reward_func/mean": -16.933273315429688, "rewards/rollout_reward_func/std": 24.45439338684082, "sampling/importance_sampling_ratio/max": 1.2847177982330322, "sampling/importance_sampling_ratio/mean": 0.6645220518112183, "sampling/importance_sampling_ratio/min": 9.861825134294122e-08, "sampling/sampling_logp_difference/max": 2.198701858520508, "sampling/sampling_logp_difference/mean": 0.32377350330352783, "step": 689, "step_time": 32.36201724399871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9626691043376923, "epoch": 0.0069, "grad_norm": 0.0901421383023262, "kl": 0.3090794198215008, "learning_rate": 7.999960417650583e-06, "loss": -0.0949, "step": 690, "step_time": 6.785939370000051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.90625, "completions/mean_terminated_length": 4.227272987365723, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5478522181510925, "epoch": 0.00691, "frac_reward_zero_std": 0.0, "grad_norm": 0.15685732662677765, "kl": 0.43200526386499405, "learning_rate": 7.999960296511364e-06, "loss": -0.0593, "num_tokens": 22514701.0, "reward": -12.828824996948242, "reward_std": 25.01542854309082, "rewards/rollout_reward_func/mean": -12.828824996948242, "rewards/rollout_reward_func/std": 25.01542854309082, "sampling/importance_sampling_ratio/max": 1.550368309020996, "sampling/importance_sampling_ratio/mean": 0.6317896842956543, "sampling/importance_sampling_ratio/min": 1.5632113381514046e-09, "sampling/sampling_logp_difference/max": 1.9267222881317139, "sampling/sampling_logp_difference/mean": 0.3121344745159149, "step": 691, "step_time": 32.332874259000164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.554211676120758, "epoch": 0.00692, "grad_norm": 0.16004937887191772, "kl": 0.4225546643137932, "learning_rate": 7.999960175187063e-06, "loss": -0.0596, "step": 692, "step_time": 6.930586360002053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 4.700000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.49189156293869, "epoch": 0.00693, "frac_reward_zero_std": 0.0, "grad_norm": 0.15409080684185028, "kl": 0.2975580096244812, "learning_rate": 7.999960053677677e-06, "loss": -0.1091, "num_tokens": 22600104.0, "reward": -23.57364273071289, "reward_std": 25.519460678100586, "rewards/rollout_reward_func/mean": -23.57364273071289, "rewards/rollout_reward_func/std": 25.519460678100586, "sampling/importance_sampling_ratio/max": 1.383339762687683, "sampling/importance_sampling_ratio/mean": 0.5467262268066406, "sampling/importance_sampling_ratio/min": 6.718609711242607e-06, "sampling/sampling_logp_difference/max": 1.8251495361328125, "sampling/sampling_logp_difference/mean": 0.39614173769950867, "step": 693, "step_time": 29.405526291999195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5030011534690857, "epoch": 0.00694, "grad_norm": 0.1575859934091568, "kl": 0.2890687324106693, "learning_rate": 7.999959931983205e-06, "loss": -0.1093, "step": 694, "step_time": 6.0436464289996366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.59375, "completions/mean_terminated_length": 4.611111164093018, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3958257138729095, "epoch": 0.00695, "frac_reward_zero_std": 0.0, "grad_norm": 0.1255095899105072, "kl": 0.3439358212053776, "learning_rate": 7.999959810103648e-06, "loss": -0.091, "num_tokens": 22684903.0, "reward": -25.366397857666016, "reward_std": 28.82420539855957, "rewards/rollout_reward_func/mean": -25.366397857666016, "rewards/rollout_reward_func/std": 28.824203491210938, "sampling/importance_sampling_ratio/max": 1.385697603225708, "sampling/importance_sampling_ratio/mean": 0.5222383737564087, "sampling/importance_sampling_ratio/min": 1.6795956980786286e-07, "sampling/sampling_logp_difference/max": 2.5256025791168213, "sampling/sampling_logp_difference/mean": 0.4036051034927368, "step": 695, "step_time": 30.183350227998744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4050183296203613, "epoch": 0.00696, "grad_norm": 0.12955744564533234, "kl": 0.32208866626024246, "learning_rate": 7.999959688039008e-06, "loss": -0.0912, "step": 696, "step_time": 6.148303360998398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.46875, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4860571026802063, "epoch": 0.00697, "frac_reward_zero_std": 0.0, "grad_norm": 0.06321074813604355, "kl": 0.27468449994921684, "learning_rate": 7.999959565789282e-06, "loss": -0.085, "num_tokens": 22770962.0, "reward": -19.89962387084961, "reward_std": 28.261167526245117, "rewards/rollout_reward_func/mean": -19.89962387084961, "rewards/rollout_reward_func/std": 28.261167526245117, "sampling/importance_sampling_ratio/max": 1.6086021661758423, "sampling/importance_sampling_ratio/mean": 0.509413480758667, "sampling/importance_sampling_ratio/min": 1.5131567873694074e-11, "sampling/sampling_logp_difference/max": 2.0962929725646973, "sampling/sampling_logp_difference/mean": 0.4493051767349243, "step": 697, "step_time": 30.389563353000995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.493159145116806, "epoch": 0.00698, "grad_norm": 0.06515277922153473, "kl": 0.2622462064027786, "learning_rate": 7.999959443354469e-06, "loss": -0.0855, "step": 698, "step_time": 6.57580537299873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 9.125, "completions/mean_terminated_length": 4.4210524559021, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.196862667798996, "epoch": 0.00699, "frac_reward_zero_std": 0.0, "grad_norm": 0.12057773768901825, "kl": 0.20800644531846046, "learning_rate": 7.999959320734573e-06, "loss": -0.0987, "num_tokens": 22859342.0, "reward": -23.287694931030273, "reward_std": 28.00124740600586, "rewards/rollout_reward_func/mean": -23.287694931030273, "rewards/rollout_reward_func/std": 28.001243591308594, "sampling/importance_sampling_ratio/max": 1.3639763593673706, "sampling/importance_sampling_ratio/mean": 0.5999627113342285, "sampling/importance_sampling_ratio/min": 5.033487104810774e-05, "sampling/sampling_logp_difference/max": 1.7623025178909302, "sampling/sampling_logp_difference/mean": 0.34359127283096313, "step": 699, "step_time": 30.164292569999816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2056769132614136, "epoch": 0.007, "grad_norm": 0.1265331506729126, "kl": 0.20237909257411957, "learning_rate": 7.999959197929591e-06, "loss": -0.099, "step": 700, "step_time": 6.732131098999162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.53125, "completions/mean_terminated_length": 5.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.62521892786026, "epoch": 0.00701, "frac_reward_zero_std": 0.0, "grad_norm": 0.12159682810306549, "kl": 0.1639443151652813, "learning_rate": 7.999959074939526e-06, "loss": -0.0755, "num_tokens": 22941450.0, "reward": -28.583431243896484, "reward_std": 27.42955207824707, "rewards/rollout_reward_func/mean": -28.583431243896484, "rewards/rollout_reward_func/std": 27.429550170898438, "sampling/importance_sampling_ratio/max": 1.2092626094818115, "sampling/importance_sampling_ratio/mean": 0.427918404340744, "sampling/importance_sampling_ratio/min": 1.8017651655100053e-06, "sampling/sampling_logp_difference/max": 1.924176812171936, "sampling/sampling_logp_difference/mean": 0.37546253204345703, "step": 701, "step_time": 29.12667827000223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.631860077381134, "epoch": 0.00702, "grad_norm": 0.1276267170906067, "kl": 0.1568329520523548, "learning_rate": 7.999958951764375e-06, "loss": -0.0762, "step": 702, "step_time": 6.632828603000235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 10.125, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6155903041362762, "epoch": 0.00703, "frac_reward_zero_std": 0.0, "grad_norm": 0.1487264335155487, "kl": 0.1690497901290655, "learning_rate": 7.999958828404138e-06, "loss": -0.1082, "num_tokens": 23026183.0, "reward": -25.783958435058594, "reward_std": 28.922935485839844, "rewards/rollout_reward_func/mean": -25.783958435058594, "rewards/rollout_reward_func/std": 28.92293357849121, "sampling/importance_sampling_ratio/max": 1.3012794256210327, "sampling/importance_sampling_ratio/mean": 0.5065099000930786, "sampling/importance_sampling_ratio/min": 1.7301992727425386e-08, "sampling/sampling_logp_difference/max": 1.8792297840118408, "sampling/sampling_logp_difference/mean": 0.45636752247810364, "step": 703, "step_time": 30.514788102998864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.622409462928772, "epoch": 0.00704, "grad_norm": 0.15986573696136475, "kl": 0.16151129640638828, "learning_rate": 7.999958704858817e-06, "loss": -0.1094, "step": 704, "step_time": 5.998439626997424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 11.28125, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.056996762752533, "epoch": 0.00705, "frac_reward_zero_std": 0.0, "grad_norm": 0.19096902012825012, "kl": 0.17506340984255075, "learning_rate": 7.999958581128412e-06, "loss": -0.0838, "num_tokens": 23107836.0, "reward": -31.16791534423828, "reward_std": 27.559621810913086, "rewards/rollout_reward_func/mean": -31.16791534423828, "rewards/rollout_reward_func/std": 27.559621810913086, "sampling/importance_sampling_ratio/max": 1.1104172468185425, "sampling/importance_sampling_ratio/mean": 0.3223922550678253, "sampling/importance_sampling_ratio/min": 1.9394876815681528e-08, "sampling/sampling_logp_difference/max": 2.0891098976135254, "sampling/sampling_logp_difference/mean": 0.48569363355636597, "step": 705, "step_time": 28.276463534000868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0564162731170654, "epoch": 0.00706, "grad_norm": 0.1954774260520935, "kl": 0.17122199665755033, "learning_rate": 7.99995845721292e-06, "loss": -0.0832, "step": 706, "step_time": 6.0294251649993384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.6678802371025085, "epoch": 0.00707, "frac_reward_zero_std": 0.0, "grad_norm": 0.13952651619911194, "kl": 0.07565593160688877, "learning_rate": 7.999958333112344e-06, "loss": -0.0741, "num_tokens": 23189142.0, "reward": -39.09484100341797, "reward_std": 26.4124698638916, "rewards/rollout_reward_func/mean": -39.09484100341797, "rewards/rollout_reward_func/std": 26.41246795654297, "sampling/importance_sampling_ratio/max": 1.1828044652938843, "sampling/importance_sampling_ratio/mean": 0.21236345171928406, "sampling/importance_sampling_ratio/min": 4.5552475680388227e-10, "sampling/sampling_logp_difference/max": 2.558809757232666, "sampling/sampling_logp_difference/mean": 0.5435043573379517, "step": 707, "step_time": 26.82963888300219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6659578680992126, "epoch": 0.00708, "grad_norm": 0.13989780843257904, "kl": 0.07513414602726698, "learning_rate": 7.999958208826683e-06, "loss": -0.0747, "step": 708, "step_time": 6.173994332000802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.28125, "completions/mean_terminated_length": 5.18181848526001, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.084819197654724, "epoch": 0.00709, "frac_reward_zero_std": 0.0, "grad_norm": 0.1250787079334259, "kl": 0.1281341202557087, "learning_rate": 7.999958084355938e-06, "loss": -0.0744, "num_tokens": 23270409.0, "reward": -36.51646423339844, "reward_std": 25.973316192626953, "rewards/rollout_reward_func/mean": -36.51646423339844, "rewards/rollout_reward_func/std": 25.973316192626953, "sampling/importance_sampling_ratio/max": 1.238939642906189, "sampling/importance_sampling_ratio/mean": 0.3012285828590393, "sampling/importance_sampling_ratio/min": 2.2673956934227135e-08, "sampling/sampling_logp_difference/max": 2.31758975982666, "sampling/sampling_logp_difference/mean": 0.4712603688240051, "step": 709, "step_time": 26.909669652997763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0784096121788025, "epoch": 0.0071, "grad_norm": 0.12477295845746994, "kl": 0.12679754197597504, "learning_rate": 7.999957959700107e-06, "loss": -0.0748, "step": 710, "step_time": 6.525748928001121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 12.6875, "completions/mean_terminated_length": 4.222222328186035, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.4882434010505676, "epoch": 0.00711, "frac_reward_zero_std": 0.0, "grad_norm": 0.23015707731246948, "kl": 0.060922153294086456, "learning_rate": 7.999957834859192e-06, "loss": -0.0903, "num_tokens": 23351058.0, "reward": -33.25993347167969, "reward_std": 28.50149154663086, "rewards/rollout_reward_func/mean": -33.25993347167969, "rewards/rollout_reward_func/std": 28.501489639282227, "sampling/importance_sampling_ratio/max": 1.232781171798706, "sampling/importance_sampling_ratio/mean": 0.2663964629173279, "sampling/importance_sampling_ratio/min": 6.010739639350504e-07, "sampling/sampling_logp_difference/max": 2.2618000507354736, "sampling/sampling_logp_difference/mean": 0.5134680867195129, "step": 711, "step_time": 27.994859063999684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4837636947631836, "epoch": 0.00712, "grad_norm": 0.22723384201526642, "kl": 0.061427175998687744, "learning_rate": 7.999957709833192e-06, "loss": -0.0908, "step": 712, "step_time": 6.65419949399984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 11.59375, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.151287019252777, "epoch": 0.00713, "frac_reward_zero_std": 0.0, "grad_norm": 0.22014114260673523, "kl": 0.1036722082644701, "learning_rate": 7.999957584622105e-06, "loss": -0.1093, "num_tokens": 23432988.0, "reward": -32.64842224121094, "reward_std": 26.753055572509766, "rewards/rollout_reward_func/mean": -32.64842224121094, "rewards/rollout_reward_func/std": 26.753053665161133, "sampling/importance_sampling_ratio/max": 1.2974773645401, "sampling/importance_sampling_ratio/mean": 0.36422908306121826, "sampling/importance_sampling_ratio/min": 4.763609950941827e-09, "sampling/sampling_logp_difference/max": 2.810499906539917, "sampling/sampling_logp_difference/mean": 0.49025487899780273, "step": 713, "step_time": 28.394981458002803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.150398850440979, "epoch": 0.00714, "grad_norm": 0.22069329023361206, "kl": 0.10404465813189745, "learning_rate": 7.999957459225936e-06, "loss": -0.1105, "step": 714, "step_time": 6.1499935789961455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.0625, "completions/mean_terminated_length": 5.466667175292969, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9063318371772766, "epoch": 0.00715, "frac_reward_zero_std": 0.0, "grad_norm": 0.10309920459985733, "kl": 0.1594047248363495, "learning_rate": 7.99995733364468e-06, "loss": -0.106, "num_tokens": 23514514.0, "reward": -31.721012115478516, "reward_std": 29.270414352416992, "rewards/rollout_reward_func/mean": -31.721012115478516, "rewards/rollout_reward_func/std": 29.27041244506836, "sampling/importance_sampling_ratio/max": 1.5679633617401123, "sampling/importance_sampling_ratio/mean": 0.43282604217529297, "sampling/importance_sampling_ratio/min": 1.6305434655805584e-06, "sampling/sampling_logp_difference/max": 2.189652919769287, "sampling/sampling_logp_difference/mean": 0.44929996132850647, "step": 715, "step_time": 28.92631647500093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9057233333587646, "epoch": 0.00716, "grad_norm": 0.10810650140047073, "kl": 0.158232681453228, "learning_rate": 7.999957207878342e-06, "loss": -0.1068, "step": 716, "step_time": 5.830718507999336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.9375, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.410343289375305, "epoch": 0.00717, "frac_reward_zero_std": 0.0, "grad_norm": 0.08487287163734436, "kl": 0.06524367816746235, "learning_rate": 7.999957081926916e-06, "loss": -0.089, "num_tokens": 23591136.0, "reward": -38.125308990478516, "reward_std": 26.48923110961914, "rewards/rollout_reward_func/mean": -38.125308990478516, "rewards/rollout_reward_func/std": 26.48923110961914, "sampling/importance_sampling_ratio/max": 1.4378308057785034, "sampling/importance_sampling_ratio/mean": 0.25458332896232605, "sampling/importance_sampling_ratio/min": 5.14695130959808e-08, "sampling/sampling_logp_difference/max": 2.5006299018859863, "sampling/sampling_logp_difference/mean": 0.5028942823410034, "step": 717, "step_time": 26.728055224002674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.402925670146942, "epoch": 0.00718, "grad_norm": 0.08536799997091293, "kl": 0.0656681302934885, "learning_rate": 7.999956955790408e-06, "loss": -0.0892, "step": 718, "step_time": 5.8276274029994966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 13.03125, "completions/mean_terminated_length": 4.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9441152811050415, "epoch": 0.00719, "frac_reward_zero_std": 0.0, "grad_norm": 0.11753389239311218, "kl": 0.09965213388204575, "learning_rate": 7.999956829468814e-06, "loss": -0.0857, "num_tokens": 23670113.0, "reward": -37.49424743652344, "reward_std": 24.792455673217773, "rewards/rollout_reward_func/mean": -37.49424743652344, "rewards/rollout_reward_func/std": 24.792457580566406, "sampling/importance_sampling_ratio/max": 1.432978630065918, "sampling/importance_sampling_ratio/mean": 0.24074475467205048, "sampling/importance_sampling_ratio/min": 3.331597220324056e-10, "sampling/sampling_logp_difference/max": 2.6473846435546875, "sampling/sampling_logp_difference/mean": 0.4185101091861725, "step": 719, "step_time": 26.135692615000153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.937519669532776, "epoch": 0.0072, "grad_norm": 0.11728409677743912, "kl": 0.10522237606346607, "learning_rate": 7.999956702962134e-06, "loss": -0.0856, "step": 720, "step_time": 6.034680156999457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 13.25, "completions/mean_terminated_length": 6.222222328186035, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7707587480545044, "epoch": 0.00721, "frac_reward_zero_std": 0.0, "grad_norm": 0.15922501683235168, "kl": 0.08214247599244118, "learning_rate": 7.999956576270371e-06, "loss": -0.1099, "num_tokens": 23748460.0, "reward": -36.85615539550781, "reward_std": 25.115882873535156, "rewards/rollout_reward_func/mean": -36.85615539550781, "rewards/rollout_reward_func/std": 25.115882873535156, "sampling/importance_sampling_ratio/max": 1.2246125936508179, "sampling/importance_sampling_ratio/mean": 0.23061096668243408, "sampling/importance_sampling_ratio/min": 3.309217936475761e-05, "sampling/sampling_logp_difference/max": 2.418288230895996, "sampling/sampling_logp_difference/mean": 0.3775549530982971, "step": 721, "step_time": 27.04173336399799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7640638947486877, "epoch": 0.00722, "grad_norm": 0.15997901558876038, "kl": 0.08247592579573393, "learning_rate": 7.999956449393523e-06, "loss": -0.1105, "step": 722, "step_time": 6.762982006001039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.09375, "completions/mean_terminated_length": 5.666666507720947, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.549217402935028, "epoch": 0.00723, "frac_reward_zero_std": 0.0, "grad_norm": 0.1310255229473114, "kl": 0.06579677853733301, "learning_rate": 7.999956322331587e-06, "loss": -0.0802, "num_tokens": 23823865.0, "reward": -38.16793441772461, "reward_std": 27.47905731201172, "rewards/rollout_reward_func/mean": -38.16793441772461, "rewards/rollout_reward_func/std": 27.479055404663086, "sampling/importance_sampling_ratio/max": 1.293859601020813, "sampling/importance_sampling_ratio/mean": 0.21762077510356903, "sampling/importance_sampling_ratio/min": 5.408113423754912e-08, "sampling/sampling_logp_difference/max": 2.4321961402893066, "sampling/sampling_logp_difference/mean": 0.5121967792510986, "step": 723, "step_time": 27.508260448998044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.540254771709442, "epoch": 0.00724, "grad_norm": 0.1282912939786911, "kl": 0.06541509367525578, "learning_rate": 7.999956195084569e-06, "loss": -0.0804, "step": 724, "step_time": 6.124708457999077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.65625, "completions/mean_terminated_length": 5.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.064907968044281, "epoch": 0.00725, "frac_reward_zero_std": 0.0, "grad_norm": 0.10020873695611954, "kl": 0.08202253002673388, "learning_rate": 7.999956067652467e-06, "loss": -0.0767, "num_tokens": 23905930.0, "reward": -35.66494369506836, "reward_std": 26.382368087768555, "rewards/rollout_reward_func/mean": -35.66494369506836, "rewards/rollout_reward_func/std": 26.382366180419922, "sampling/importance_sampling_ratio/max": 1.3099277019500732, "sampling/importance_sampling_ratio/mean": 0.26700711250305176, "sampling/importance_sampling_ratio/min": 7.268305580510059e-06, "sampling/sampling_logp_difference/max": 1.9079786539077759, "sampling/sampling_logp_difference/mean": 0.45000550150871277, "step": 725, "step_time": 27.19313637399864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.053810477256775, "epoch": 0.00726, "grad_norm": 0.09791059792041779, "kl": 0.08418542426079512, "learning_rate": 7.99995594003528e-06, "loss": -0.077, "step": 726, "step_time": 6.468700592997266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.0625, "completions/mean_terminated_length": 5.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1671611070632935, "epoch": 0.00727, "frac_reward_zero_std": 0.0, "grad_norm": 0.11060139536857605, "kl": 0.055829391814768314, "learning_rate": 7.999955812233007e-06, "loss": -0.083, "num_tokens": 23984008.0, "reward": -38.88223648071289, "reward_std": 27.639404296875, "rewards/rollout_reward_func/mean": -38.88223648071289, "rewards/rollout_reward_func/std": 27.639402389526367, "sampling/importance_sampling_ratio/max": 1.52312171459198, "sampling/importance_sampling_ratio/mean": 0.2683320939540863, "sampling/importance_sampling_ratio/min": 4.72485851332749e-07, "sampling/sampling_logp_difference/max": 2.4392685890197754, "sampling/sampling_logp_difference/mean": 0.4322938621044159, "step": 727, "step_time": 27.164357245001156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1548901796340942, "epoch": 0.00728, "grad_norm": 0.10995956510305405, "kl": 0.05540188681334257, "learning_rate": 7.999955684245649e-06, "loss": -0.083, "step": 728, "step_time": 5.9646388459987065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.375, "completions/mean_terminated_length": 5.454545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0555374026298523, "epoch": 0.00729, "frac_reward_zero_std": 0.0, "grad_norm": 0.12805472314357758, "kl": 0.08104760572314262, "learning_rate": 7.999955556073208e-06, "loss": -0.1077, "num_tokens": 24063845.0, "reward": -37.74343490600586, "reward_std": 25.463642120361328, "rewards/rollout_reward_func/mean": -37.74343490600586, "rewards/rollout_reward_func/std": 25.463642120361328, "sampling/importance_sampling_ratio/max": 1.1733384132385254, "sampling/importance_sampling_ratio/mean": 0.3152740001678467, "sampling/importance_sampling_ratio/min": 6.623657355930845e-08, "sampling/sampling_logp_difference/max": 2.449904441833496, "sampling/sampling_logp_difference/mean": 0.45983731746673584, "step": 729, "step_time": 27.309563512999375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.048973500728607, "epoch": 0.0073, "grad_norm": 0.11977144330739975, "kl": 0.08140146173536777, "learning_rate": 7.999955427715682e-06, "loss": -0.108, "step": 730, "step_time": 6.464860223000869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 12.1875, "completions/mean_terminated_length": 4.909090995788574, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.683759033679962, "epoch": 0.00731, "frac_reward_zero_std": 0.0, "grad_norm": 0.1819240301847458, "kl": 0.1054200641810894, "learning_rate": 7.99995529917307e-06, "loss": -0.0873, "num_tokens": 24141844.0, "reward": -36.66279220581055, "reward_std": 31.261730194091797, "rewards/rollout_reward_func/mean": -36.66279220581055, "rewards/rollout_reward_func/std": 31.261728286743164, "sampling/importance_sampling_ratio/max": 1.1913563013076782, "sampling/importance_sampling_ratio/mean": 0.291876882314682, "sampling/importance_sampling_ratio/min": 3.291764369350858e-05, "sampling/sampling_logp_difference/max": 2.2814443111419678, "sampling/sampling_logp_difference/mean": 0.39751720428466797, "step": 731, "step_time": 28.584241147000284 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.009226190857589245, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009226190857589245, "entropy": 2.6801224648952484, "epoch": 0.00732, "grad_norm": 0.18497900664806366, "kl": 0.10541732236742973, "learning_rate": 7.999955170445374e-06, "loss": -0.0879, "step": 732, "step_time": 6.169828259997303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 11.8125, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.0280357599258423, "epoch": 0.00733, "frac_reward_zero_std": 0.0, "grad_norm": 0.16995389759540558, "kl": 0.11306098103523254, "learning_rate": 7.999955041532593e-06, "loss": -0.0927, "num_tokens": 24224199.0, "reward": -33.42877960205078, "reward_std": 24.915555953979492, "rewards/rollout_reward_func/mean": -33.42877960205078, "rewards/rollout_reward_func/std": 24.915555953979492, "sampling/importance_sampling_ratio/max": 1.3400120735168457, "sampling/importance_sampling_ratio/mean": 0.37180474400520325, "sampling/importance_sampling_ratio/min": 1.8901066312082548e-07, "sampling/sampling_logp_difference/max": 2.0781378746032715, "sampling/sampling_logp_difference/mean": 0.45732581615448, "step": 733, "step_time": 29.75053072100127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.025616943836212, "epoch": 0.00734, "grad_norm": 0.17683947086334229, "kl": 0.11334547214210033, "learning_rate": 7.999954912434727e-06, "loss": -0.093, "step": 734, "step_time": 6.619032370997957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.40625, "completions/mean_terminated_length": 6.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5879648327827454, "epoch": 0.00735, "frac_reward_zero_std": 0.0, "grad_norm": 0.13436146080493927, "kl": 0.2513965629041195, "learning_rate": 7.999954783151778e-06, "loss": -0.1052, "num_tokens": 24304185.0, "reward": -31.054054260253906, "reward_std": 27.80286407470703, "rewards/rollout_reward_func/mean": -31.054054260253906, "rewards/rollout_reward_func/std": 27.80286407470703, "sampling/importance_sampling_ratio/max": 1.6559221744537354, "sampling/importance_sampling_ratio/mean": 0.3828357458114624, "sampling/importance_sampling_ratio/min": 3.5513000966602704e-06, "sampling/sampling_logp_difference/max": 2.0670084953308105, "sampling/sampling_logp_difference/mean": 0.3863677680492401, "step": 735, "step_time": 26.785973012001705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.581213116645813, "epoch": 0.00736, "grad_norm": 0.13688236474990845, "kl": 0.2519144657999277, "learning_rate": 7.999954653683744e-06, "loss": -0.1057, "step": 736, "step_time": 5.933900966998408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.9375, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.949440121650696, "epoch": 0.00737, "frac_reward_zero_std": 0.0, "grad_norm": 0.15714691579341888, "kl": 0.10834148433059454, "learning_rate": 7.999954524030623e-06, "loss": -0.1116, "num_tokens": 24382793.0, "reward": -34.607208251953125, "reward_std": 26.461772918701172, "rewards/rollout_reward_func/mean": -34.607208251953125, "rewards/rollout_reward_func/std": 26.461774826049805, "sampling/importance_sampling_ratio/max": 1.622229814529419, "sampling/importance_sampling_ratio/mean": 0.3401077091693878, "sampling/importance_sampling_ratio/min": 1.4989401719844864e-08, "sampling/sampling_logp_difference/max": 2.2334513664245605, "sampling/sampling_logp_difference/mean": 0.4268631339073181, "step": 737, "step_time": 28.473672910999085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9491653442382812, "epoch": 0.00738, "grad_norm": 0.1631643921136856, "kl": 0.10628770291805267, "learning_rate": 7.999954394192419e-06, "loss": -0.1126, "step": 738, "step_time": 5.856454628001302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.96875, "completions/mean_terminated_length": 5.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.332017660140991, "epoch": 0.00739, "frac_reward_zero_std": 0.0, "grad_norm": 0.17744246125221252, "kl": 0.38080773036926985, "learning_rate": 7.999954264169131e-06, "loss": -0.1016, "num_tokens": 24464219.0, "reward": -31.456443786621094, "reward_std": 29.560192108154297, "rewards/rollout_reward_func/mean": -31.456443786621094, "rewards/rollout_reward_func/std": 29.560190200805664, "sampling/importance_sampling_ratio/max": 1.3068149089813232, "sampling/importance_sampling_ratio/mean": 0.3675973117351532, "sampling/importance_sampling_ratio/min": 1.3368293139137677e-07, "sampling/sampling_logp_difference/max": 2.053596258163452, "sampling/sampling_logp_difference/mean": 0.3908209800720215, "step": 739, "step_time": 27.81400954899982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.331087827682495, "epoch": 0.0074, "grad_norm": 0.18291567265987396, "kl": 0.3602176606655121, "learning_rate": 7.999954133960758e-06, "loss": -0.1017, "step": 740, "step_time": 6.408517247999043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 10.9375, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4977830946445465, "epoch": 0.00741, "frac_reward_zero_std": 0.0, "grad_norm": 0.12568263709545135, "kl": 0.1319530988112092, "learning_rate": 7.999954003567298e-06, "loss": -0.103, "num_tokens": 24545876.0, "reward": -28.51095962524414, "reward_std": 27.642688751220703, "rewards/rollout_reward_func/mean": -28.51095962524414, "rewards/rollout_reward_func/std": 27.642688751220703, "sampling/importance_sampling_ratio/max": 1.4325507879257202, "sampling/importance_sampling_ratio/mean": 0.4777250587940216, "sampling/importance_sampling_ratio/min": 6.130466090326081e-07, "sampling/sampling_logp_difference/max": 2.0118825435638428, "sampling/sampling_logp_difference/mean": 0.38667041063308716, "step": 741, "step_time": 29.015993031000107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4953399300575256, "epoch": 0.00742, "grad_norm": 0.13312776386737823, "kl": 0.13244789000600576, "learning_rate": 7.999953872988757e-06, "loss": -0.1034, "step": 742, "step_time": 6.788761081001212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.96875, "completions/mean_terminated_length": 5.277777671813965, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2552801966667175, "epoch": 0.00743, "frac_reward_zero_std": 0.0, "grad_norm": 0.10594271123409271, "kl": 0.36337149888277054, "learning_rate": 7.999953742225128e-06, "loss": -0.096, "num_tokens": 24628404.0, "reward": -26.790470123291016, "reward_std": 28.222536087036133, "rewards/rollout_reward_func/mean": -26.790470123291016, "rewards/rollout_reward_func/std": 28.222536087036133, "sampling/importance_sampling_ratio/max": 1.3586755990982056, "sampling/importance_sampling_ratio/mean": 0.5038159489631653, "sampling/importance_sampling_ratio/min": 7.725420858584187e-10, "sampling/sampling_logp_difference/max": 2.0893962383270264, "sampling/sampling_logp_difference/mean": 0.37342432141304016, "step": 743, "step_time": 29.141348549999748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.25330314040184, "epoch": 0.00744, "grad_norm": 0.1052260771393776, "kl": 0.35727759450674057, "learning_rate": 7.999953611276415e-06, "loss": -0.0961, "step": 744, "step_time": 6.594202492000477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 11.3125, "completions/mean_terminated_length": 5.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6382197439670563, "epoch": 0.00745, "frac_reward_zero_std": 0.0, "grad_norm": 0.14605456590652466, "kl": 0.13572184881195426, "learning_rate": 7.99995348014262e-06, "loss": -0.1047, "num_tokens": 24706582.0, "reward": -33.43541717529297, "reward_std": 27.08253288269043, "rewards/rollout_reward_func/mean": -33.43541717529297, "rewards/rollout_reward_func/std": 27.08253288269043, "sampling/importance_sampling_ratio/max": 1.2877367734909058, "sampling/importance_sampling_ratio/mean": 0.43177324533462524, "sampling/importance_sampling_ratio/min": 5.519350532168232e-10, "sampling/sampling_logp_difference/max": 2.2674360275268555, "sampling/sampling_logp_difference/mean": 0.4560094177722931, "step": 745, "step_time": 29.24477555500016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.637644499540329, "epoch": 0.00746, "grad_norm": 0.15251599252223969, "kl": 0.1363961542956531, "learning_rate": 7.999953348823737e-06, "loss": -0.1056, "step": 746, "step_time": 6.359021997997843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 5.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2415659427642822, "epoch": 0.00747, "frac_reward_zero_std": 0.0, "grad_norm": 0.11793117225170135, "kl": 0.1803071517497301, "learning_rate": 7.999953217319772e-06, "loss": -0.1084, "num_tokens": 24791398.0, "reward": -28.007827758789062, "reward_std": 25.199941635131836, "rewards/rollout_reward_func/mean": -28.007827758789062, "rewards/rollout_reward_func/std": 25.199941635131836, "sampling/importance_sampling_ratio/max": 1.2761839628219604, "sampling/importance_sampling_ratio/mean": 0.568932294845581, "sampling/importance_sampling_ratio/min": 1.8566895576554998e-08, "sampling/sampling_logp_difference/max": 3.2202107906341553, "sampling/sampling_logp_difference/mean": 0.3650333881378174, "step": 747, "step_time": 30.947955097997692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.240657687187195, "epoch": 0.00748, "grad_norm": 0.11918492615222931, "kl": 0.18323007598519325, "learning_rate": 7.999953085630722e-06, "loss": -0.1085, "step": 748, "step_time": 6.0212734150027245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.68570077419281, "epoch": 0.00749, "frac_reward_zero_std": 0.0, "grad_norm": 0.22413665056228638, "kl": 0.7336503677070141, "learning_rate": 7.999952953756588e-06, "loss": -0.0804, "num_tokens": 24875763.0, "reward": -32.890140533447266, "reward_std": 25.539997100830078, "rewards/rollout_reward_func/mean": -32.890140533447266, "rewards/rollout_reward_func/std": 25.539997100830078, "sampling/importance_sampling_ratio/max": 1.401129126548767, "sampling/importance_sampling_ratio/mean": 0.3115527927875519, "sampling/importance_sampling_ratio/min": 2.9409727630991256e-07, "sampling/sampling_logp_difference/max": 2.5651841163635254, "sampling/sampling_logp_difference/mean": 0.40495598316192627, "step": 749, "step_time": 29.245865975000925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.689805746078491, "epoch": 0.0075, "grad_norm": 0.21856696903705597, "kl": 0.6402356754988432, "learning_rate": 7.999952821697368e-06, "loss": -0.0812, "step": 750, "step_time": 6.18120958200052 } ], "logging_steps": 1.0, "max_steps": 400000, "num_input_tokens_seen": 24875763, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }