{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.015, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.9375, "completions/mean_terminated_length": 7.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.2894787192344666, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0953264907002449, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0155, "num_tokens": 43135.0, "reward": -0.17697054147720337, "reward_std": 0.861113429069519, "rewards/rollout_reward_func/mean": -0.17697054147720337, "rewards/rollout_reward_func/std": 0.8611133694648743, "sampling/importance_sampling_ratio/max": 0.4261567294597626, "sampling/importance_sampling_ratio/mean": 0.07546356320381165, "sampling/importance_sampling_ratio/min": 5.603598030035073e-09, "sampling/sampling_logp_difference/max": 2.6285929679870605, "sampling/sampling_logp_difference/mean": 0.679397463798523, "step": 1, "step_time": 16.132317998097278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.2894787192344666, "epoch": 2e-05, "grad_norm": 0.09508741647005081, "kl": 0.0, "learning_rate": 2.2857142857142855e-07, "loss": -0.0155, "step": 2, "step_time": 8.105621085007442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.8125, "completions/mean_terminated_length": 7.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.977905511856079, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.13508382439613342, "kl": 0.00028613330505322665, "learning_rate": 4.571428571428571e-07, "loss": -0.0203, "num_tokens": 85197.0, "reward": 0.14988508820533752, "reward_std": 0.9003480672836304, "rewards/rollout_reward_func/mean": 0.14988508820533752, "rewards/rollout_reward_func/std": 0.9003480076789856, "sampling/importance_sampling_ratio/max": 0.4181932508945465, "sampling/importance_sampling_ratio/mean": 0.13435205817222595, "sampling/importance_sampling_ratio/min": 5.857365437123008e-08, "sampling/sampling_logp_difference/max": 2.123720407485962, "sampling/sampling_logp_difference/mean": 0.5970968008041382, "step": 3, "step_time": 15.408900799026014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.984070986509323, "epoch": 4e-05, "grad_norm": 0.13495057821273804, "kl": 0.00029379685474850703, "learning_rate": 6.857142857142857e-07, "loss": -0.0202, "step": 4, "step_time": 9.480604179989314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.71875, "completions/mean_terminated_length": 7.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.367568552494049, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.09918283671140671, "kl": 0.00032783812639536336, "learning_rate": 9.142857142857142e-07, "loss": -0.0155, "num_tokens": 128680.0, "reward": -0.19373467564582825, "reward_std": 0.858532726764679, "rewards/rollout_reward_func/mean": -0.19373467564582825, "rewards/rollout_reward_func/std": 0.858532726764679, "sampling/importance_sampling_ratio/max": 0.40376797318458557, "sampling/importance_sampling_ratio/mean": 0.07722157984972, "sampling/importance_sampling_ratio/min": 3.0040661624930465e-10, "sampling/sampling_logp_difference/max": 2.581655263900757, "sampling/sampling_logp_difference/mean": 0.6826087236404419, "step": 5, "step_time": 15.77188739401754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.367772221565247, "epoch": 6e-05, "grad_norm": 0.09837891906499863, "kl": 0.00025927361093636137, "learning_rate": 1.1428571428571428e-06, "loss": -0.0155, "step": 6, "step_time": 8.14989799101022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 6.40000057220459, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.547019720077515, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0895848274230957, "kl": 0.0003025367986992933, "learning_rate": 1.3714285714285715e-06, "loss": -0.0146, "num_tokens": 171897.0, "reward": -0.15008971095085144, "reward_std": 0.805182933807373, "rewards/rollout_reward_func/mean": -0.15008971095085144, "rewards/rollout_reward_func/std": 0.805182933807373, "sampling/importance_sampling_ratio/max": 0.35022103786468506, "sampling/importance_sampling_ratio/mean": 0.06185203790664673, "sampling/importance_sampling_ratio/min": 4.046498899845652e-11, "sampling/sampling_logp_difference/max": 2.6546506881713867, "sampling/sampling_logp_difference/mean": 0.7794643640518188, "step": 7, "step_time": 15.545058936957503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.549445152282715, "epoch": 8e-05, "grad_norm": 0.08728455752134323, "kl": 0.00027346282331564, "learning_rate": 1.6e-06, "loss": -0.0146, "step": 8, "step_time": 8.106913096999051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.0625, "completions/mean_terminated_length": 5.44444465637207, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.107187896966934, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.15525685250759125, "kl": 0.00023710494770057267, "learning_rate": 1.8285714285714284e-06, "loss": -0.0236, "num_tokens": 215469.0, "reward": 0.04051198065280914, "reward_std": 0.8969102501869202, "rewards/rollout_reward_func/mean": 0.04051198065280914, "rewards/rollout_reward_func/std": 0.8969102501869202, "sampling/importance_sampling_ratio/max": 0.4193938076496124, "sampling/importance_sampling_ratio/mean": 0.13912013173103333, "sampling/importance_sampling_ratio/min": 1.0476339934450962e-10, "sampling/sampling_logp_difference/max": 2.3024039268493652, "sampling/sampling_logp_difference/mean": 0.6489021182060242, "step": 9, "step_time": 16.79810807094327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.101956337690353, "epoch": 0.0001, "grad_norm": 0.15447907149791718, "kl": 0.00028152009326731786, "learning_rate": 2.057142857142857e-06, "loss": -0.0238, "step": 10, "step_time": 8.142225561954547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.15625, "completions/mean_terminated_length": 9.166666984558105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.004406154155731, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.07993295788764954, "kl": 0.0003179481682309415, "learning_rate": 2.2857142857142856e-06, "loss": -0.0142, "num_tokens": 258416.0, "reward": -0.015168089419603348, "reward_std": 0.838731050491333, "rewards/rollout_reward_func/mean": -0.015168089419603348, "rewards/rollout_reward_func/std": 0.838731050491333, "sampling/importance_sampling_ratio/max": 0.40848639607429504, "sampling/importance_sampling_ratio/mean": 0.0640551745891571, "sampling/importance_sampling_ratio/min": 4.009795873116673e-08, "sampling/sampling_logp_difference/max": 2.415602922439575, "sampling/sampling_logp_difference/mean": 0.5970046520233154, "step": 11, "step_time": 15.855879417038523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.004289656877518, "epoch": 0.00012, "grad_norm": 0.079463891685009, "kl": 0.0004016714228782803, "learning_rate": 2.5142857142857142e-06, "loss": -0.0142, "step": 12, "step_time": 8.16768709506141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 11.8125, "completions/mean_terminated_length": 5.692307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.127161264419556, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 0.1086171567440033, "kl": 0.0006401330319931731, "learning_rate": 2.742857142857143e-06, "loss": -0.0169, "num_tokens": 297747.0, "reward": -0.26871758699417114, "reward_std": 0.853215217590332, "rewards/rollout_reward_func/mean": -0.26871758699417114, "rewards/rollout_reward_func/std": 0.8532151579856873, "sampling/importance_sampling_ratio/max": 0.42569461464881897, "sampling/importance_sampling_ratio/mean": 0.08216315507888794, "sampling/importance_sampling_ratio/min": 3.917125468433369e-08, "sampling/sampling_logp_difference/max": 2.2928214073181152, "sampling/sampling_logp_difference/mean": 0.6310036778450012, "step": 13, "step_time": 15.504144847975112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.127153098583221, "epoch": 0.00014, "grad_norm": 0.10953257232904434, "kl": 0.0008612029159849044, "learning_rate": 2.9714285714285716e-06, "loss": -0.0171, "step": 14, "step_time": 9.572727330989437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.21875, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.1135295033454895, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.0558856837451458, "kl": 0.00140177778666839, "learning_rate": 3.2e-06, "loss": -0.0045, "num_tokens": 341444.0, "reward": 0.4615798592567444, "reward_std": 0.6355739831924438, "rewards/rollout_reward_func/mean": 0.4615798592567444, "rewards/rollout_reward_func/std": 0.6355739235877991, "sampling/importance_sampling_ratio/max": 0.41525131464004517, "sampling/importance_sampling_ratio/mean": 0.1273254007101059, "sampling/importance_sampling_ratio/min": 2.3815482919076203e-10, "sampling/sampling_logp_difference/max": 2.3411736488342285, "sampling/sampling_logp_difference/mean": 0.6654903888702393, "step": 15, "step_time": 15.453662687941687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.102516859769821, "epoch": 0.00016, "grad_norm": 0.05710635706782341, "kl": 0.0020717958686873317, "learning_rate": 3.428571428571428e-06, "loss": -0.0044, "step": 16, "step_time": 8.140518494037678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 6.454545497894287, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.122020095586777, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.14883020520210266, "kl": 0.0026931983011309057, "learning_rate": 3.657142857142857e-06, "loss": -0.028, "num_tokens": 381819.0, "reward": -0.024414777755737305, "reward_std": 0.9009594321250916, "rewards/rollout_reward_func/mean": -0.024414777755737305, "rewards/rollout_reward_func/std": 0.9009594321250916, "sampling/importance_sampling_ratio/max": 0.49133816361427307, "sampling/importance_sampling_ratio/mean": 0.16813601553440094, "sampling/importance_sampling_ratio/min": 1.4057838448877646e-08, "sampling/sampling_logp_difference/max": 2.9047303199768066, "sampling/sampling_logp_difference/mean": 0.6671483516693115, "step": 17, "step_time": 15.19698279997101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.102741211652756, "epoch": 0.00018, "grad_norm": 0.1501336693763733, "kl": 0.0037085008807480335, "learning_rate": 3.885714285714286e-06, "loss": -0.0284, "step": 18, "step_time": 8.070774594991235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 5.259259223937988, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.6760843992233276, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.0779130682349205, "kl": 0.006111533439252526, "learning_rate": 4.114285714285714e-06, "loss": -0.0086, "num_tokens": 423124.0, "reward": 0.3379635214805603, "reward_std": 0.78679358959198, "rewards/rollout_reward_func/mean": 0.3379635214805603, "rewards/rollout_reward_func/std": 0.78679358959198, "sampling/importance_sampling_ratio/max": 0.5058296322822571, "sampling/importance_sampling_ratio/mean": 0.2171005755662918, "sampling/importance_sampling_ratio/min": 1.9234468595641374e-07, "sampling/sampling_logp_difference/max": 2.401179313659668, "sampling/sampling_logp_difference/mean": 0.5321379899978638, "step": 19, "step_time": 15.80787633502041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6499961018562317, "epoch": 0.0002, "grad_norm": 0.07810136675834656, "kl": 0.009475538274273276, "learning_rate": 4.342857142857142e-06, "loss": -0.0093, "step": 20, "step_time": 8.134666830010246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.96875, "completions/mean_terminated_length": 8.279999732971191, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.0525446236133575, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.08680616319179535, "kl": 0.010767699801363051, "learning_rate": 4.571428571428571e-06, "loss": -0.0182, "num_tokens": 464763.0, "reward": -0.1820240318775177, "reward_std": 0.6879092454910278, "rewards/rollout_reward_func/mean": -0.1820240318775177, "rewards/rollout_reward_func/std": 0.6879092454910278, "sampling/importance_sampling_ratio/max": 0.5314440131187439, "sampling/importance_sampling_ratio/mean": 0.14910827577114105, "sampling/importance_sampling_ratio/min": 1.96008459463215e-10, "sampling/sampling_logp_difference/max": 2.341249704360962, "sampling/sampling_logp_difference/mean": 0.6364729404449463, "step": 21, "step_time": 15.351485627965303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.02737694978714, "epoch": 0.00022, "grad_norm": 0.08582854270935059, "kl": 0.014415889978408813, "learning_rate": 4.8e-06, "loss": -0.0186, "step": 22, "step_time": 8.087962069024798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.78125, "completions/mean_terminated_length": 5.526315689086914, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.8478090167045593, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 0.1653035432100296, "kl": 0.02179299236740917, "learning_rate": 5.0285714285714285e-06, "loss": -0.0269, "num_tokens": 508524.0, "reward": 0.23664569854736328, "reward_std": 0.8107379078865051, "rewards/rollout_reward_func/mean": 0.23664569854736328, "rewards/rollout_reward_func/std": 0.8107379078865051, "sampling/importance_sampling_ratio/max": 0.6584811806678772, "sampling/importance_sampling_ratio/mean": 0.2032453417778015, "sampling/importance_sampling_ratio/min": 1.2068276378229825e-09, "sampling/sampling_logp_difference/max": 2.359328508377075, "sampling/sampling_logp_difference/mean": 0.634738564491272, "step": 23, "step_time": 15.403693512984319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 3.8219780027866364, "epoch": 0.00024, "grad_norm": 0.15976862609386444, "kl": 0.03011758206412196, "learning_rate": 5.257142857142857e-06, "loss": -0.0278, "step": 24, "step_time": 9.118668988987338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 5.115384578704834, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.537194162607193, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 0.14320753514766693, "kl": 0.042146688560023904, "learning_rate": 5.485714285714286e-06, "loss": -0.028, "num_tokens": 552600.0, "reward": 0.507192075252533, "reward_std": 0.6834948658943176, "rewards/rollout_reward_func/mean": 0.507192075252533, "rewards/rollout_reward_func/std": 0.6834948658943176, "sampling/importance_sampling_ratio/max": 0.8172290325164795, "sampling/importance_sampling_ratio/mean": 0.32328933477401733, "sampling/importance_sampling_ratio/min": 5.263423474843876e-08, "sampling/sampling_logp_difference/max": 2.5534415245056152, "sampling/sampling_logp_difference/mean": 0.5461617708206177, "step": 25, "step_time": 15.011758435954107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.480475276708603, "epoch": 0.00026, "grad_norm": 0.1368740350008011, "kl": 0.05293749738484621, "learning_rate": 5.7142857142857145e-06, "loss": -0.0292, "step": 26, "step_time": 8.177773717994569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 5.038461685180664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.3318563997745514, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 0.18784494698047638, "kl": 0.0867075938731432, "learning_rate": 5.942857142857143e-06, "loss": -0.0364, "num_tokens": 595614.0, "reward": 0.28337404131889343, "reward_std": 0.8230791687965393, "rewards/rollout_reward_func/mean": 0.28337404131889343, "rewards/rollout_reward_func/std": 0.8230791687965393, "sampling/importance_sampling_ratio/max": 0.8728322386741638, "sampling/importance_sampling_ratio/mean": 0.39572715759277344, "sampling/importance_sampling_ratio/min": 3.022447572220699e-07, "sampling/sampling_logp_difference/max": 1.7447319030761719, "sampling/sampling_logp_difference/mean": 0.5443941950798035, "step": 27, "step_time": 14.795111182960682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018522267695516348, "clip_ratio/low_min": 0.0062500000931322575, "clip_ratio/region_mean": 0.018522267695516348, "entropy": 3.234829932451248, "epoch": 0.00028, "grad_norm": 0.18382349610328674, "kl": 0.12804493866860867, "learning_rate": 6.171428571428571e-06, "loss": -0.0381, "step": 28, "step_time": 8.590553791960701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 5.428571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.21248596906662, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.10743711143732071, "kl": 0.1499989740550518, "learning_rate": 6.4e-06, "loss": -0.0314, "num_tokens": 636936.0, "reward": 0.33194947242736816, "reward_std": 0.8168965578079224, "rewards/rollout_reward_func/mean": 0.33194947242736816, "rewards/rollout_reward_func/std": 0.8168965578079224, "sampling/importance_sampling_ratio/max": 0.8636133670806885, "sampling/importance_sampling_ratio/mean": 0.36560770869255066, "sampling/importance_sampling_ratio/min": 5.460589314054687e-09, "sampling/sampling_logp_difference/max": 2.101886749267578, "sampling/sampling_logp_difference/mean": 0.5482168197631836, "step": 29, "step_time": 14.755556557996897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0310581149533391, "clip_ratio/low_min": 0.0062500000931322575, "clip_ratio/region_mean": 0.0310581149533391, "entropy": 3.0934344232082367, "epoch": 0.0003, "grad_norm": 0.11123989522457123, "kl": 0.2602653671056032, "learning_rate": 6.628571428571428e-06, "loss": -0.0323, "step": 30, "step_time": 8.12839046298177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 5.607142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.716022312641144, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 0.09614620357751846, "kl": 0.28115653339773417, "learning_rate": 6.857142857142856e-06, "loss": -0.0129, "num_tokens": 675692.0, "reward": -0.07067897915840149, "reward_std": 0.7836520671844482, "rewards/rollout_reward_func/mean": -0.07067897915840149, "rewards/rollout_reward_func/std": 0.7836520075798035, "sampling/importance_sampling_ratio/max": 1.1859302520751953, "sampling/importance_sampling_ratio/mean": 0.4444946348667145, "sampling/importance_sampling_ratio/min": 4.190356321487343e-06, "sampling/sampling_logp_difference/max": 1.941929817199707, "sampling/sampling_logp_difference/mean": 0.43936848640441895, "step": 31, "step_time": 14.674396738031646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.020571220433339477, "clip_ratio/low_min": 0.013194444589316845, "clip_ratio/region_mean": 0.020571220433339477, "entropy": 2.639510542154312, "epoch": 0.00032, "grad_norm": 0.11449538171291351, "kl": 0.39109013229608536, "learning_rate": 7.085714285714285e-06, "loss": -0.0126, "step": 32, "step_time": 8.085413199994946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.633333683013916, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3097386211156845, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.15744425356388092, "kl": 0.42347080912441015, "learning_rate": 7.314285714285714e-06, "loss": -0.0196, "num_tokens": 716866.0, "reward": 0.5619326829910278, "reward_std": 0.6515778303146362, "rewards/rollout_reward_func/mean": 0.5619326829910278, "rewards/rollout_reward_func/std": 0.6515778303146362, "sampling/importance_sampling_ratio/max": 1.1215016841888428, "sampling/importance_sampling_ratio/mean": 0.579840362071991, "sampling/importance_sampling_ratio/min": 1.2757817557940143e-07, "sampling/sampling_logp_difference/max": 2.749905586242676, "sampling/sampling_logp_difference/mean": 0.40168237686157227, "step": 33, "step_time": 15.194808635947993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2394297122955322, "epoch": 0.00034, "grad_norm": 0.1580081284046173, "kl": 0.4619623301550746, "learning_rate": 7.542857142857142e-06, "loss": -0.0199, "step": 34, "step_time": 8.109537537005963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.965517044067383, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1726759523153305, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 0.25543302297592163, "kl": 0.23337720148265362, "learning_rate": 7.771428571428572e-06, "loss": -0.0364, "num_tokens": 757485.0, "reward": 0.19854119420051575, "reward_std": 0.8590704798698425, "rewards/rollout_reward_func/mean": 0.19854119420051575, "rewards/rollout_reward_func/std": 0.8590704798698425, "sampling/importance_sampling_ratio/max": 1.1312776803970337, "sampling/importance_sampling_ratio/mean": 0.6760069131851196, "sampling/importance_sampling_ratio/min": 3.5621442293631844e-06, "sampling/sampling_logp_difference/max": 1.7979652881622314, "sampling/sampling_logp_difference/mean": 0.3434531092643738, "step": 35, "step_time": 14.578261517017381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.117391064763069, "epoch": 0.00036, "grad_norm": 0.2597109079360962, "kl": 0.2485052878037095, "learning_rate": 8e-06, "loss": -0.038, "step": 36, "step_time": 8.164077249035472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9325679689645767, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 0.18088729679584503, "kl": 0.3404070157557726, "learning_rate": 7.99999999962976e-06, "loss": -0.042, "num_tokens": 799523.0, "reward": 0.3919771909713745, "reward_std": 0.7183950543403625, "rewards/rollout_reward_func/mean": 0.3919771909713745, "rewards/rollout_reward_func/std": 0.7183949947357178, "sampling/importance_sampling_ratio/max": 1.2424436807632446, "sampling/importance_sampling_ratio/mean": 0.6991925239562988, "sampling/importance_sampling_ratio/min": 6.988130518692515e-09, "sampling/sampling_logp_difference/max": 2.2108943462371826, "sampling/sampling_logp_difference/mean": 0.3431573212146759, "step": 37, "step_time": 14.446563039004104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.8704033493995667, "epoch": 0.00038, "grad_norm": 0.19103175401687622, "kl": 0.36626406386494637, "learning_rate": 7.99999999851904e-06, "loss": -0.0436, "step": 38, "step_time": 9.243919375003316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.65625, "completions/mean_terminated_length": 4.586206912994385, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8400151878595352, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 0.2215472310781479, "kl": 0.6260720230638981, "learning_rate": 7.999999996667841e-06, "loss": -0.0328, "num_tokens": 837892.0, "reward": 0.16802485287189484, "reward_std": 0.7254509329795837, "rewards/rollout_reward_func/mean": 0.16802485287189484, "rewards/rollout_reward_func/std": 0.7254509925842285, "sampling/importance_sampling_ratio/max": 1.4188719987869263, "sampling/importance_sampling_ratio/mean": 0.737035870552063, "sampling/importance_sampling_ratio/min": 3.908744261593711e-08, "sampling/sampling_logp_difference/max": 2.0449235439300537, "sampling/sampling_logp_difference/mean": 0.3759535551071167, "step": 39, "step_time": 14.265431025036378 }, { "clip_ratio/high_max": 0.03750000009313226, "clip_ratio/high_mean": 0.01875000004656613, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01875000004656613, "entropy": 1.781663864850998, "epoch": 0.0004, "grad_norm": 0.14845974743366241, "kl": 0.6737813502550125, "learning_rate": 7.999999994076165e-06, "loss": -0.0341, "step": 40, "step_time": 8.011114727996755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1715133488178253, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 0.29717785120010376, "kl": 0.49516336247324944, "learning_rate": 7.999999990744006e-06, "loss": -0.0741, "num_tokens": 876529.0, "reward": 0.24639323353767395, "reward_std": 0.9247351884841919, "rewards/rollout_reward_func/mean": 0.24639323353767395, "rewards/rollout_reward_func/std": 0.9247351884841919, "sampling/importance_sampling_ratio/max": 1.5942364931106567, "sampling/importance_sampling_ratio/mean": 0.7607496380805969, "sampling/importance_sampling_ratio/min": 4.1131436034902435e-09, "sampling/sampling_logp_difference/max": 2.7251389026641846, "sampling/sampling_logp_difference/mean": 0.5329572558403015, "step": 41, "step_time": 14.373580445011612 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.1407160460948944, "epoch": 0.00042, "grad_norm": 0.17200890183448792, "kl": 0.5236840397119522, "learning_rate": 7.999999986671369e-06, "loss": -0.0766, "step": 42, "step_time": 8.068840801948681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.241379261016846, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5658911243081093, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.22166626155376434, "kl": 0.4243749063462019, "learning_rate": 7.999999981858253e-06, "loss": -0.0359, "num_tokens": 919156.0, "reward": 0.2086961418390274, "reward_std": 0.7886365652084351, "rewards/rollout_reward_func/mean": 0.2086961418390274, "rewards/rollout_reward_func/std": 0.7886365652084351, "sampling/importance_sampling_ratio/max": 1.4361516237258911, "sampling/importance_sampling_ratio/mean": 0.8247853517532349, "sampling/importance_sampling_ratio/min": 1.7117969036917202e-05, "sampling/sampling_logp_difference/max": 1.919467806816101, "sampling/sampling_logp_difference/mean": 0.33899468183517456, "step": 43, "step_time": 15.5218665379507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5204302445054054, "epoch": 0.00044, "grad_norm": 0.2448151856660843, "kl": 0.4176702704280615, "learning_rate": 7.999999976304658e-06, "loss": -0.0368, "step": 44, "step_time": 8.108220500987954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9982767701148987, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.16748252511024475, "kl": 0.6673126071691513, "learning_rate": 7.999999970010581e-06, "loss": -0.0503, "num_tokens": 961121.0, "reward": 0.4735022187232971, "reward_std": 0.805591344833374, "rewards/rollout_reward_func/mean": 0.4735022187232971, "rewards/rollout_reward_func/std": 0.805591344833374, "sampling/importance_sampling_ratio/max": 1.8357526063919067, "sampling/importance_sampling_ratio/mean": 0.6827009916305542, "sampling/importance_sampling_ratio/min": 5.607939215224178e-07, "sampling/sampling_logp_difference/max": 2.500825881958008, "sampling/sampling_logp_difference/mean": 0.4473329484462738, "step": 45, "step_time": 14.753170590964146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.9727061986923218, "epoch": 0.00046, "grad_norm": 0.14206217229366302, "kl": 0.6520557757467031, "learning_rate": 7.999999962976027e-06, "loss": -0.0519, "step": 46, "step_time": 8.184364435030147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.125, "completions/mean_terminated_length": 3.7419352531433105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0805913805961609, "epoch": 0.00047, "frac_reward_zero_std": 0.25, "grad_norm": 0.22552913427352905, "kl": 0.4073444213718176, "learning_rate": 7.999999955200991e-06, "loss": -0.036, "num_tokens": 1004726.0, "reward": 0.6457632780075073, "reward_std": 0.6948646306991577, "rewards/rollout_reward_func/mean": 0.6457632780075073, "rewards/rollout_reward_func/std": 0.6948646903038025, "sampling/importance_sampling_ratio/max": 1.7613990306854248, "sampling/importance_sampling_ratio/mean": 1.0503935813903809, "sampling/importance_sampling_ratio/min": 4.6536410991393495e-06, "sampling/sampling_logp_difference/max": 1.6791272163391113, "sampling/sampling_logp_difference/mean": 0.2823474407196045, "step": 47, "step_time": 14.251372860919219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.0536294281482697, "epoch": 0.00048, "grad_norm": 0.22505633533000946, "kl": 0.4229491353034973, "learning_rate": 7.999999946685478e-06, "loss": -0.0377, "step": 48, "step_time": 9.215806047024671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.964285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0663984268903732, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 0.13727501034736633, "kl": 0.4819028079509735, "learning_rate": 7.999999937429484e-06, "loss": -0.078, "num_tokens": 1046488.0, "reward": 0.34778380393981934, "reward_std": 0.7539032697677612, "rewards/rollout_reward_func/mean": 0.34778380393981934, "rewards/rollout_reward_func/std": 0.7539032101631165, "sampling/importance_sampling_ratio/max": 1.7389047145843506, "sampling/importance_sampling_ratio/mean": 0.7528799176216125, "sampling/importance_sampling_ratio/min": 2.533247922542614e-08, "sampling/sampling_logp_difference/max": 2.200942277908325, "sampling/sampling_logp_difference/mean": 0.522196888923645, "step": 49, "step_time": 14.718283370981226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.019866071874275804, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019866071874275804, "entropy": 2.0433158949017525, "epoch": 0.0005, "grad_norm": 0.10569190233945847, "kl": 0.529812540858984, "learning_rate": 7.999999927433012e-06, "loss": -0.0787, "step": 50, "step_time": 8.129872606979916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.239999771118164, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0379847399890423, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 0.16015349328517914, "kl": 0.34586830250918865, "learning_rate": 7.99999991669606e-06, "loss": -0.1083, "num_tokens": 1087167.0, "reward": 0.5509246587753296, "reward_std": 0.8159432411193848, "rewards/rollout_reward_func/mean": 0.5509246587753296, "rewards/rollout_reward_func/std": 0.81594318151474, "sampling/importance_sampling_ratio/max": 2.1273112297058105, "sampling/importance_sampling_ratio/mean": 0.9205597639083862, "sampling/importance_sampling_ratio/min": 5.197586805394394e-09, "sampling/sampling_logp_difference/max": 2.388481378555298, "sampling/sampling_logp_difference/mean": 0.4812689423561096, "step": 51, "step_time": 14.698704902024474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.0103560015559196, "epoch": 0.00052, "grad_norm": 0.12624835968017578, "kl": 0.3941082824021578, "learning_rate": 7.999999905218627e-06, "loss": -0.1094, "step": 52, "step_time": 8.107878160022665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5640363171696663, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.20212936401367188, "kl": 0.5708989035338163, "learning_rate": 7.999999893000716e-06, "loss": -0.0789, "num_tokens": 1126054.0, "reward": 0.9089628458023071, "reward_std": 0.6884047389030457, "rewards/rollout_reward_func/mean": 0.9089628458023071, "rewards/rollout_reward_func/std": 0.6884047389030457, "sampling/importance_sampling_ratio/max": 1.8703112602233887, "sampling/importance_sampling_ratio/mean": 0.8964175581932068, "sampling/importance_sampling_ratio/min": 1.1990795201199944e-07, "sampling/sampling_logp_difference/max": 2.9017322063446045, "sampling/sampling_logp_difference/mean": 0.4116418957710266, "step": 53, "step_time": 15.823209293012042 }, { "clip_ratio/high_max": 0.0386904776096344, "clip_ratio/high_mean": 0.0193452388048172, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028273810632526875, "entropy": 1.5407146140933037, "epoch": 0.00054, "grad_norm": 0.13224776089191437, "kl": 0.6488442402333021, "learning_rate": 7.999999880042326e-06, "loss": -0.0796, "step": 54, "step_time": 8.137097352009732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1833574827760458, "epoch": 0.00055, "frac_reward_zero_std": 0.25, "grad_norm": 0.28694427013397217, "kl": 2.8881847951561213, "learning_rate": 7.999999866343456e-06, "loss": -0.0539, "num_tokens": 1166322.0, "reward": 0.9136826395988464, "reward_std": 0.7621045708656311, "rewards/rollout_reward_func/mean": 0.9136826395988464, "rewards/rollout_reward_func/std": 0.7621045708656311, "sampling/importance_sampling_ratio/max": 2.128807544708252, "sampling/importance_sampling_ratio/mean": 0.8647305965423584, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.524113893508911, "sampling/sampling_logp_difference/mean": 0.4051467180252075, "step": 55, "step_time": 14.576651369017782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1701631117612123, "epoch": 0.00056, "grad_norm": 0.3070880174636841, "kl": 3.100404404103756, "learning_rate": 7.999999851904105e-06, "loss": -0.0535, "step": 56, "step_time": 8.072108143009245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2180787473917007, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 0.2886713147163391, "kl": 2.108639694750309, "learning_rate": 7.999999836724277e-06, "loss": -0.0867, "num_tokens": 1208556.0, "reward": 0.6681703329086304, "reward_std": 0.754245400428772, "rewards/rollout_reward_func/mean": 0.6681703329086304, "rewards/rollout_reward_func/std": 0.754245400428772, "sampling/importance_sampling_ratio/max": 2.1806390285491943, "sampling/importance_sampling_ratio/mean": 0.8869718313217163, "sampling/importance_sampling_ratio/min": 6.008450270655885e-08, "sampling/sampling_logp_difference/max": 3.710440158843994, "sampling/sampling_logp_difference/mean": 0.4511517882347107, "step": 57, "step_time": 14.394566142989788 }, { "clip_ratio/high_max": 0.02741228137165308, "clip_ratio/high_mean": 0.01370614068582654, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01370614068582654, "entropy": 1.2158811539411545, "epoch": 0.00058, "grad_norm": 0.21481510996818542, "kl": 1.6557259038090706, "learning_rate": 7.999999820803968e-06, "loss": -0.0886, "step": 58, "step_time": 9.069039724970935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 3.612903118133545, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6922564376145601, "epoch": 0.00059, "frac_reward_zero_std": 0.25, "grad_norm": 0.07558686286211014, "kl": 0.5131912715733051, "learning_rate": 7.99999980414318e-06, "loss": -0.0473, "num_tokens": 1250081.0, "reward": 0.9438344240188599, "reward_std": 0.5373137593269348, "rewards/rollout_reward_func/mean": 0.9438344240188599, "rewards/rollout_reward_func/std": 0.5373137593269348, "sampling/importance_sampling_ratio/max": 1.9514696598052979, "sampling/importance_sampling_ratio/mean": 1.1051734685897827, "sampling/importance_sampling_ratio/min": 0.00013909334666095674, "sampling/sampling_logp_difference/max": 1.7213631868362427, "sampling/sampling_logp_difference/mean": 0.25027796626091003, "step": 59, "step_time": 14.369084537000163 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.7087843529880047, "epoch": 0.0006, "grad_norm": 0.09181752055883408, "kl": 0.49250202998518944, "learning_rate": 7.999999786741913e-06, "loss": -0.0469, "step": 60, "step_time": 8.11374318099115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9091540165245533, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 0.18399380147457123, "kl": 1.1413435488939285, "learning_rate": 7.999999768600167e-06, "loss": -0.1012, "num_tokens": 1292704.0, "reward": 0.41022205352783203, "reward_std": 0.8182336091995239, "rewards/rollout_reward_func/mean": 0.41022205352783203, "rewards/rollout_reward_func/std": 0.8182336091995239, "sampling/importance_sampling_ratio/max": 2.045342445373535, "sampling/importance_sampling_ratio/mean": 0.6988683938980103, "sampling/importance_sampling_ratio/min": 7.011080924712587e-07, "sampling/sampling_logp_difference/max": 2.5354318618774414, "sampling/sampling_logp_difference/mean": 0.5143769979476929, "step": 61, "step_time": 14.808151211997028 }, { "clip_ratio/high_max": 0.03740530367940664, "clip_ratio/high_mean": 0.01870265183970332, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01870265183970332, "entropy": 1.9090990759432316, "epoch": 0.00062, "grad_norm": 0.14728140830993652, "kl": 0.8746138960123062, "learning_rate": 7.99999974971794e-06, "loss": -0.1022, "step": 62, "step_time": 8.102149796992308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 3.857142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3372110202908516, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 0.11545009166002274, "kl": 0.7482645362615585, "learning_rate": 7.999999730095235e-06, "loss": -0.0629, "num_tokens": 1333286.0, "reward": 0.3812459707260132, "reward_std": 0.8453522324562073, "rewards/rollout_reward_func/mean": 0.3812459707260132, "rewards/rollout_reward_func/std": 0.845352292060852, "sampling/importance_sampling_ratio/max": 2.0492873191833496, "sampling/importance_sampling_ratio/mean": 0.7961339354515076, "sampling/importance_sampling_ratio/min": 9.499141384594623e-08, "sampling/sampling_logp_difference/max": 3.287494421005249, "sampling/sampling_logp_difference/mean": 0.4596983790397644, "step": 63, "step_time": 15.364461885998026 }, { "clip_ratio/high_max": 0.03219697065651417, "clip_ratio/high_mean": 0.016098485328257084, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016098485328257084, "entropy": 1.347655976191163, "epoch": 0.00064, "grad_norm": 0.10723552852869034, "kl": 0.6460805684328079, "learning_rate": 7.99999970973205e-06, "loss": -0.0632, "step": 64, "step_time": 8.123738673020853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.96875, "completions/mean_terminated_length": 4.233333587646484, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2877794429659843, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 0.13504835963249207, "kl": 0.5085980482399464, "learning_rate": 7.999999688628386e-06, "loss": -0.0808, "num_tokens": 1374135.0, "reward": 0.6933455467224121, "reward_std": 0.6696789860725403, "rewards/rollout_reward_func/mean": 0.6933455467224121, "rewards/rollout_reward_func/std": 0.6696789860725403, "sampling/importance_sampling_ratio/max": 2.006815195083618, "sampling/importance_sampling_ratio/mean": 1.018618106842041, "sampling/importance_sampling_ratio/min": 2.7600324301602086e-06, "sampling/sampling_logp_difference/max": 2.0397260189056396, "sampling/sampling_logp_difference/mean": 0.33385002613067627, "step": 65, "step_time": 14.487963966093957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2879806756973267, "epoch": 0.00066, "grad_norm": 0.13485853374004364, "kl": 0.5168245360255241, "learning_rate": 7.999999666784243e-06, "loss": -0.0805, "step": 66, "step_time": 8.091859130974626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4338098242878914, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 0.17062480747699738, "kl": 1.0580738335847855, "learning_rate": 7.999999644199619e-06, "loss": -0.0779, "num_tokens": 1415818.0, "reward": 0.4202474355697632, "reward_std": 0.8739528656005859, "rewards/rollout_reward_func/mean": 0.4202474355697632, "rewards/rollout_reward_func/std": 0.8739528059959412, "sampling/importance_sampling_ratio/max": 1.6981751918792725, "sampling/importance_sampling_ratio/mean": 0.808976411819458, "sampling/importance_sampling_ratio/min": 1.7743991520546842e-07, "sampling/sampling_logp_difference/max": 2.174015998840332, "sampling/sampling_logp_difference/mean": 0.4455884099006653, "step": 67, "step_time": 15.328333959972952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4314331486821175, "epoch": 0.00068, "grad_norm": 0.14880794286727905, "kl": 0.9557367712259293, "learning_rate": 7.999999620874517e-06, "loss": -0.0786, "step": 68, "step_time": 8.89522298501106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.633333683013916, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9536237046122551, "epoch": 0.00069, "frac_reward_zero_std": 0.0, "grad_norm": 0.39218273758888245, "kl": 0.4255521520972252, "learning_rate": 7.999999596808934e-06, "loss": -0.0309, "num_tokens": 1454917.0, "reward": 0.41453874111175537, "reward_std": 0.7063121199607849, "rewards/rollout_reward_func/mean": 0.41453874111175537, "rewards/rollout_reward_func/std": 0.7063121199607849, "sampling/importance_sampling_ratio/max": 1.881455659866333, "sampling/importance_sampling_ratio/mean": 0.8864954710006714, "sampling/importance_sampling_ratio/min": 1.481887323961928e-07, "sampling/sampling_logp_difference/max": 2.376399517059326, "sampling/sampling_logp_difference/mean": 0.2722143828868866, "step": 69, "step_time": 14.272492111951578 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.9516313709318638, "epoch": 0.0007, "grad_norm": 0.1251528114080429, "kl": 0.4222033377736807, "learning_rate": 7.999999572002872e-06, "loss": -0.0322, "step": 70, "step_time": 8.060342747048708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5801170282065868, "epoch": 0.00071, "frac_reward_zero_std": 0.0, "grad_norm": 0.10991942882537842, "kl": 0.5273595117032528, "learning_rate": 7.999999546456332e-06, "loss": -0.0908, "num_tokens": 1495928.0, "reward": 0.8705148696899414, "reward_std": 0.6008188128471375, "rewards/rollout_reward_func/mean": 0.8705148696899414, "rewards/rollout_reward_func/std": 0.6008188128471375, "sampling/importance_sampling_ratio/max": 2.097616672515869, "sampling/importance_sampling_ratio/mean": 0.9689662456512451, "sampling/importance_sampling_ratio/min": 2.5884065735226613e-07, "sampling/sampling_logp_difference/max": 1.9920281171798706, "sampling/sampling_logp_difference/mean": 0.41357314586639404, "step": 71, "step_time": 14.465932983031962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5822436194866896, "epoch": 0.00072, "grad_norm": 0.10166724771261215, "kl": 0.5431330427527428, "learning_rate": 7.999999520169313e-06, "loss": -0.091, "step": 72, "step_time": 8.808023284975206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.964285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5672520250082016, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.34544116258621216, "kl": 0.5990136228501797, "learning_rate": 7.999999493141815e-06, "loss": -0.0308, "num_tokens": 1536855.0, "reward": 0.517148494720459, "reward_std": 0.9016224145889282, "rewards/rollout_reward_func/mean": 0.517148494720459, "rewards/rollout_reward_func/std": 0.9016223549842834, "sampling/importance_sampling_ratio/max": 1.686284065246582, "sampling/importance_sampling_ratio/mean": 0.6733815670013428, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.0814545154571533, "sampling/sampling_logp_difference/mean": 0.4091971516609192, "step": 73, "step_time": 14.941709117003484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010937500046566129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 1.5657867416739464, "epoch": 0.00074, "grad_norm": 0.260067343711853, "kl": 0.6077633798122406, "learning_rate": 7.999999465373833e-06, "loss": -0.0322, "step": 74, "step_time": 8.135410017945105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 4.1875, "completions/mean_terminated_length": 3.8064515590667725, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7379115261137486, "epoch": 0.00075, "frac_reward_zero_std": 0.5, "grad_norm": 0.16488566994667053, "kl": 0.7069636955857277, "learning_rate": 7.999999436865376e-06, "loss": -0.0307, "num_tokens": 1578026.0, "reward": 0.6700474619865417, "reward_std": 0.7434166669845581, "rewards/rollout_reward_func/mean": 0.6700474619865417, "rewards/rollout_reward_func/std": 0.7434166073799133, "sampling/importance_sampling_ratio/max": 1.538666844367981, "sampling/importance_sampling_ratio/mean": 0.9756869077682495, "sampling/importance_sampling_ratio/min": 2.4528835638193414e-06, "sampling/sampling_logp_difference/max": 2.8958892822265625, "sampling/sampling_logp_difference/mean": 0.26542115211486816, "step": 75, "step_time": 14.21976753801573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01785714365541935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01785714365541935, "entropy": 0.7418458610773087, "epoch": 0.00076, "grad_norm": 0.1550237089395523, "kl": 0.8297523912042379, "learning_rate": 7.999999407616439e-06, "loss": -0.0308, "step": 76, "step_time": 8.079165853967424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.192307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6031129732728004, "epoch": 0.00077, "frac_reward_zero_std": 0.25, "grad_norm": 0.25720280408859253, "kl": 0.5136363282799721, "learning_rate": 7.999999377627022e-06, "loss": -0.0527, "num_tokens": 1620743.0, "reward": 0.626442015171051, "reward_std": 0.6644177436828613, "rewards/rollout_reward_func/mean": 0.626442015171051, "rewards/rollout_reward_func/std": 0.6644177436828613, "sampling/importance_sampling_ratio/max": 2.030214309692383, "sampling/importance_sampling_ratio/mean": 0.8772963881492615, "sampling/importance_sampling_ratio/min": 5.025901373301167e-06, "sampling/sampling_logp_difference/max": 1.988387107849121, "sampling/sampling_logp_difference/mean": 0.4114595651626587, "step": 77, "step_time": 15.435107979021268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6091739907860756, "epoch": 0.00078, "grad_norm": 0.2417127788066864, "kl": 0.5199431553483009, "learning_rate": 7.999999346897126e-06, "loss": -0.0537, "step": 78, "step_time": 8.532370122004068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.142857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4916364029049873, "epoch": 0.00079, "frac_reward_zero_std": 0.0, "grad_norm": 0.11560725420713425, "kl": 0.6516235359013081, "learning_rate": 7.99999931542675e-06, "loss": -0.0829, "num_tokens": 1663566.0, "reward": 0.6280626058578491, "reward_std": 0.7852424383163452, "rewards/rollout_reward_func/mean": 0.6280626058578491, "rewards/rollout_reward_func/std": 0.7852424383163452, "sampling/importance_sampling_ratio/max": 1.7694621086120605, "sampling/importance_sampling_ratio/mean": 0.9286859035491943, "sampling/importance_sampling_ratio/min": 7.584911543290218e-08, "sampling/sampling_logp_difference/max": 3.2941830158233643, "sampling/sampling_logp_difference/mean": 0.427787721157074, "step": 79, "step_time": 14.686232140025822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.485827475786209, "epoch": 0.0008, "grad_norm": 0.10935062170028687, "kl": 0.6777349822223186, "learning_rate": 7.999999283215897e-06, "loss": -0.083, "step": 80, "step_time": 8.108048089023214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 3.4827585220336914, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8243714589625597, "epoch": 0.00081, "frac_reward_zero_std": 0.0, "grad_norm": 0.14701589941978455, "kl": 0.9104066044092178, "learning_rate": 7.999999250264562e-06, "loss": -0.0337, "num_tokens": 1707337.0, "reward": 0.6931761503219604, "reward_std": 0.5984086990356445, "rewards/rollout_reward_func/mean": 0.6931761503219604, "rewards/rollout_reward_func/std": 0.5984086990356445, "sampling/importance_sampling_ratio/max": 1.5120517015457153, "sampling/importance_sampling_ratio/mean": 0.8591850996017456, "sampling/importance_sampling_ratio/min": 3.431446771173796e-07, "sampling/sampling_logp_difference/max": 2.167766571044922, "sampling/sampling_logp_difference/mean": 0.3346281051635742, "step": 81, "step_time": 14.530650069063995 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.8237135950475931, "epoch": 0.00082, "grad_norm": 0.13788297772407532, "kl": 0.98299914970994, "learning_rate": 7.999999216572749e-06, "loss": -0.0336, "step": 82, "step_time": 8.668763396010036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.3125, "completions/mean_terminated_length": 3.935483694076538, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.73869189620018, "epoch": 0.00083, "frac_reward_zero_std": 0.25, "grad_norm": 0.1219305470585823, "kl": 0.6620416529476643, "learning_rate": 7.999999182140456e-06, "loss": -0.0503, "num_tokens": 1749130.0, "reward": 0.6397022604942322, "reward_std": 0.5633842945098877, "rewards/rollout_reward_func/mean": 0.6397022604942322, "rewards/rollout_reward_func/std": 0.5633842945098877, "sampling/importance_sampling_ratio/max": 2.2791848182678223, "sampling/importance_sampling_ratio/mean": 0.9368146657943726, "sampling/importance_sampling_ratio/min": 1.0791823115141597e-05, "sampling/sampling_logp_difference/max": 3.742777109146118, "sampling/sampling_logp_difference/mean": 0.2709418535232544, "step": 83, "step_time": 14.73145789600676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7457274049520493, "epoch": 0.00084, "grad_norm": 0.12265529483556747, "kl": 0.684906791895628, "learning_rate": 7.999999146967684e-06, "loss": -0.0503, "step": 84, "step_time": 8.093017447012244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6507288254797459, "epoch": 0.00085, "frac_reward_zero_std": 0.0, "grad_norm": 0.18214711546897888, "kl": 1.4510327484458685, "learning_rate": 7.999999111054434e-06, "loss": -0.0923, "num_tokens": 1788316.0, "reward": 0.4920206367969513, "reward_std": 0.8867719769477844, "rewards/rollout_reward_func/mean": 0.4920206367969513, "rewards/rollout_reward_func/std": 0.8867719769477844, "sampling/importance_sampling_ratio/max": 1.6428189277648926, "sampling/importance_sampling_ratio/mean": 0.754221498966217, "sampling/importance_sampling_ratio/min": 1.7981619748752564e-08, "sampling/sampling_logp_difference/max": 3.0633933544158936, "sampling/sampling_logp_difference/mean": 0.4644431471824646, "step": 85, "step_time": 14.318827507959213 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.6504145190119743, "epoch": 0.00086, "grad_norm": 0.1611618846654892, "kl": 1.3298410698771477, "learning_rate": 7.999999074400703e-06, "loss": -0.0935, "step": 86, "step_time": 8.05126236792421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.65625, "completions/mean_terminated_length": 3.269230842590332, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.824615627527237, "epoch": 0.00087, "frac_reward_zero_std": 0.0, "grad_norm": 0.09662849456071854, "kl": 0.3719085082411766, "learning_rate": 7.999999037006494e-06, "loss": -0.0783, "num_tokens": 1831543.0, "reward": 0.621874988079071, "reward_std": 0.7589187622070312, "rewards/rollout_reward_func/mean": 0.621874988079071, "rewards/rollout_reward_func/std": 0.7589187026023865, "sampling/importance_sampling_ratio/max": 2.001399278640747, "sampling/importance_sampling_ratio/mean": 0.9112379550933838, "sampling/importance_sampling_ratio/min": 3.4581766783503554e-08, "sampling/sampling_logp_difference/max": 1.9935210943222046, "sampling/sampling_logp_difference/mean": 0.4757145643234253, "step": 87, "step_time": 15.347524506010814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8194607198238373, "epoch": 0.00088, "grad_norm": 0.09469027817249298, "kl": 0.38403961062431335, "learning_rate": 7.999998998871805e-06, "loss": -0.0784, "step": 88, "step_time": 8.699650258000474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 3.9599997997283936, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5920906215906143, "epoch": 0.00089, "frac_reward_zero_std": 0.0, "grad_norm": 0.12484663724899292, "kl": 0.8225836865603924, "learning_rate": 7.999998959996637e-06, "loss": -0.0747, "num_tokens": 1873571.0, "reward": 0.6929792165756226, "reward_std": 0.786061704158783, "rewards/rollout_reward_func/mean": 0.6929792165756226, "rewards/rollout_reward_func/std": 0.786061704158783, "sampling/importance_sampling_ratio/max": 1.6915345191955566, "sampling/importance_sampling_ratio/mean": 0.8683750629425049, "sampling/importance_sampling_ratio/min": 8.120304784142718e-08, "sampling/sampling_logp_difference/max": 1.9922871589660645, "sampling/sampling_logp_difference/mean": 0.40543413162231445, "step": 89, "step_time": 14.740054939087713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.593827661126852, "epoch": 0.0009, "grad_norm": 0.1127060204744339, "kl": 0.7626943141222, "learning_rate": 7.99999892038099e-06, "loss": -0.0749, "step": 90, "step_time": 8.120508648920804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5661444086581469, "epoch": 0.00091, "frac_reward_zero_std": 0.0, "grad_norm": 0.09645453840494156, "kl": 0.5285315662622452, "learning_rate": 7.999998880024863e-06, "loss": -0.081, "num_tokens": 1912673.0, "reward": 0.6984566450119019, "reward_std": 0.701667845249176, "rewards/rollout_reward_func/mean": 0.6984566450119019, "rewards/rollout_reward_func/std": 0.7016677856445312, "sampling/importance_sampling_ratio/max": 1.8024686574935913, "sampling/importance_sampling_ratio/mean": 0.9425265192985535, "sampling/importance_sampling_ratio/min": 1.469141466259316e-07, "sampling/sampling_logp_difference/max": 1.9852910041809082, "sampling/sampling_logp_difference/mean": 0.4605446457862854, "step": 91, "step_time": 14.466829716955544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.563849726691842, "epoch": 0.00092, "grad_norm": 0.09333132952451706, "kl": 0.5371105372905731, "learning_rate": 7.999998838928257e-06, "loss": -0.0812, "step": 92, "step_time": 8.409060549980495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9641848281025887, "epoch": 0.00093, "frac_reward_zero_std": 0.0, "grad_norm": 0.101555734872818, "kl": 0.6154310554265976, "learning_rate": 7.999998797091172e-06, "loss": -0.0972, "num_tokens": 1950656.0, "reward": 0.7063464522361755, "reward_std": 0.8001297116279602, "rewards/rollout_reward_func/mean": 0.7063464522361755, "rewards/rollout_reward_func/std": 0.8001297116279602, "sampling/importance_sampling_ratio/max": 1.7436468601226807, "sampling/importance_sampling_ratio/mean": 0.9411951899528503, "sampling/importance_sampling_ratio/min": 2.1194671717239544e-05, "sampling/sampling_logp_difference/max": 2.06453013420105, "sampling/sampling_logp_difference/mean": 0.30398663878440857, "step": 93, "step_time": 14.500793681974756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9581584893167019, "epoch": 0.00094, "grad_norm": 0.10055646300315857, "kl": 0.6402410082519054, "learning_rate": 7.999998754513608e-06, "loss": -0.0975, "step": 94, "step_time": 7.9953958049591165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3559904843568802, "epoch": 0.00095, "frac_reward_zero_std": 0.0, "grad_norm": 0.1323547065258026, "kl": 0.60786965303123, "learning_rate": 7.999998711195565e-06, "loss": -0.0844, "num_tokens": 1989604.0, "reward": 0.49401044845581055, "reward_std": 0.8259052038192749, "rewards/rollout_reward_func/mean": 0.49401044845581055, "rewards/rollout_reward_func/std": 0.8259052038192749, "sampling/importance_sampling_ratio/max": 1.445023536682129, "sampling/importance_sampling_ratio/mean": 0.8077601790428162, "sampling/importance_sampling_ratio/min": 1.633988290450361e-06, "sampling/sampling_logp_difference/max": 1.9151999950408936, "sampling/sampling_logp_difference/mean": 0.37297922372817993, "step": 95, "step_time": 14.388826036942191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.355696877464652, "epoch": 0.00096, "grad_norm": 0.11892388015985489, "kl": 0.6211227029561996, "learning_rate": 7.999998667137043e-06, "loss": -0.0846, "step": 96, "step_time": 7.982840609038249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 5.142857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4191708751022816, "epoch": 0.00097, "frac_reward_zero_std": 0.0, "grad_norm": 0.13912345468997955, "kl": 0.933295227587223, "learning_rate": 7.999998622338041e-06, "loss": -0.0813, "num_tokens": 2030511.0, "reward": 0.45020735263824463, "reward_std": 0.8394709825515747, "rewards/rollout_reward_func/mean": 0.45020735263824463, "rewards/rollout_reward_func/std": 0.8394709825515747, "sampling/importance_sampling_ratio/max": 1.6606156826019287, "sampling/importance_sampling_ratio/mean": 0.7199388742446899, "sampling/importance_sampling_ratio/min": 2.589411633380223e-05, "sampling/sampling_logp_difference/max": 2.933884859085083, "sampling/sampling_logp_difference/mean": 0.40740278363227844, "step": 97, "step_time": 14.966433599998709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.4197298996150494, "epoch": 0.00098, "grad_norm": 0.13498611748218536, "kl": 0.9525809250771999, "learning_rate": 7.999998576798562e-06, "loss": -0.0818, "step": 98, "step_time": 8.608791293081595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 4.09375, "completions/mean_terminated_length": 3.709677219390869, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9644812550395727, "epoch": 0.00099, "frac_reward_zero_std": 0.25, "grad_norm": 0.05681291222572327, "kl": 1.0328673124313354, "learning_rate": 7.999998530518601e-06, "loss": -0.0599, "num_tokens": 2073213.0, "reward": 0.8207689523696899, "reward_std": 0.4128098487854004, "rewards/rollout_reward_func/mean": 0.8207689523696899, "rewards/rollout_reward_func/std": 0.4128098487854004, "sampling/importance_sampling_ratio/max": 1.4496915340423584, "sampling/importance_sampling_ratio/mean": 1.0167200565338135, "sampling/importance_sampling_ratio/min": 4.597580414156255e-07, "sampling/sampling_logp_difference/max": 1.7171720266342163, "sampling/sampling_logp_difference/mean": 0.32595735788345337, "step": 99, "step_time": 14.26326206504018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9627204444259405, "epoch": 0.001, "grad_norm": 0.06018931046128273, "kl": 1.035637628287077, "learning_rate": 7.999998483498162e-06, "loss": -0.0597, "step": 100, "step_time": 8.083795332931913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 3.6000001430511475, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8987695574760437, "epoch": 0.00101, "frac_reward_zero_std": 0.25, "grad_norm": 0.15155819058418274, "kl": 0.5903496406972408, "learning_rate": 7.999998435737244e-06, "loss": -0.0426, "num_tokens": 2116068.0, "reward": 0.654111385345459, "reward_std": 0.5407465696334839, "rewards/rollout_reward_func/mean": 0.654111385345459, "rewards/rollout_reward_func/std": 0.5407465100288391, "sampling/importance_sampling_ratio/max": 1.459596872329712, "sampling/importance_sampling_ratio/mean": 1.0059081315994263, "sampling/importance_sampling_ratio/min": 1.1949053941862076e-06, "sampling/sampling_logp_difference/max": 1.8144553899765015, "sampling/sampling_logp_difference/mean": 0.31143447756767273, "step": 101, "step_time": 14.318654725997476 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 0.8947776816785336, "epoch": 0.00102, "grad_norm": 0.114131860435009, "kl": 0.6089061126112938, "learning_rate": 7.999998387235846e-06, "loss": -0.0436, "step": 102, "step_time": 8.65682813100284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.296296119689941, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8914017900824547, "epoch": 0.00103, "frac_reward_zero_std": 0.0, "grad_norm": 0.1118139997124672, "kl": 0.619939923286438, "learning_rate": 7.99999833799397e-06, "loss": -0.0852, "num_tokens": 2157496.0, "reward": 0.2017877697944641, "reward_std": 0.8150376677513123, "rewards/rollout_reward_func/mean": 0.2017877697944641, "rewards/rollout_reward_func/std": 0.8150376677513123, "sampling/importance_sampling_ratio/max": 1.547081708908081, "sampling/importance_sampling_ratio/mean": 0.8848620653152466, "sampling/importance_sampling_ratio/min": 1.1499476926246643e-07, "sampling/sampling_logp_difference/max": 2.16353702545166, "sampling/sampling_logp_difference/mean": 0.4962579011917114, "step": 103, "step_time": 15.00361991400132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8889388479292393, "epoch": 0.00104, "grad_norm": 0.1135617196559906, "kl": 0.6006340086460114, "learning_rate": 7.999998288011616e-06, "loss": -0.0852, "step": 104, "step_time": 8.092464418994496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4597186259925365, "epoch": 0.00105, "frac_reward_zero_std": 0.0, "grad_norm": 0.09816154092550278, "kl": 1.0129779353737831, "learning_rate": 7.999998237288781e-06, "loss": -0.0762, "num_tokens": 2200003.0, "reward": 0.2934325933456421, "reward_std": 0.6945187449455261, "rewards/rollout_reward_func/mean": 0.2934325933456421, "rewards/rollout_reward_func/std": 0.6945187449455261, "sampling/importance_sampling_ratio/max": 1.6826711893081665, "sampling/importance_sampling_ratio/mean": 0.7822100520133972, "sampling/importance_sampling_ratio/min": 3.1484399531933605e-09, "sampling/sampling_logp_difference/max": 2.274289608001709, "sampling/sampling_logp_difference/mean": 0.4583044648170471, "step": 105, "step_time": 14.509945008030627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.4591337814927101, "epoch": 0.00106, "grad_norm": 0.08626661449670792, "kl": 0.9943130016326904, "learning_rate": 7.999998185825468e-06, "loss": -0.0761, "step": 106, "step_time": 8.115606137958821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3.59375, "completions/mean_terminated_length": 3.1935482025146484, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5910434126853943, "epoch": 0.00107, "frac_reward_zero_std": 0.0, "grad_norm": 0.20390315353870392, "kl": 0.5028313137590885, "learning_rate": 7.999998133621676e-06, "loss": -0.0195, "num_tokens": 2243834.0, "reward": 0.8743211030960083, "reward_std": 0.36195868253707886, "rewards/rollout_reward_func/mean": 0.8743211030960083, "rewards/rollout_reward_func/std": 0.36195868253707886, "sampling/importance_sampling_ratio/max": 1.783795714378357, "sampling/importance_sampling_ratio/mean": 0.9751328229904175, "sampling/importance_sampling_ratio/min": 5.542544386116788e-06, "sampling/sampling_logp_difference/max": 1.6283657550811768, "sampling/sampling_logp_difference/mean": 0.25483396649360657, "step": 107, "step_time": 14.852394922956591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5911570321768522, "epoch": 0.00108, "grad_norm": 0.18443961441516876, "kl": 0.500493872910738, "learning_rate": 7.999998080677404e-06, "loss": -0.0204, "step": 108, "step_time": 8.641802287020255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 4.8125, "completions/mean_terminated_length": 3.655172348022461, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.947407640516758, "epoch": 0.00109, "frac_reward_zero_std": 0.25, "grad_norm": 0.04682869836688042, "kl": 0.7131749950349331, "learning_rate": 7.999998026992654e-06, "loss": -0.06, "num_tokens": 2287544.0, "reward": 0.7655199766159058, "reward_std": 0.5474411249160767, "rewards/rollout_reward_func/mean": 0.7655199766159058, "rewards/rollout_reward_func/std": 0.5474411845207214, "sampling/importance_sampling_ratio/max": 1.5925496816635132, "sampling/importance_sampling_ratio/mean": 0.9829436540603638, "sampling/importance_sampling_ratio/min": 0.0002136116090696305, "sampling/sampling_logp_difference/max": 1.6674413681030273, "sampling/sampling_logp_difference/mean": 0.28719162940979004, "step": 109, "step_time": 14.655342810001457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9488987997174263, "epoch": 0.0011, "grad_norm": 0.04844241216778755, "kl": 0.7163382694125175, "learning_rate": 7.999997972567424e-06, "loss": -0.0601, "step": 110, "step_time": 8.131943043030333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 4.71875, "completions/mean_terminated_length": 4.3548383712768555, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.027579067274928, "epoch": 0.00111, "frac_reward_zero_std": 0.0, "grad_norm": 0.11293182522058487, "kl": 0.8212607204914093, "learning_rate": 7.999997917401717e-06, "loss": -0.0853, "num_tokens": 2328959.0, "reward": 0.9400968551635742, "reward_std": 0.4621155560016632, "rewards/rollout_reward_func/mean": 0.9400968551635742, "rewards/rollout_reward_func/std": 0.4621155261993408, "sampling/importance_sampling_ratio/max": 1.543493628501892, "sampling/importance_sampling_ratio/mean": 0.969043493270874, "sampling/importance_sampling_ratio/min": 0.00012929980584885925, "sampling/sampling_logp_difference/max": 2.221526861190796, "sampling/sampling_logp_difference/mean": 0.32158780097961426, "step": 111, "step_time": 14.287684345035814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0227718688547611, "epoch": 0.00112, "grad_norm": 0.1087557002902031, "kl": 0.846947655081749, "learning_rate": 7.99999786149553e-06, "loss": -0.0854, "step": 112, "step_time": 8.491856078995625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7209144402295351, "epoch": 0.00113, "frac_reward_zero_std": 0.0, "grad_norm": 0.501838207244873, "kl": 1.0280143916606903, "learning_rate": 7.999997804848863e-06, "loss": -0.0371, "num_tokens": 2371435.0, "reward": 0.44052523374557495, "reward_std": 0.7319390773773193, "rewards/rollout_reward_func/mean": 0.44052523374557495, "rewards/rollout_reward_func/std": 0.7319390773773193, "sampling/importance_sampling_ratio/max": 2.521996021270752, "sampling/importance_sampling_ratio/mean": 1.0140750408172607, "sampling/importance_sampling_ratio/min": 1.8891731087933294e-05, "sampling/sampling_logp_difference/max": 2.3773908615112305, "sampling/sampling_logp_difference/mean": 0.28456902503967285, "step": 113, "step_time": 15.169131376052974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 0.7108248192816973, "epoch": 0.00114, "grad_norm": 0.16835948824882507, "kl": 1.0522319413721561, "learning_rate": 7.999997747461717e-06, "loss": -0.0389, "step": 114, "step_time": 8.164572975016199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.482758522033691, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.131332403048873, "epoch": 0.00115, "frac_reward_zero_std": 0.0, "grad_norm": 0.14043374359607697, "kl": 1.4840212985873222, "learning_rate": 7.999997689334094e-06, "loss": -0.1159, "num_tokens": 2406888.0, "reward": 0.6590861082077026, "reward_std": 0.9807443022727966, "rewards/rollout_reward_func/mean": 0.6590861082077026, "rewards/rollout_reward_func/std": 0.9807443022727966, "sampling/importance_sampling_ratio/max": 2.210951805114746, "sampling/importance_sampling_ratio/mean": 0.9954689741134644, "sampling/importance_sampling_ratio/min": 2.2425213046517456e-06, "sampling/sampling_logp_difference/max": 2.921337842941284, "sampling/sampling_logp_difference/mean": 0.34346771240234375, "step": 115, "step_time": 13.489115214993944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.133056907914579, "epoch": 0.00116, "grad_norm": 0.11122403293848038, "kl": 1.5353549495339394, "learning_rate": 7.99999763046599e-06, "loss": -0.1164, "step": 116, "step_time": 7.5412138369865716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.035714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5903140753507614, "epoch": 0.00117, "frac_reward_zero_std": 0.25, "grad_norm": 0.07881207019090652, "kl": 0.7531015761196613, "learning_rate": 7.999997570857409e-06, "loss": -0.0424, "num_tokens": 2449713.0, "reward": 0.6593817472457886, "reward_std": 0.6305489540100098, "rewards/rollout_reward_func/mean": 0.6593817472457886, "rewards/rollout_reward_func/std": 0.630548894405365, "sampling/importance_sampling_ratio/max": 1.5986298322677612, "sampling/importance_sampling_ratio/mean": 0.7843719720840454, "sampling/importance_sampling_ratio/min": 1.3104819451825733e-08, "sampling/sampling_logp_difference/max": 2.380680561065674, "sampling/sampling_logp_difference/mean": 0.4641987085342407, "step": 117, "step_time": 15.700835592026124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5973060578107834, "epoch": 0.00118, "grad_norm": 0.07889380306005478, "kl": 0.7632701173424721, "learning_rate": 7.999997510508348e-06, "loss": -0.0423, "step": 118, "step_time": 8.125801532994956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 3.7200000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4320842511951923, "epoch": 0.00119, "frac_reward_zero_std": 0.0, "grad_norm": 0.10448001325130463, "kl": 1.045521229505539, "learning_rate": 7.999997449418809e-06, "loss": -0.0975, "num_tokens": 2489817.0, "reward": 0.22124828398227692, "reward_std": 0.8360170125961304, "rewards/rollout_reward_func/mean": 0.22124828398227692, "rewards/rollout_reward_func/std": 0.8360169529914856, "sampling/importance_sampling_ratio/max": 1.4552037715911865, "sampling/importance_sampling_ratio/mean": 0.8514630198478699, "sampling/importance_sampling_ratio/min": 2.168048240491771e-06, "sampling/sampling_logp_difference/max": 3.274782180786133, "sampling/sampling_logp_difference/mean": 0.4096888303756714, "step": 119, "step_time": 14.692567817954114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4352972693741322, "epoch": 0.0012, "grad_norm": 0.08706964552402496, "kl": 0.9346054717898369, "learning_rate": 7.99999738758879e-06, "loss": -0.0981, "step": 120, "step_time": 8.065308179007843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.224071178585291, "epoch": 0.00121, "frac_reward_zero_std": 0.25, "grad_norm": 0.109605573117733, "kl": 0.6599854826927185, "learning_rate": 7.999997325018293e-06, "loss": -0.0593, "num_tokens": 2530094.0, "reward": 0.6894083023071289, "reward_std": 0.666975200176239, "rewards/rollout_reward_func/mean": 0.6894083023071289, "rewards/rollout_reward_func/std": 0.666975200176239, "sampling/importance_sampling_ratio/max": 2.041039228439331, "sampling/importance_sampling_ratio/mean": 0.9623416662216187, "sampling/importance_sampling_ratio/min": 4.380372953249889e-09, "sampling/sampling_logp_difference/max": 2.58815860748291, "sampling/sampling_logp_difference/mean": 0.36616086959838867, "step": 121, "step_time": 14.449025831039762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2233570888638496, "epoch": 0.00122, "grad_norm": 0.09995537251234055, "kl": 0.6124245524406433, "learning_rate": 7.999997261707317e-06, "loss": -0.0596, "step": 122, "step_time": 8.873253283032682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 3.4827585220336914, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1262299921363592, "epoch": 0.00123, "frac_reward_zero_std": 0.0, "grad_norm": 0.07228489965200424, "kl": 0.4856506884098053, "learning_rate": 7.999997197655861e-06, "loss": -0.0735, "num_tokens": 2572269.0, "reward": 1.024925708770752, "reward_std": 0.4302324056625366, "rewards/rollout_reward_func/mean": 1.024925708770752, "rewards/rollout_reward_func/std": 0.43023237586021423, "sampling/importance_sampling_ratio/max": 1.45201575756073, "sampling/importance_sampling_ratio/mean": 0.949245810508728, "sampling/importance_sampling_ratio/min": 1.5195538480838877e-06, "sampling/sampling_logp_difference/max": 2.0029654502868652, "sampling/sampling_logp_difference/mean": 0.35715916752815247, "step": 123, "step_time": 14.33446730102878 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.1297319829463959, "epoch": 0.00124, "grad_norm": 0.08178111910820007, "kl": 0.49928246438503265, "learning_rate": 7.999997132863928e-06, "loss": -0.0734, "step": 124, "step_time": 8.127976925985422 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.035714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.482263259589672, "epoch": 0.00125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24403807520866394, "kl": 1.316875234246254, "learning_rate": 7.999997067331516e-06, "loss": -0.0449, "num_tokens": 2615678.0, "reward": 0.5634016394615173, "reward_std": 0.709787130355835, "rewards/rollout_reward_func/mean": 0.5634016394615173, "rewards/rollout_reward_func/std": 0.709787130355835, "sampling/importance_sampling_ratio/max": 1.655918002128601, "sampling/importance_sampling_ratio/mean": 0.7511699795722961, "sampling/importance_sampling_ratio/min": 1.2649140401777004e-08, "sampling/sampling_logp_difference/max": 2.4614310264587402, "sampling/sampling_logp_difference/mean": 0.4839287996292114, "step": 125, "step_time": 14.671245692006778 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 1.48690789565444, "epoch": 0.00126, "grad_norm": 0.2078535407781601, "kl": 1.1127092204988003, "learning_rate": 7.999997001058625e-06, "loss": -0.0457, "step": 126, "step_time": 8.572142282966524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.8125, "completions/mean_terminated_length": 4.066667079925537, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7649100292474031, "epoch": 0.00127, "frac_reward_zero_std": 0.0, "grad_norm": 0.09265835583209991, "kl": 1.2676848024129868, "learning_rate": 7.999996934045254e-06, "loss": -0.0971, "num_tokens": 2651243.0, "reward": 0.8841902613639832, "reward_std": 0.8573343753814697, "rewards/rollout_reward_func/mean": 0.8841902613639832, "rewards/rollout_reward_func/std": 0.8573343753814697, "sampling/importance_sampling_ratio/max": 1.5752407312393188, "sampling/importance_sampling_ratio/mean": 0.884423017501831, "sampling/importance_sampling_ratio/min": 8.355152385775e-05, "sampling/sampling_logp_difference/max": 2.0773818492889404, "sampling/sampling_logp_difference/mean": 0.28069761395454407, "step": 127, "step_time": 14.396946023014607 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.7764123268425465, "epoch": 0.00128, "grad_norm": 0.09213702380657196, "kl": 1.1174134314060211, "learning_rate": 7.999996866291406e-06, "loss": -0.0971, "step": 128, "step_time": 7.979507462994661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.21875, "completions/mean_terminated_length": 3.433333396911621, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8451290428638458, "epoch": 0.00129, "frac_reward_zero_std": 0.0, "grad_norm": 0.17176789045333862, "kl": 0.4830758273601532, "learning_rate": 7.999996797797079e-06, "loss": -0.0329, "num_tokens": 2691524.0, "reward": 0.6519486904144287, "reward_std": 0.726320207118988, "rewards/rollout_reward_func/mean": 0.6519486904144287, "rewards/rollout_reward_func/std": 0.7263202667236328, "sampling/importance_sampling_ratio/max": 1.4813703298568726, "sampling/importance_sampling_ratio/mean": 1.0180244445800781, "sampling/importance_sampling_ratio/min": 1.9854822141951445e-07, "sampling/sampling_logp_difference/max": 1.8432198762893677, "sampling/sampling_logp_difference/mean": 0.32062527537345886, "step": 129, "step_time": 14.131963731982978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8528160713613033, "epoch": 0.0013, "grad_norm": 0.1754397600889206, "kl": 0.46952174231410027, "learning_rate": 7.999996728562273e-06, "loss": -0.0329, "step": 130, "step_time": 8.111989805969642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.310344696044922, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3037714548408985, "epoch": 0.00131, "frac_reward_zero_std": 0.0, "grad_norm": 0.09085382521152496, "kl": 0.6660060882568359, "learning_rate": 7.999996658586989e-06, "loss": -0.0822, "num_tokens": 2732885.0, "reward": 0.7747938632965088, "reward_std": 0.6572369933128357, "rewards/rollout_reward_func/mean": 0.7747938632965088, "rewards/rollout_reward_func/std": 0.6572369933128357, "sampling/importance_sampling_ratio/max": 1.5330084562301636, "sampling/importance_sampling_ratio/mean": 0.8685591220855713, "sampling/importance_sampling_ratio/min": 9.29661396753545e-08, "sampling/sampling_logp_difference/max": 2.165175199508667, "sampling/sampling_logp_difference/mean": 0.3701182007789612, "step": 131, "step_time": 15.186108984024031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.304344978183508, "epoch": 0.00132, "grad_norm": 0.09310605376958847, "kl": 0.6636405363678932, "learning_rate": 7.999996587871225e-06, "loss": -0.0823, "step": 132, "step_time": 8.773404144973028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.2068963050842285, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1384556405246258, "epoch": 0.00133, "frac_reward_zero_std": 0.25, "grad_norm": 0.11604270339012146, "kl": 0.4353393167257309, "learning_rate": 7.999996516414982e-06, "loss": -0.0358, "num_tokens": 2771453.0, "reward": 0.6141515374183655, "reward_std": 0.8024008870124817, "rewards/rollout_reward_func/mean": 0.6141515374183655, "rewards/rollout_reward_func/std": 0.8024009466171265, "sampling/importance_sampling_ratio/max": 1.5689767599105835, "sampling/importance_sampling_ratio/mean": 0.9821621179580688, "sampling/importance_sampling_ratio/min": 2.2339422756090244e-08, "sampling/sampling_logp_difference/max": 2.1769466400146484, "sampling/sampling_logp_difference/mean": 0.3028973937034607, "step": 133, "step_time": 14.359608309983741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.135617807507515, "epoch": 0.00134, "grad_norm": 0.12628236413002014, "kl": 0.44519588351249695, "learning_rate": 7.999996444218262e-06, "loss": -0.0361, "step": 134, "step_time": 8.028495777049102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.488634668290615, "epoch": 0.00135, "frac_reward_zero_std": 0.0, "grad_norm": 0.11046203225851059, "kl": 0.5963750183582306, "learning_rate": 7.999996371281063e-06, "loss": -0.0763, "num_tokens": 2814794.0, "reward": 0.5750497579574585, "reward_std": 0.645440399646759, "rewards/rollout_reward_func/mean": 0.5750497579574585, "rewards/rollout_reward_func/std": 0.6454403400421143, "sampling/importance_sampling_ratio/max": 1.466784954071045, "sampling/importance_sampling_ratio/mean": 0.8182400465011597, "sampling/importance_sampling_ratio/min": 2.9789730682239224e-09, "sampling/sampling_logp_difference/max": 2.412929058074951, "sampling/sampling_logp_difference/mean": 0.47621962428092957, "step": 135, "step_time": 14.66744990303414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4827366396784782, "epoch": 0.00136, "grad_norm": 0.10670299828052521, "kl": 0.5911029949784279, "learning_rate": 7.999996297603385e-06, "loss": -0.0762, "step": 136, "step_time": 8.526022227975773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 3.8888888359069824, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4344526827335358, "epoch": 0.00137, "frac_reward_zero_std": 0.0, "grad_norm": 0.1323365867137909, "kl": 0.8990068323910236, "learning_rate": 7.999996223185228e-06, "loss": -0.048, "num_tokens": 2854650.0, "reward": 0.5904149413108826, "reward_std": 0.7810651063919067, "rewards/rollout_reward_func/mean": 0.5904149413108826, "rewards/rollout_reward_func/std": 0.7810651063919067, "sampling/importance_sampling_ratio/max": 1.6008822917938232, "sampling/importance_sampling_ratio/mean": 0.8506937026977539, "sampling/importance_sampling_ratio/min": 2.041454116863406e-09, "sampling/sampling_logp_difference/max": 2.0895392894744873, "sampling/sampling_logp_difference/mean": 0.4556076228618622, "step": 137, "step_time": 15.23490266202134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4286552891135216, "epoch": 0.00138, "grad_norm": 0.1291530430316925, "kl": 0.875928308814764, "learning_rate": 7.999996148026594e-06, "loss": -0.0483, "step": 138, "step_time": 8.080002330010757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 3.230769395828247, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4255988039076328, "epoch": 0.00139, "frac_reward_zero_std": 0.0, "grad_norm": 0.2706155776977539, "kl": 0.679955817759037, "learning_rate": 7.99999607212748e-06, "loss": -0.0394, "num_tokens": 2898339.0, "reward": 0.6656249761581421, "reward_std": 0.696354329586029, "rewards/rollout_reward_func/mean": 0.6656249761581421, "rewards/rollout_reward_func/std": 0.6963542699813843, "sampling/importance_sampling_ratio/max": 1.7452843189239502, "sampling/importance_sampling_ratio/mean": 0.8430901765823364, "sampling/importance_sampling_ratio/min": 1.6123777868415345e-07, "sampling/sampling_logp_difference/max": 3.0266666412353516, "sampling/sampling_logp_difference/mean": 0.4548715353012085, "step": 139, "step_time": 14.727182065980742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.4290200471878052, "epoch": 0.0014, "grad_norm": 0.10349943488836288, "kl": 0.6376087740063667, "learning_rate": 7.999995995487888e-06, "loss": -0.0398, "step": 140, "step_time": 8.238597374991514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 3.9285717010498047, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3306762501597404, "epoch": 0.00141, "frac_reward_zero_std": 0.0, "grad_norm": 0.09156066179275513, "kl": 0.905798114836216, "learning_rate": 7.999995918107818e-06, "loss": -0.0958, "num_tokens": 2938027.0, "reward": 0.8601333498954773, "reward_std": 0.8077284097671509, "rewards/rollout_reward_func/mean": 0.8601333498954773, "rewards/rollout_reward_func/std": 0.8077284097671509, "sampling/importance_sampling_ratio/max": 1.9285662174224854, "sampling/importance_sampling_ratio/mean": 0.9732481241226196, "sampling/importance_sampling_ratio/min": 4.1569968800558854e-08, "sampling/sampling_logp_difference/max": 2.1098103523254395, "sampling/sampling_logp_difference/mean": 0.41615891456604004, "step": 141, "step_time": 14.892389869026374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3340443577617407, "epoch": 0.00142, "grad_norm": 0.09341869503259659, "kl": 0.8985725566744804, "learning_rate": 7.999995839987269e-06, "loss": -0.0958, "step": 142, "step_time": 9.14658621198032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 4.208333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6936638876795769, "epoch": 0.00143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1805979311466217, "kl": 0.33557638712227345, "learning_rate": 7.999995761126243e-06, "loss": -0.0864, "num_tokens": 2977858.0, "reward": 0.5895398259162903, "reward_std": 0.917009711265564, "rewards/rollout_reward_func/mean": 0.5895398259162903, "rewards/rollout_reward_func/std": 0.917009711265564, "sampling/importance_sampling_ratio/max": 1.757750391960144, "sampling/importance_sampling_ratio/mean": 0.8113835453987122, "sampling/importance_sampling_ratio/min": 3.785209628404118e-06, "sampling/sampling_logp_difference/max": 2.2542724609375, "sampling/sampling_logp_difference/mean": 0.3731710910797119, "step": 143, "step_time": 15.158430036040954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.6893120966851711, "epoch": 0.00144, "grad_norm": 0.11818724870681763, "kl": 0.3475538454949856, "learning_rate": 7.999995681524736e-06, "loss": -0.087, "step": 144, "step_time": 8.109428770985687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.59375, "completions/mean_terminated_length": 3.4137930870056152, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0533462390303612, "epoch": 0.00145, "frac_reward_zero_std": 0.25, "grad_norm": 0.09449347853660583, "kl": 0.5539789013564587, "learning_rate": 7.999995601182752e-06, "loss": -0.0419, "num_tokens": 3020935.0, "reward": 0.71875, "reward_std": 0.671271026134491, "rewards/rollout_reward_func/mean": 0.71875, "rewards/rollout_reward_func/std": 0.671271026134491, "sampling/importance_sampling_ratio/max": 1.4139301776885986, "sampling/importance_sampling_ratio/mean": 0.8904962539672852, "sampling/importance_sampling_ratio/min": 6.048524181512249e-11, "sampling/sampling_logp_difference/max": 2.3408396244049072, "sampling/sampling_logp_difference/mean": 0.3918311595916748, "step": 145, "step_time": 14.57470079199993 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.0531968101859093, "epoch": 0.00146, "grad_norm": 0.08502373099327087, "kl": 0.590666152536869, "learning_rate": 7.999995520100289e-06, "loss": -0.042, "step": 146, "step_time": 8.823314962995937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 4.46875, "completions/mean_terminated_length": 4.096774101257324, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0276574827730656, "epoch": 0.00147, "frac_reward_zero_std": 0.0, "grad_norm": 0.2840912640094757, "kl": 0.6115671619772911, "learning_rate": 7.999995438277348e-06, "loss": -0.0605, "num_tokens": 3064334.0, "reward": 0.6610008478164673, "reward_std": 0.5104764103889465, "rewards/rollout_reward_func/mean": 0.6610008478164673, "rewards/rollout_reward_func/std": 0.5104764103889465, "sampling/importance_sampling_ratio/max": 1.5717085599899292, "sampling/importance_sampling_ratio/mean": 0.9239627122879028, "sampling/importance_sampling_ratio/min": 2.9816339520039037e-05, "sampling/sampling_logp_difference/max": 1.9616060256958008, "sampling/sampling_logp_difference/mean": 0.3006832003593445, "step": 147, "step_time": 15.334119097009534 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.0148780830204487, "epoch": 0.00148, "grad_norm": 0.26465287804603577, "kl": 0.6158715300261974, "learning_rate": 7.99999535571393e-06, "loss": -0.0609, "step": 148, "step_time": 8.160928254015744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.068965435028076, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9929729215800762, "epoch": 0.00149, "frac_reward_zero_std": 0.25, "grad_norm": 0.5083588361740112, "kl": 0.5516847968101501, "learning_rate": 7.999995272410032e-06, "loss": -0.0195, "num_tokens": 3106482.0, "reward": 0.46024227142333984, "reward_std": 0.685075581073761, "rewards/rollout_reward_func/mean": 0.46024227142333984, "rewards/rollout_reward_func/std": 0.6850755214691162, "sampling/importance_sampling_ratio/max": 2.5495054721832275, "sampling/importance_sampling_ratio/mean": 1.0001524686813354, "sampling/importance_sampling_ratio/min": 3.521189455568674e-07, "sampling/sampling_logp_difference/max": 1.759295105934143, "sampling/sampling_logp_difference/mean": 0.33431780338287354, "step": 149, "step_time": 14.477609624998877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.9861800540238619, "epoch": 0.0015, "grad_norm": 0.07800456136465073, "kl": 0.5519616939127445, "learning_rate": 7.999995188365656e-06, "loss": -0.0223, "step": 150, "step_time": 8.17981991203851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.148148059844971, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.874490387737751, "epoch": 0.00151, "frac_reward_zero_std": 0.25, "grad_norm": 0.12978781759738922, "kl": 0.4261668622493744, "learning_rate": 7.999995103580802e-06, "loss": -0.0553, "num_tokens": 3149814.0, "reward": 0.627124011516571, "reward_std": 0.7040860056877136, "rewards/rollout_reward_func/mean": 0.627124011516571, "rewards/rollout_reward_func/std": 0.7040860056877136, "sampling/importance_sampling_ratio/max": 1.487300157546997, "sampling/importance_sampling_ratio/mean": 0.8914406299591064, "sampling/importance_sampling_ratio/min": 2.410309625500151e-11, "sampling/sampling_logp_difference/max": 2.5417110919952393, "sampling/sampling_logp_difference/mean": 0.5822088122367859, "step": 151, "step_time": 14.841746472025989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8706854525953531, "epoch": 0.00152, "grad_norm": 0.15001769363880157, "kl": 0.4266779460012913, "learning_rate": 7.99999501805547e-06, "loss": -0.0551, "step": 152, "step_time": 8.5785439160245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 4.90625, "completions/mean_terminated_length": 3.7586207389831543, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.819558521732688, "epoch": 0.00153, "frac_reward_zero_std": 0.25, "grad_norm": 0.0721978023648262, "kl": 0.3611396551132202, "learning_rate": 7.99999493178966e-06, "loss": -0.0617, "num_tokens": 3188587.0, "reward": 0.7350417971611023, "reward_std": 0.7230128645896912, "rewards/rollout_reward_func/mean": 0.7350417971611023, "rewards/rollout_reward_func/std": 0.7230128645896912, "sampling/importance_sampling_ratio/max": 1.355345606803894, "sampling/importance_sampling_ratio/mean": 1.0052887201309204, "sampling/importance_sampling_ratio/min": 0.00010300886060576886, "sampling/sampling_logp_difference/max": 1.7010247707366943, "sampling/sampling_logp_difference/mean": 0.24756616353988647, "step": 153, "step_time": 14.230266572994879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.8158280793577433, "epoch": 0.00154, "grad_norm": 0.08162181079387665, "kl": 0.35777774453163147, "learning_rate": 7.99999484478337e-06, "loss": -0.0619, "step": 154, "step_time": 8.072327464033151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.96875, "completions/mean_terminated_length": 3.392857313156128, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3802735470235348, "epoch": 0.00155, "frac_reward_zero_std": 0.0, "grad_norm": 0.08544214069843292, "kl": 0.8502992875874043, "learning_rate": 7.999994757036603e-06, "loss": -0.0521, "num_tokens": 3231808.0, "reward": 0.7091745734214783, "reward_std": 0.6000349521636963, "rewards/rollout_reward_func/mean": 0.7091745734214783, "rewards/rollout_reward_func/std": 0.6000349521636963, "sampling/importance_sampling_ratio/max": 1.639162302017212, "sampling/importance_sampling_ratio/mean": 0.9189826846122742, "sampling/importance_sampling_ratio/min": 1.9680435059399315e-08, "sampling/sampling_logp_difference/max": 1.8871164321899414, "sampling/sampling_logp_difference/mean": 0.4649440050125122, "step": 155, "step_time": 14.442206220992375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.383083390071988, "epoch": 0.00156, "grad_norm": 0.0850856676697731, "kl": 0.910453237593174, "learning_rate": 7.999994668549356e-06, "loss": -0.0518, "step": 156, "step_time": 8.53357606305508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.03125, "completions/mean_terminated_length": 3.8965516090393066, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9221864342689514, "epoch": 0.00157, "frac_reward_zero_std": 0.25, "grad_norm": 0.15762104094028473, "kl": 0.3555282484740019, "learning_rate": 7.999994579321633e-06, "loss": -0.0418, "num_tokens": 3272609.0, "reward": 0.601702094078064, "reward_std": 0.7859078049659729, "rewards/rollout_reward_func/mean": 0.601702094078064, "rewards/rollout_reward_func/std": 0.7859078049659729, "sampling/importance_sampling_ratio/max": 1.8173155784606934, "sampling/importance_sampling_ratio/mean": 1.0097461938858032, "sampling/importance_sampling_ratio/min": 2.666583714017179e-05, "sampling/sampling_logp_difference/max": 1.836301565170288, "sampling/sampling_logp_difference/mean": 0.27701815962791443, "step": 157, "step_time": 15.106385324994335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.918281476944685, "epoch": 0.00158, "grad_norm": 0.16635262966156006, "kl": 0.35419031977653503, "learning_rate": 7.999994489353432e-06, "loss": -0.042, "step": 158, "step_time": 8.099141040001996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 4.21875, "completions/mean_terminated_length": 4.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8832707237452269, "epoch": 0.00159, "frac_reward_zero_std": 0.0, "grad_norm": 0.08675035089254379, "kl": 1.0348438993096352, "learning_rate": 7.999994398644752e-06, "loss": -0.0679, "num_tokens": 3315892.0, "reward": 0.8742594718933105, "reward_std": 0.17846234142780304, "rewards/rollout_reward_func/mean": 0.8742594718933105, "rewards/rollout_reward_func/std": 0.17846232652664185, "sampling/importance_sampling_ratio/max": 1.5670294761657715, "sampling/importance_sampling_ratio/mean": 1.0042023658752441, "sampling/importance_sampling_ratio/min": 0.0002905070432461798, "sampling/sampling_logp_difference/max": 2.138784885406494, "sampling/sampling_logp_difference/mean": 0.26364150643348694, "step": 159, "step_time": 14.385206491017016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8806754928082228, "epoch": 0.0016, "grad_norm": 0.08165177702903748, "kl": 1.1031698882579803, "learning_rate": 7.999994307195594e-06, "loss": -0.0682, "step": 160, "step_time": 8.143007294012932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6940741743892431, "epoch": 0.00161, "frac_reward_zero_std": 0.0, "grad_norm": 0.1342412382364273, "kl": 0.6545476093888283, "learning_rate": 7.99999421500596e-06, "loss": -0.091, "num_tokens": 3357564.0, "reward": 0.4600573778152466, "reward_std": 0.7327843308448792, "rewards/rollout_reward_func/mean": 0.4600573778152466, "rewards/rollout_reward_func/std": 0.7327843308448792, "sampling/importance_sampling_ratio/max": 1.7392685413360596, "sampling/importance_sampling_ratio/mean": 0.8434393405914307, "sampling/importance_sampling_ratio/min": 8.23319794562849e-07, "sampling/sampling_logp_difference/max": 2.2490105628967285, "sampling/sampling_logp_difference/mean": 0.462942510843277, "step": 161, "step_time": 15.09762946300907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6907487362623215, "epoch": 0.00162, "grad_norm": 0.13561096787452698, "kl": 0.672361334785819, "learning_rate": 7.999994122075845e-06, "loss": -0.0914, "step": 162, "step_time": 8.744009214016842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.310344696044922, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4619414806365967, "epoch": 0.00163, "frac_reward_zero_std": 0.25, "grad_norm": 0.06810848414897919, "kl": 0.562431450933218, "learning_rate": 7.999994028405253e-06, "loss": -0.0504, "num_tokens": 3399596.0, "reward": 0.5005289316177368, "reward_std": 0.7373226284980774, "rewards/rollout_reward_func/mean": 0.5005289316177368, "rewards/rollout_reward_func/std": 0.7373226284980774, "sampling/importance_sampling_ratio/max": 2.581294298171997, "sampling/importance_sampling_ratio/mean": 0.97765052318573, "sampling/importance_sampling_ratio/min": 2.5802351046877448e-06, "sampling/sampling_logp_difference/max": 2.1515674591064453, "sampling/sampling_logp_difference/mean": 0.40152794122695923, "step": 163, "step_time": 14.748575075005647 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.4631823264062405, "epoch": 0.00164, "grad_norm": 0.06701356172561646, "kl": 0.5546810142695904, "learning_rate": 7.999993933994183e-06, "loss": -0.0504, "step": 164, "step_time": 8.093722360994434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.03125, "completions/mean_terminated_length": 3.8965516090393066, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2473113182932138, "epoch": 0.00165, "frac_reward_zero_std": 0.0, "grad_norm": 0.15594612061977386, "kl": 1.4655470177531242, "learning_rate": 7.999993838842636e-06, "loss": -0.0766, "num_tokens": 3441851.0, "reward": 0.5079737901687622, "reward_std": 0.6877462863922119, "rewards/rollout_reward_func/mean": 0.5079737901687622, "rewards/rollout_reward_func/std": 0.6877462863922119, "sampling/importance_sampling_ratio/max": 2.160438299179077, "sampling/importance_sampling_ratio/mean": 0.9018545746803284, "sampling/importance_sampling_ratio/min": 1.9501150916312326e-08, "sampling/sampling_logp_difference/max": 2.7937164306640625, "sampling/sampling_logp_difference/mean": 0.4298110604286194, "step": 165, "step_time": 14.444396371021867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.244348768144846, "epoch": 0.00166, "grad_norm": 0.14113228023052216, "kl": 1.3427977375686169, "learning_rate": 7.99999374295061e-06, "loss": -0.0767, "step": 166, "step_time": 8.581377365044318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 3.7931034564971924, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9871550966054201, "epoch": 0.00167, "frac_reward_zero_std": 0.0, "grad_norm": 0.15749244391918182, "kl": 0.6463546268641949, "learning_rate": 7.999993646318106e-06, "loss": -0.0448, "num_tokens": 3484948.0, "reward": 0.5918570756912231, "reward_std": 0.688079833984375, "rewards/rollout_reward_func/mean": 0.5918570756912231, "rewards/rollout_reward_func/std": 0.6880798935890198, "sampling/importance_sampling_ratio/max": 1.626670479774475, "sampling/importance_sampling_ratio/mean": 0.8746570348739624, "sampling/importance_sampling_ratio/min": 6.967474597274759e-09, "sampling/sampling_logp_difference/max": 2.1803083419799805, "sampling/sampling_logp_difference/mean": 0.4089457392692566, "step": 167, "step_time": 15.077676860935753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9996500350534916, "epoch": 0.00168, "grad_norm": 0.1657840758562088, "kl": 0.5836240909993649, "learning_rate": 7.999993548945123e-06, "loss": -0.0452, "step": 168, "step_time": 8.156510337983491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.079999923706055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0896055214107037, "epoch": 0.00169, "frac_reward_zero_std": 0.0, "grad_norm": 0.1423807144165039, "kl": 0.42713775113224983, "learning_rate": 7.999993450831664e-06, "loss": -0.0735, "num_tokens": 3528159.0, "reward": 0.42835208773612976, "reward_std": 0.7792192101478577, "rewards/rollout_reward_func/mean": 0.42835208773612976, "rewards/rollout_reward_func/std": 0.7792191505432129, "sampling/importance_sampling_ratio/max": 1.8205626010894775, "sampling/importance_sampling_ratio/mean": 0.7964118123054504, "sampling/importance_sampling_ratio/min": 2.4498350015278447e-08, "sampling/sampling_logp_difference/max": 2.0282857418060303, "sampling/sampling_logp_difference/mean": 0.5127427577972412, "step": 169, "step_time": 14.790648563939612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.095659766346216, "epoch": 0.0017, "grad_norm": 0.15635834634304047, "kl": 0.4048808366060257, "learning_rate": 7.999993351977727e-06, "loss": -0.074, "step": 170, "step_time": 8.138844993984094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.866666793823242, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0763516277074814, "epoch": 0.00171, "frac_reward_zero_std": 0.0, "grad_norm": 0.07285112887620926, "kl": 0.7995843179523945, "learning_rate": 7.999993252383311e-06, "loss": -0.0765, "num_tokens": 3567946.0, "reward": 0.498465359210968, "reward_std": 0.7606447339057922, "rewards/rollout_reward_func/mean": 0.498465359210968, "rewards/rollout_reward_func/std": 0.7606447339057922, "sampling/importance_sampling_ratio/max": 1.9827159643173218, "sampling/importance_sampling_ratio/mean": 0.8403235673904419, "sampling/importance_sampling_ratio/min": 1.5938347530664032e-07, "sampling/sampling_logp_difference/max": 2.0887961387634277, "sampling/sampling_logp_difference/mean": 0.3770010769367218, "step": 171, "step_time": 14.867010792921064 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.0834794156253338, "epoch": 0.00172, "grad_norm": 0.06991534680128098, "kl": 0.7237817198038101, "learning_rate": 7.999993152048418e-06, "loss": -0.0766, "step": 172, "step_time": 8.528742027032422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 4.851851940155029, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9685327038168907, "epoch": 0.00173, "frac_reward_zero_std": 0.0, "grad_norm": 0.24798855185508728, "kl": 0.30279340222477913, "learning_rate": 7.999993050973047e-06, "loss": -0.0727, "num_tokens": 3607850.0, "reward": 0.2611916661262512, "reward_std": 0.8421519994735718, "rewards/rollout_reward_func/mean": 0.2611916661262512, "rewards/rollout_reward_func/std": 0.842151939868927, "sampling/importance_sampling_ratio/max": 1.4531766176223755, "sampling/importance_sampling_ratio/mean": 0.8694557547569275, "sampling/importance_sampling_ratio/min": 3.4138532001293243e-09, "sampling/sampling_logp_difference/max": 2.107304811477661, "sampling/sampling_logp_difference/mean": 0.48366737365722656, "step": 173, "step_time": 14.203161837998778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.9749648924916983, "epoch": 0.00174, "grad_norm": 0.17115406692028046, "kl": 0.30347679927945137, "learning_rate": 7.999992949157197e-06, "loss": -0.0732, "step": 174, "step_time": 7.831925186008448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 4.8125, "completions/mean_terminated_length": 4.451612949371338, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1598474644124508, "epoch": 0.00175, "frac_reward_zero_std": 0.0, "grad_norm": 0.08235158026218414, "kl": 0.4278879761695862, "learning_rate": 7.999992846600872e-06, "loss": -0.0761, "num_tokens": 3649994.0, "reward": 0.9530986547470093, "reward_std": 0.4865633249282837, "rewards/rollout_reward_func/mean": 0.9530986547470093, "rewards/rollout_reward_func/std": 0.4865633249282837, "sampling/importance_sampling_ratio/max": 1.3997776508331299, "sampling/importance_sampling_ratio/mean": 0.9652689695358276, "sampling/importance_sampling_ratio/min": 5.920566259476345e-09, "sampling/sampling_logp_difference/max": 2.115445613861084, "sampling/sampling_logp_difference/mean": 0.39004623889923096, "step": 175, "step_time": 14.310941435018321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1642339117825031, "epoch": 0.00176, "grad_norm": 0.08074251562356949, "kl": 0.4278861694037914, "learning_rate": 7.999992743304069e-06, "loss": -0.0761, "step": 176, "step_time": 8.603515831986442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.655172348022461, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.188732285052538, "epoch": 0.00177, "frac_reward_zero_std": 0.0, "grad_norm": 0.15081046521663666, "kl": 0.9604708775877953, "learning_rate": 7.999992639266786e-06, "loss": -0.0717, "num_tokens": 3691029.0, "reward": 0.6540266275405884, "reward_std": 0.7778489589691162, "rewards/rollout_reward_func/mean": 0.6540266275405884, "rewards/rollout_reward_func/std": 0.7778488993644714, "sampling/importance_sampling_ratio/max": 1.6337190866470337, "sampling/importance_sampling_ratio/mean": 0.8522112369537354, "sampling/importance_sampling_ratio/min": 1.9103276827081572e-06, "sampling/sampling_logp_difference/max": 2.141011953353882, "sampling/sampling_logp_difference/mean": 0.35054951906204224, "step": 177, "step_time": 15.118461925943848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.1909306943416595, "epoch": 0.00178, "grad_norm": 0.129087895154953, "kl": 0.9017064124345779, "learning_rate": 7.999992534489026e-06, "loss": -0.0722, "step": 178, "step_time": 8.127078801975586 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.625, "completions/mean_terminated_length": 4.258064270019531, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9533974342048168, "epoch": 0.00179, "frac_reward_zero_std": 0.0, "grad_norm": 0.096279077231884, "kl": 0.47148578613996506, "learning_rate": 7.99999242897079e-06, "loss": -0.0423, "num_tokens": 3732249.0, "reward": 0.8339288234710693, "reward_std": 0.5613985061645508, "rewards/rollout_reward_func/mean": 0.8339288234710693, "rewards/rollout_reward_func/std": 0.5613985061645508, "sampling/importance_sampling_ratio/max": 1.7261193990707397, "sampling/importance_sampling_ratio/mean": 0.9941054582595825, "sampling/importance_sampling_ratio/min": 1.7025713532348163e-06, "sampling/sampling_logp_difference/max": 1.8692939281463623, "sampling/sampling_logp_difference/mean": 0.31391507387161255, "step": 179, "step_time": 14.53132671394269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9512906856834888, "epoch": 0.0018, "grad_norm": 0.09210679680109024, "kl": 0.4523421861231327, "learning_rate": 7.999992322712075e-06, "loss": -0.0424, "step": 180, "step_time": 8.712569379043998 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 4.9259257316589355, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.388796627521515, "epoch": 0.00181, "frac_reward_zero_std": 0.0, "grad_norm": 0.06412876397371292, "kl": 0.8159867264330387, "learning_rate": 7.999992215712882e-06, "loss": -0.0789, "num_tokens": 3773136.0, "reward": 0.45876723527908325, "reward_std": 0.8182920813560486, "rewards/rollout_reward_func/mean": 0.45876723527908325, "rewards/rollout_reward_func/std": 0.8182920217514038, "sampling/importance_sampling_ratio/max": 1.34908926486969, "sampling/importance_sampling_ratio/mean": 0.6882205009460449, "sampling/importance_sampling_ratio/min": 5.309153117138976e-10, "sampling/sampling_logp_difference/max": 2.146172523498535, "sampling/sampling_logp_difference/mean": 0.5772531032562256, "step": 181, "step_time": 14.874787550012115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.390936493873596, "epoch": 0.00182, "grad_norm": 0.06286820024251938, "kl": 0.8690695241093636, "learning_rate": 7.999992107973214e-06, "loss": -0.079, "step": 182, "step_time": 8.86133627500385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 3.5357143878936768, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.109285457059741, "epoch": 0.00183, "frac_reward_zero_std": 0.25, "grad_norm": 0.061068031936883926, "kl": 0.5167804248631, "learning_rate": 7.999991999493065e-06, "loss": -0.0421, "num_tokens": 3814641.0, "reward": 0.7452133893966675, "reward_std": 0.5874180793762207, "rewards/rollout_reward_func/mean": 0.7452133893966675, "rewards/rollout_reward_func/std": 0.5874180197715759, "sampling/importance_sampling_ratio/max": 1.419661045074463, "sampling/importance_sampling_ratio/mean": 1.0003719329833984, "sampling/importance_sampling_ratio/min": 1.0563632635296472e-08, "sampling/sampling_logp_difference/max": 2.158566474914551, "sampling/sampling_logp_difference/mean": 0.3725590109825134, "step": 183, "step_time": 14.462770389916841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1079430505633354, "epoch": 0.00184, "grad_norm": 0.06119164079427719, "kl": 0.5072009637951851, "learning_rate": 7.999991890272441e-06, "loss": -0.0422, "step": 184, "step_time": 8.098810632014647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.8125, "completions/mean_terminated_length": 4.451612949371338, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2793487589806318, "epoch": 0.00185, "frac_reward_zero_std": 0.0, "grad_norm": 0.14978265762329102, "kl": 1.5572959631681442, "learning_rate": 7.999991780311339e-06, "loss": -0.0601, "num_tokens": 3858194.0, "reward": 0.7184388637542725, "reward_std": 0.5322840809822083, "rewards/rollout_reward_func/mean": 0.7184388637542725, "rewards/rollout_reward_func/std": 0.5322840809822083, "sampling/importance_sampling_ratio/max": 1.4176321029663086, "sampling/importance_sampling_ratio/mean": 0.7406907081604004, "sampling/importance_sampling_ratio/min": 3.2346181342290947e-06, "sampling/sampling_logp_difference/max": 2.1315159797668457, "sampling/sampling_logp_difference/mean": 0.38452744483947754, "step": 185, "step_time": 14.921928501018556 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.2788913734257221, "epoch": 0.00186, "grad_norm": 0.13526757061481476, "kl": 1.4363190792500973, "learning_rate": 7.999991669609758e-06, "loss": -0.0602, "step": 186, "step_time": 8.15577330708038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6512824669480324, "epoch": 0.00187, "frac_reward_zero_std": 0.0, "grad_norm": 0.20355510711669922, "kl": 0.7046094462275505, "learning_rate": 7.999991558167702e-06, "loss": -0.0294, "num_tokens": 3901536.0, "reward": 0.3648042678833008, "reward_std": 0.8152311444282532, "rewards/rollout_reward_func/mean": 0.3648042678833008, "rewards/rollout_reward_func/std": 0.8152311444282532, "sampling/importance_sampling_ratio/max": 1.5520808696746826, "sampling/importance_sampling_ratio/mean": 0.7728428840637207, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9121041297912598, "sampling/sampling_logp_difference/mean": 0.3760719895362854, "step": 187, "step_time": 15.485427834035363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6621218398213387, "epoch": 0.00188, "grad_norm": 0.2208823561668396, "kl": 0.678126884624362, "learning_rate": 7.999991445985168e-06, "loss": -0.0297, "step": 188, "step_time": 8.13157475501066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 3.8709676265716553, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8346669562160969, "epoch": 0.00189, "frac_reward_zero_std": 0.25, "grad_norm": 0.10223013162612915, "kl": 0.3922128174453974, "learning_rate": 7.999991333062156e-06, "loss": -0.0496, "num_tokens": 3943062.0, "reward": 0.9249769449234009, "reward_std": 0.5260776877403259, "rewards/rollout_reward_func/mean": 0.9249769449234009, "rewards/rollout_reward_func/std": 0.5260776281356812, "sampling/importance_sampling_ratio/max": 1.4838988780975342, "sampling/importance_sampling_ratio/mean": 0.9901970028877258, "sampling/importance_sampling_ratio/min": 2.4213988581323065e-05, "sampling/sampling_logp_difference/max": 1.8267102241516113, "sampling/sampling_logp_difference/mean": 0.22731371223926544, "step": 189, "step_time": 14.341156832961133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8397820889949799, "epoch": 0.0019, "grad_norm": 0.10237892717123032, "kl": 0.3825669176876545, "learning_rate": 7.999991219398665e-06, "loss": -0.0499, "step": 190, "step_time": 8.476828379003564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 4.136363983154297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5845545530319214, "epoch": 0.00191, "frac_reward_zero_std": 0.0, "grad_norm": 0.11755941063165665, "kl": 1.1843429245054722, "learning_rate": 7.999991104994699e-06, "loss": -0.052, "num_tokens": 3986398.0, "reward": 0.3863216042518616, "reward_std": 0.831038773059845, "rewards/rollout_reward_func/mean": 0.3863216042518616, "rewards/rollout_reward_func/std": 0.8310387134552002, "sampling/importance_sampling_ratio/max": 1.4136793613433838, "sampling/importance_sampling_ratio/mean": 0.5953716039657593, "sampling/importance_sampling_ratio/min": 4.4089478734576915e-09, "sampling/sampling_logp_difference/max": 2.058715581893921, "sampling/sampling_logp_difference/mean": 0.6333802938461304, "step": 191, "step_time": 15.784175043954747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5912893749773502, "epoch": 0.00192, "grad_norm": 0.10970956087112427, "kl": 1.0718143368139863, "learning_rate": 7.999990989850255e-06, "loss": -0.0522, "step": 192, "step_time": 8.191781967994757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8714734800159931, "epoch": 0.00193, "frac_reward_zero_std": 0.0, "grad_norm": 0.11856076121330261, "kl": 0.35427953163161874, "learning_rate": 7.999990873965335e-06, "loss": -0.0602, "num_tokens": 4030240.0, "reward": 0.5264864563941956, "reward_std": 0.9012279510498047, "rewards/rollout_reward_func/mean": 0.5264864563941956, "rewards/rollout_reward_func/std": 0.9012279510498047, "sampling/importance_sampling_ratio/max": 1.4741240739822388, "sampling/importance_sampling_ratio/mean": 0.7054895162582397, "sampling/importance_sampling_ratio/min": 2.9190712780291506e-07, "sampling/sampling_logp_difference/max": 2.1631767749786377, "sampling/sampling_logp_difference/mean": 0.3565176725387573, "step": 193, "step_time": 20.812589456007117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.874484047293663, "epoch": 0.00194, "grad_norm": 0.1200801432132721, "kl": 0.3574006771668792, "learning_rate": 7.999990757339936e-06, "loss": -0.0606, "step": 194, "step_time": 11.483120549033629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 5.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.537628397345543, "epoch": 0.00195, "frac_reward_zero_std": 0.0, "grad_norm": 0.17628593742847443, "kl": 0.37895884923636913, "learning_rate": 7.99999063997406e-06, "loss": -0.0695, "num_tokens": 4082493.0, "reward": 0.18851470947265625, "reward_std": 0.6668851375579834, "rewards/rollout_reward_func/mean": 0.18851470947265625, "rewards/rollout_reward_func/std": 0.6668851971626282, "sampling/importance_sampling_ratio/max": 2.1350455284118652, "sampling/importance_sampling_ratio/mean": 0.5792642831802368, "sampling/importance_sampling_ratio/min": 7.82162715040613e-07, "sampling/sampling_logp_difference/max": 2.49822735786438, "sampling/sampling_logp_difference/mean": 0.4852772355079651, "step": 195, "step_time": 21.396368975023506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.531055733561516, "epoch": 0.00196, "grad_norm": 0.16919104754924774, "kl": 0.3687539743259549, "learning_rate": 7.999990521867708e-06, "loss": -0.0698, "step": 196, "step_time": 11.400838268978987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 6.119999885559082, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4176764488220215, "epoch": 0.00197, "frac_reward_zero_std": 0.0, "grad_norm": 0.37788712978363037, "kl": 0.12962104100733995, "learning_rate": 7.999990403020877e-06, "loss": -0.0486, "num_tokens": 4137074.0, "reward": 0.03169689327478409, "reward_std": 0.5348052382469177, "rewards/rollout_reward_func/mean": 0.03169689327478409, "rewards/rollout_reward_func/std": 0.5348052382469177, "sampling/importance_sampling_ratio/max": 1.9515732526779175, "sampling/importance_sampling_ratio/mean": 0.5755534172058105, "sampling/importance_sampling_ratio/min": 1.2144177219397534e-07, "sampling/sampling_logp_difference/max": 2.523240327835083, "sampling/sampling_logp_difference/mean": 0.4444100856781006, "step": 197, "step_time": 20.338135177997174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01711309584788978, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01711309584788978, "entropy": 2.403105102479458, "epoch": 0.00198, "grad_norm": 0.3155667185783386, "kl": 0.14762688847258687, "learning_rate": 7.99999028343357e-06, "loss": -0.0515, "step": 198, "step_time": 10.645276366005419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 5.892857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3498315289616585, "epoch": 0.00199, "frac_reward_zero_std": 0.0, "grad_norm": 0.2501804232597351, "kl": 0.37521763145923615, "learning_rate": 7.999990163105786e-06, "loss": -0.0813, "num_tokens": 4195155.0, "reward": 0.35669487714767456, "reward_std": 0.605546772480011, "rewards/rollout_reward_func/mean": 0.35669487714767456, "rewards/rollout_reward_func/std": 0.605546772480011, "sampling/importance_sampling_ratio/max": 2.2122244834899902, "sampling/importance_sampling_ratio/mean": 0.6120926141738892, "sampling/importance_sampling_ratio/min": 5.5562537681908e-07, "sampling/sampling_logp_difference/max": 2.065056800842285, "sampling/sampling_logp_difference/mean": 0.48509377241134644, "step": 199, "step_time": 21.36553528899094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01533808489330113, "clip_ratio/low_min": 0.004999999888241291, "clip_ratio/region_mean": 0.01533808489330113, "entropy": 2.3186004012823105, "epoch": 0.002, "grad_norm": 0.21364262700080872, "kl": 0.44586005061864853, "learning_rate": 7.999990042037526e-06, "loss": -0.0834, "step": 200, "step_time": 11.83551408801577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7078791186213493, "epoch": 0.00201, "frac_reward_zero_std": 0.0, "grad_norm": 0.14339175820350647, "kl": 0.3634251970797777, "learning_rate": 7.999989920228787e-06, "loss": -0.0696, "num_tokens": 4246167.0, "reward": 0.664081335067749, "reward_std": 0.8386322259902954, "rewards/rollout_reward_func/mean": 0.664081335067749, "rewards/rollout_reward_func/std": 0.8386322855949402, "sampling/importance_sampling_ratio/max": 2.5320632457733154, "sampling/importance_sampling_ratio/mean": 0.9167893528938293, "sampling/importance_sampling_ratio/min": 4.6471544123960484e-07, "sampling/sampling_logp_difference/max": 2.366539478302002, "sampling/sampling_logp_difference/mean": 0.4078681766986847, "step": 201, "step_time": 21.756067170994356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011488970601931214, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011488970601931214, "entropy": 1.6659869328141212, "epoch": 0.00202, "grad_norm": 0.12960860133171082, "kl": 0.43866758793592453, "learning_rate": 7.999989797679573e-06, "loss": -0.0706, "step": 202, "step_time": 11.187102605035761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8339224010705948, "epoch": 0.00203, "frac_reward_zero_std": 0.0, "grad_norm": 0.1342564970254898, "kl": 0.7081996910274029, "learning_rate": 7.99998967438988e-06, "loss": -0.0948, "num_tokens": 4293972.0, "reward": 0.6523277759552002, "reward_std": 0.7703136205673218, "rewards/rollout_reward_func/mean": 0.6523277759552002, "rewards/rollout_reward_func/std": 0.7703136205673218, "sampling/importance_sampling_ratio/max": 1.4700876474380493, "sampling/importance_sampling_ratio/mean": 0.6781179308891296, "sampling/importance_sampling_ratio/min": 1.9579456420615315e-06, "sampling/sampling_logp_difference/max": 3.2441773414611816, "sampling/sampling_logp_difference/mean": 0.379302978515625, "step": 203, "step_time": 20.43262993596727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03733974462375045, "clip_ratio/low_min": 0.0062500000931322575, "clip_ratio/region_mean": 0.03733974462375045, "entropy": 1.8057384863495827, "epoch": 0.00204, "grad_norm": 0.1158197671175003, "kl": 0.9015690237283707, "learning_rate": 7.999989550359713e-06, "loss": -0.0956, "step": 204, "step_time": 11.135267635021592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.433333396911621, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8221497498452663, "epoch": 0.00205, "frac_reward_zero_std": 0.0, "grad_norm": 0.18870742619037628, "kl": 1.303007323294878, "learning_rate": 7.999989425589066e-06, "loss": -0.1046, "num_tokens": 4339583.0, "reward": 0.6094533801078796, "reward_std": 0.7613147497177124, "rewards/rollout_reward_func/mean": 0.6094533801078796, "rewards/rollout_reward_func/std": 0.7613147497177124, "sampling/importance_sampling_ratio/max": 2.850851535797119, "sampling/importance_sampling_ratio/mean": 1.110100507736206, "sampling/importance_sampling_ratio/min": 1.4913337508914992e-05, "sampling/sampling_logp_difference/max": 2.0614209175109863, "sampling/sampling_logp_difference/mean": 0.26007479429244995, "step": 205, "step_time": 19.797998304013163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009588068351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009588068351149559, "entropy": 0.7940205093473196, "epoch": 0.00206, "grad_norm": 0.15066702663898468, "kl": 1.556112539023161, "learning_rate": 7.999989300077943e-06, "loss": -0.1055, "step": 206, "step_time": 10.77401331398869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0301182437688112, "epoch": 0.00207, "frac_reward_zero_std": 0.0, "grad_norm": 0.10812947899103165, "kl": 1.05820694565773, "learning_rate": 7.999989173826344e-06, "loss": -0.0669, "num_tokens": 4384691.0, "reward": 0.3796602487564087, "reward_std": 0.9432556629180908, "rewards/rollout_reward_func/mean": 0.3796602487564087, "rewards/rollout_reward_func/std": 0.943255603313446, "sampling/importance_sampling_ratio/max": 1.391887903213501, "sampling/importance_sampling_ratio/mean": 0.815943717956543, "sampling/importance_sampling_ratio/min": 2.903351742133964e-05, "sampling/sampling_logp_difference/max": 2.862048864364624, "sampling/sampling_logp_difference/mean": 0.2500782310962677, "step": 207, "step_time": 20.804858453018824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013494318351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "entropy": 1.0167784597724676, "epoch": 0.00208, "grad_norm": 0.11995352804660797, "kl": 1.2100777253508568, "learning_rate": 7.999989046834267e-06, "loss": -0.0667, "step": 208, "step_time": 11.480724607972661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3534351959824562, "epoch": 0.00209, "frac_reward_zero_std": 0.0, "grad_norm": 0.24516406655311584, "kl": 1.8323676753789186, "learning_rate": 7.999988919101714e-06, "loss": -0.0422, "num_tokens": 4439197.0, "reward": 0.891753077507019, "reward_std": 0.575986921787262, "rewards/rollout_reward_func/mean": 0.891753077507019, "rewards/rollout_reward_func/std": 0.575986921787262, "sampling/importance_sampling_ratio/max": 1.7470715045928955, "sampling/importance_sampling_ratio/mean": 0.8171957731246948, "sampling/importance_sampling_ratio/min": 6.729992207965552e-08, "sampling/sampling_logp_difference/max": 4.2298173904418945, "sampling/sampling_logp_difference/mean": 0.412691593170166, "step": 209, "step_time": 21.529862311988836 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.3551388010382652, "epoch": 0.0021, "grad_norm": 0.23041003942489624, "kl": 1.7391307391226292, "learning_rate": 7.999988790628685e-06, "loss": -0.043, "step": 210, "step_time": 11.30438365592272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.320419866591692, "epoch": 0.00211, "frac_reward_zero_std": 0.0, "grad_norm": 0.2075623720884323, "kl": 1.5287430882453918, "learning_rate": 7.999988661415179e-06, "loss": -0.0845, "num_tokens": 4493101.0, "reward": 0.35642409324645996, "reward_std": 0.7042486071586609, "rewards/rollout_reward_func/mean": 0.35642409324645996, "rewards/rollout_reward_func/std": 0.7042486071586609, "sampling/importance_sampling_ratio/max": 1.5703147649765015, "sampling/importance_sampling_ratio/mean": 0.6405717134475708, "sampling/importance_sampling_ratio/min": 4.514382089837454e-05, "sampling/sampling_logp_difference/max": 3.1279568672180176, "sampling/sampling_logp_difference/mean": 0.3556724786758423, "step": 211, "step_time": 21.9889388330339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3201967142522335, "epoch": 0.00212, "grad_norm": 0.19530664384365082, "kl": 1.3704914897680283, "learning_rate": 7.999988531461196e-06, "loss": -0.0858, "step": 212, "step_time": 11.270936802990036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 4.65217399597168, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5356140546500683, "epoch": 0.00213, "frac_reward_zero_std": 0.0, "grad_norm": 0.12253482639789581, "kl": 1.1572722271084785, "learning_rate": 7.999988400766736e-06, "loss": -0.1053, "num_tokens": 4540167.0, "reward": 0.09252817183732986, "reward_std": 0.9957112073898315, "rewards/rollout_reward_func/mean": 0.09252817183732986, "rewards/rollout_reward_func/std": 0.9957111477851868, "sampling/importance_sampling_ratio/max": 1.5610538721084595, "sampling/importance_sampling_ratio/mean": 0.7419509887695312, "sampling/importance_sampling_ratio/min": 5.155037641202398e-08, "sampling/sampling_logp_difference/max": 2.505669116973877, "sampling/sampling_logp_difference/mean": 0.3089089095592499, "step": 213, "step_time": 20.840557639981853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5449655503034592, "epoch": 0.00214, "grad_norm": 0.11395777016878128, "kl": 1.0722524393349886, "learning_rate": 7.999988269331798e-06, "loss": -0.1057, "step": 214, "step_time": 11.435794850054663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2988173607736826, "epoch": 0.00215, "frac_reward_zero_std": 0.0, "grad_norm": 0.10525906085968018, "kl": 1.3132203184068203, "learning_rate": 7.999988137156384e-06, "loss": -0.0842, "num_tokens": 4586975.0, "reward": 0.44890156388282776, "reward_std": 0.8995989561080933, "rewards/rollout_reward_func/mean": 0.44890156388282776, "rewards/rollout_reward_func/std": 0.8995989561080933, "sampling/importance_sampling_ratio/max": 1.469281554222107, "sampling/importance_sampling_ratio/mean": 0.7493219375610352, "sampling/importance_sampling_ratio/min": 1.3684159512195038e-06, "sampling/sampling_logp_difference/max": 2.379436492919922, "sampling/sampling_logp_difference/mean": 0.3318488597869873, "step": 215, "step_time": 19.341107676009415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3069677352905273, "epoch": 0.00216, "grad_norm": 0.09034428000450134, "kl": 1.120198829099536, "learning_rate": 7.999988004240496e-06, "loss": -0.0846, "step": 216, "step_time": 11.04522098400048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.192307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1417550668120384, "epoch": 0.00217, "frac_reward_zero_std": 0.0, "grad_norm": 0.057711683213710785, "kl": 0.3396617854014039, "learning_rate": 7.99998787058413e-06, "loss": -0.0801, "num_tokens": 4638577.0, "reward": 0.5932643413543701, "reward_std": 0.7675012350082397, "rewards/rollout_reward_func/mean": 0.5932643413543701, "rewards/rollout_reward_func/std": 0.7675012350082397, "sampling/importance_sampling_ratio/max": 1.5625122785568237, "sampling/importance_sampling_ratio/mean": 0.967765212059021, "sampling/importance_sampling_ratio/min": 2.275223050673958e-05, "sampling/sampling_logp_difference/max": 1.8294613361358643, "sampling/sampling_logp_difference/mean": 0.22681975364685059, "step": 217, "step_time": 20.849954500910826 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 1.1511500477790833, "epoch": 0.00218, "grad_norm": 0.060297951102256775, "kl": 0.3328039487823844, "learning_rate": 7.999987736187286e-06, "loss": -0.0802, "step": 218, "step_time": 11.23675728001399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 3.846153974533081, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9681316763162613, "epoch": 0.00219, "frac_reward_zero_std": 0.0, "grad_norm": 0.11230981349945068, "kl": 0.25189758837223053, "learning_rate": 7.999987601049967e-06, "loss": -0.0883, "num_tokens": 4681117.0, "reward": 0.4168071448802948, "reward_std": 1.042080044746399, "rewards/rollout_reward_func/mean": 0.4168071448802948, "rewards/rollout_reward_func/std": 1.042080044746399, "sampling/importance_sampling_ratio/max": 1.6367242336273193, "sampling/importance_sampling_ratio/mean": 0.9341905117034912, "sampling/importance_sampling_ratio/min": 1.4519894648401532e-05, "sampling/sampling_logp_difference/max": 1.4338736534118652, "sampling/sampling_logp_difference/mean": 0.205253005027771, "step": 219, "step_time": 20.45960294996621 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 0.9727301727980375, "epoch": 0.0022, "grad_norm": 0.11022414267063141, "kl": 0.25458200369030237, "learning_rate": 7.99998746517217e-06, "loss": -0.0885, "step": 220, "step_time": 10.836374366015662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.884615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0103301405906677, "epoch": 0.00221, "frac_reward_zero_std": 0.0, "grad_norm": 0.07182927429676056, "kl": 0.5478602107614279, "learning_rate": 7.999987328553901e-06, "loss": -0.0825, "num_tokens": 4733953.0, "reward": 0.6525312662124634, "reward_std": 0.8783100843429565, "rewards/rollout_reward_func/mean": 0.6525312662124634, "rewards/rollout_reward_func/std": 0.8783100843429565, "sampling/importance_sampling_ratio/max": 1.59166419506073, "sampling/importance_sampling_ratio/mean": 0.828863263130188, "sampling/importance_sampling_ratio/min": 0.0019103396916761994, "sampling/sampling_logp_difference/max": 1.8355035781860352, "sampling/sampling_logp_difference/mean": 0.22866815328598022, "step": 221, "step_time": 21.307620963983936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0054266843944788, "epoch": 0.00222, "grad_norm": 0.07050245255231857, "kl": 0.5472721569240093, "learning_rate": 7.999987191195152e-06, "loss": -0.0826, "step": 222, "step_time": 10.588620323047508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.79853156208992, "epoch": 0.00223, "frac_reward_zero_std": 0.0, "grad_norm": 0.07060127705335617, "kl": 0.5267892982810736, "learning_rate": 7.999987053095927e-06, "loss": -0.0796, "num_tokens": 4786610.0, "reward": 0.14009851217269897, "reward_std": 0.7739333510398865, "rewards/rollout_reward_func/mean": 0.14009851217269897, "rewards/rollout_reward_func/std": 0.7739333510398865, "sampling/importance_sampling_ratio/max": 1.3743709325790405, "sampling/importance_sampling_ratio/mean": 0.6119423508644104, "sampling/importance_sampling_ratio/min": 1.2031550795654766e-05, "sampling/sampling_logp_difference/max": 2.711780548095703, "sampling/sampling_logp_difference/mean": 0.3650180697441101, "step": 223, "step_time": 21.017364372004522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7964877262711525, "epoch": 0.00224, "grad_norm": 0.06790095567703247, "kl": 0.5630777701735497, "learning_rate": 7.999986914256228e-06, "loss": -0.0798, "step": 224, "step_time": 11.684310537035344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.161290168762207, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0467038117349148, "epoch": 0.00225, "frac_reward_zero_std": 0.0, "grad_norm": 0.07999172806739807, "kl": 0.8731794506311417, "learning_rate": 7.99998677467605e-06, "loss": -0.0543, "num_tokens": 4842566.0, "reward": 0.316166490316391, "reward_std": 0.7100785970687866, "rewards/rollout_reward_func/mean": 0.316166490316391, "rewards/rollout_reward_func/std": 0.7100785970687866, "sampling/importance_sampling_ratio/max": 2.2415480613708496, "sampling/importance_sampling_ratio/mean": 0.9511704444885254, "sampling/importance_sampling_ratio/min": 2.2749843992642127e-05, "sampling/sampling_logp_difference/max": 2.351973295211792, "sampling/sampling_logp_difference/mean": 0.2584698796272278, "step": 225, "step_time": 20.52107614002307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.046294804662466, "epoch": 0.00226, "grad_norm": 0.08207210898399353, "kl": 0.8918992429971695, "learning_rate": 7.999986634355396e-06, "loss": -0.0543, "step": 226, "step_time": 12.014128099981463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 5.159999847412109, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.911116398870945, "epoch": 0.00227, "frac_reward_zero_std": 0.0, "grad_norm": 0.11279385536909103, "kl": 0.45920420438051224, "learning_rate": 7.999986493294267e-06, "loss": -0.1054, "num_tokens": 4891735.0, "reward": 0.41830581426620483, "reward_std": 0.920007050037384, "rewards/rollout_reward_func/mean": 0.41830581426620483, "rewards/rollout_reward_func/std": 0.9200071096420288, "sampling/importance_sampling_ratio/max": 1.6917107105255127, "sampling/importance_sampling_ratio/mean": 0.7069302797317505, "sampling/importance_sampling_ratio/min": 1.336570676357951e-05, "sampling/sampling_logp_difference/max": 2.577457904815674, "sampling/sampling_logp_difference/mean": 0.34923386573791504, "step": 227, "step_time": 20.915923940978246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9100488666445017, "epoch": 0.00228, "grad_norm": 0.11754904687404633, "kl": 0.4815942719578743, "learning_rate": 7.99998635149266e-06, "loss": -0.1053, "step": 228, "step_time": 11.13751118897926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.65625, "completions/mean_terminated_length": 4.178571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7288937792181969, "epoch": 0.00229, "frac_reward_zero_std": 0.25, "grad_norm": 0.027550330385565758, "kl": 0.2640114463865757, "learning_rate": 7.99998620895058e-06, "loss": -0.0623, "num_tokens": 4932731.0, "reward": 0.8220281004905701, "reward_std": 0.8825197219848633, "rewards/rollout_reward_func/mean": 0.8220281004905701, "rewards/rollout_reward_func/std": 0.8825196623802185, "sampling/importance_sampling_ratio/max": 1.8174848556518555, "sampling/importance_sampling_ratio/mean": 1.0713777542114258, "sampling/importance_sampling_ratio/min": 0.00850600004196167, "sampling/sampling_logp_difference/max": 1.7168179750442505, "sampling/sampling_logp_difference/mean": 0.1410338431596756, "step": 229, "step_time": 17.980887620011345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.72630750015378, "epoch": 0.0023, "grad_norm": 0.026470577344298363, "kl": 0.266459409147501, "learning_rate": 7.999986065668021e-06, "loss": -0.0623, "step": 230, "step_time": 9.696390341036022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5636993534862995, "epoch": 0.00231, "frac_reward_zero_std": 0.25, "grad_norm": 0.050363149493932724, "kl": 0.5794152971357107, "learning_rate": 7.999985921644988e-06, "loss": -0.0645, "num_tokens": 4979838.0, "reward": 0.7767060399055481, "reward_std": 0.8914514780044556, "rewards/rollout_reward_func/mean": 0.7767060399055481, "rewards/rollout_reward_func/std": 0.8914514780044556, "sampling/importance_sampling_ratio/max": 1.465922474861145, "sampling/importance_sampling_ratio/mean": 0.8379331827163696, "sampling/importance_sampling_ratio/min": 6.618450925088837e-07, "sampling/sampling_logp_difference/max": 2.3349061012268066, "sampling/sampling_logp_difference/mean": 0.3498779833316803, "step": 231, "step_time": 20.17904571100371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.56334182433784, "epoch": 0.00232, "grad_norm": 0.05079937353730202, "kl": 0.5906058084219694, "learning_rate": 7.999985776881479e-06, "loss": -0.0645, "step": 232, "step_time": 10.395691071054898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 3.962963104248047, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7822582572698593, "epoch": 0.00233, "frac_reward_zero_std": 0.25, "grad_norm": 0.1205802857875824, "kl": 0.5885388981550932, "learning_rate": 7.999985631377493e-06, "loss": -0.0627, "num_tokens": 5027106.0, "reward": 0.8741277456283569, "reward_std": 0.8219714164733887, "rewards/rollout_reward_func/mean": 0.8741277456283569, "rewards/rollout_reward_func/std": 0.8219714164733887, "sampling/importance_sampling_ratio/max": 1.6501718759536743, "sampling/importance_sampling_ratio/mean": 1.0217106342315674, "sampling/importance_sampling_ratio/min": 4.664102380047552e-05, "sampling/sampling_logp_difference/max": 2.122898578643799, "sampling/sampling_logp_difference/mean": 0.1868048906326294, "step": 233, "step_time": 20.60445321298903 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 0.7818336691707373, "epoch": 0.00234, "grad_norm": 0.09747026115655899, "kl": 0.5790877379477024, "learning_rate": 7.99998548513303e-06, "loss": -0.0633, "step": 234, "step_time": 11.891643920971546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.6875, "completions/mean_terminated_length": 4.322580337524414, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.925703264772892, "epoch": 0.00235, "frac_reward_zero_std": 0.0, "grad_norm": 0.1424352377653122, "kl": 0.832196582108736, "learning_rate": 7.999985338148094e-06, "loss": -0.0772, "num_tokens": 5086767.0, "reward": 0.5664539933204651, "reward_std": 0.5981342196464539, "rewards/rollout_reward_func/mean": 0.5664539933204651, "rewards/rollout_reward_func/std": 0.5981342196464539, "sampling/importance_sampling_ratio/max": 1.5526783466339111, "sampling/importance_sampling_ratio/mean": 0.887839138507843, "sampling/importance_sampling_ratio/min": 0.00016356939158868045, "sampling/sampling_logp_difference/max": 3.103849172592163, "sampling/sampling_logp_difference/mean": 0.28621190786361694, "step": 235, "step_time": 20.93252406900865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9275444708764553, "epoch": 0.00236, "grad_norm": 0.13459190726280212, "kl": 0.8044611625373363, "learning_rate": 7.99998519042268e-06, "loss": -0.0774, "step": 236, "step_time": 11.99316283094231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 5.266666889190674, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2299876473844051, "epoch": 0.00237, "frac_reward_zero_std": 0.0, "grad_norm": 0.0968717411160469, "kl": 1.602029474452138, "learning_rate": 7.99998504195679e-06, "loss": -0.0839, "num_tokens": 5144400.0, "reward": 0.2368474155664444, "reward_std": 0.6351972818374634, "rewards/rollout_reward_func/mean": 0.2368474155664444, "rewards/rollout_reward_func/std": 0.6351972818374634, "sampling/importance_sampling_ratio/max": 2.806379556655884, "sampling/importance_sampling_ratio/mean": 0.7764865756034851, "sampling/importance_sampling_ratio/min": 0.0008054478676058352, "sampling/sampling_logp_difference/max": 2.2310678958892822, "sampling/sampling_logp_difference/mean": 0.32210415601730347, "step": 237, "step_time": 21.134108145022765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.2360619641840458, "epoch": 0.00238, "grad_norm": 0.09957922995090485, "kl": 1.7212827559560537, "learning_rate": 7.999984892750425e-06, "loss": -0.0839, "step": 238, "step_time": 11.618789135012776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.758620738983154, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0579158551990986, "epoch": 0.00239, "frac_reward_zero_std": 0.0, "grad_norm": 0.04937282204627991, "kl": 0.9173993654549122, "learning_rate": 7.999984742803586e-06, "loss": -0.0986, "num_tokens": 5203409.0, "reward": 0.738558292388916, "reward_std": 0.652691662311554, "rewards/rollout_reward_func/mean": 0.738558292388916, "rewards/rollout_reward_func/std": 0.652691662311554, "sampling/importance_sampling_ratio/max": 1.6534770727157593, "sampling/importance_sampling_ratio/mean": 0.8741851449012756, "sampling/importance_sampling_ratio/min": 5.520997001440264e-05, "sampling/sampling_logp_difference/max": 2.5205931663513184, "sampling/sampling_logp_difference/mean": 0.26456600427627563, "step": 239, "step_time": 20.74099847805337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0548663064837456, "epoch": 0.0024, "grad_norm": 0.04816962778568268, "kl": 0.8918969668447971, "learning_rate": 7.999984592116268e-06, "loss": -0.0987, "step": 240, "step_time": 11.18322730899672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7990757003426552, "epoch": 0.00241, "frac_reward_zero_std": 0.0, "grad_norm": 0.19578510522842407, "kl": 1.2374472208321095, "learning_rate": 7.999984440688477e-06, "loss": -0.0628, "num_tokens": 5244632.0, "reward": 0.7013484239578247, "reward_std": 0.8741843104362488, "rewards/rollout_reward_func/mean": 0.7013484239578247, "rewards/rollout_reward_func/std": 0.8741843104362488, "sampling/importance_sampling_ratio/max": 1.4574310779571533, "sampling/importance_sampling_ratio/mean": 0.9370135068893433, "sampling/importance_sampling_ratio/min": 2.88589262709138e-06, "sampling/sampling_logp_difference/max": 1.9787073135375977, "sampling/sampling_logp_difference/mean": 0.27988553047180176, "step": 241, "step_time": 19.193075183982728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8004887960851192, "epoch": 0.00242, "grad_norm": 0.2114231139421463, "kl": 1.1714534126222134, "learning_rate": 7.999984288520209e-06, "loss": -0.0635, "step": 242, "step_time": 10.409731383930193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2000342905521393, "epoch": 0.00243, "frac_reward_zero_std": 0.0, "grad_norm": 0.05149077624082565, "kl": 0.7392615005373955, "learning_rate": 7.999984135611465e-06, "loss": -0.0895, "num_tokens": 5298013.0, "reward": 0.6652116775512695, "reward_std": 0.8974812030792236, "rewards/rollout_reward_func/mean": 0.6652116775512695, "rewards/rollout_reward_func/std": 0.8974811434745789, "sampling/importance_sampling_ratio/max": 1.418979287147522, "sampling/importance_sampling_ratio/mean": 0.8700907826423645, "sampling/importance_sampling_ratio/min": 1.9580904336180538e-05, "sampling/sampling_logp_difference/max": 2.129845142364502, "sampling/sampling_logp_difference/mean": 0.2619386911392212, "step": 243, "step_time": 20.76236475497717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2002132013440132, "epoch": 0.00244, "grad_norm": 0.047852229326963425, "kl": 0.71795036457479, "learning_rate": 7.999983981962246e-06, "loss": -0.0895, "step": 244, "step_time": 10.992133358027786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 4.275862216949463, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2655249759554863, "epoch": 0.00245, "frac_reward_zero_std": 0.0, "grad_norm": 0.286190390586853, "kl": 0.7695690114051104, "learning_rate": 7.999983827572551e-06, "loss": -0.0704, "num_tokens": 5353409.0, "reward": 0.25701040029525757, "reward_std": 0.610146701335907, "rewards/rollout_reward_func/mean": 0.25701040029525757, "rewards/rollout_reward_func/std": 0.6101466417312622, "sampling/importance_sampling_ratio/max": 2.6775076389312744, "sampling/importance_sampling_ratio/mean": 0.8969295024871826, "sampling/importance_sampling_ratio/min": 4.327004717197269e-05, "sampling/sampling_logp_difference/max": 3.1304333209991455, "sampling/sampling_logp_difference/mean": 0.312175452709198, "step": 245, "step_time": 20.082545025041327 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016741071827709675, "entropy": 1.265747506171465, "epoch": 0.00246, "grad_norm": 0.1680534929037094, "kl": 0.7622558437287807, "learning_rate": 7.999983672442382e-06, "loss": -0.0709, "step": 246, "step_time": 11.578201662981883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 5.896551609039307, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2731693983078003, "epoch": 0.00247, "frac_reward_zero_std": 0.0, "grad_norm": 0.3802318871021271, "kl": 0.8264986872673035, "learning_rate": 7.999983516571737e-06, "loss": -0.0562, "num_tokens": 5404559.0, "reward": 0.6867390871047974, "reward_std": 0.7738901972770691, "rewards/rollout_reward_func/mean": 0.6867390871047974, "rewards/rollout_reward_func/std": 0.7738901376724243, "sampling/importance_sampling_ratio/max": 2.3669610023498535, "sampling/importance_sampling_ratio/mean": 0.8369849920272827, "sampling/importance_sampling_ratio/min": 4.149715095991269e-07, "sampling/sampling_logp_difference/max": 2.0746681690216064, "sampling/sampling_logp_difference/mean": 0.3489246964454651, "step": 247, "step_time": 21.682855279941577 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 1.2767120730131865, "epoch": 0.00248, "grad_norm": 0.1379794180393219, "kl": 0.7755273804068565, "learning_rate": 7.999983359960615e-06, "loss": -0.0583, "step": 248, "step_time": 12.161574203026248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9391716569662094, "epoch": 0.00249, "frac_reward_zero_std": 0.0, "grad_norm": 0.11786852777004242, "kl": 0.571716271340847, "learning_rate": 7.999983202609019e-06, "loss": -0.064, "num_tokens": 5457177.0, "reward": 0.18294017016887665, "reward_std": 0.7809416651725769, "rewards/rollout_reward_func/mean": 0.18294017016887665, "rewards/rollout_reward_func/std": 0.7809416651725769, "sampling/importance_sampling_ratio/max": 1.5038310289382935, "sampling/importance_sampling_ratio/mean": 0.5701838731765747, "sampling/importance_sampling_ratio/min": 1.0839610808943689e-07, "sampling/sampling_logp_difference/max": 2.706580638885498, "sampling/sampling_logp_difference/mean": 0.4389774799346924, "step": 249, "step_time": 20.75384879100602 }, { "clip_ratio/high_max": 0.016995614394545555, "clip_ratio/high_mean": 0.008497807197272778, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008497807197272778, "entropy": 1.9386829137802124, "epoch": 0.0025, "grad_norm": 0.11092031747102737, "kl": 0.5374232493340969, "learning_rate": 7.999983044516948e-06, "loss": -0.064, "step": 250, "step_time": 10.628607587015722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 4.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4486695528030396, "epoch": 0.00251, "frac_reward_zero_std": 0.0, "grad_norm": 0.10972673445940018, "kl": 0.8862747065722942, "learning_rate": 7.999982885684401e-06, "loss": -0.0907, "num_tokens": 5513690.0, "reward": 0.37688833475112915, "reward_std": 0.7871919274330139, "rewards/rollout_reward_func/mean": 0.37688833475112915, "rewards/rollout_reward_func/std": 0.7871920466423035, "sampling/importance_sampling_ratio/max": 1.6184704303741455, "sampling/importance_sampling_ratio/mean": 0.6616946458816528, "sampling/importance_sampling_ratio/min": 4.575564645052654e-13, "sampling/sampling_logp_difference/max": 2.6691412925720215, "sampling/sampling_logp_difference/mean": 0.5675753355026245, "step": 251, "step_time": 21.665008408977883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.451655797660351, "epoch": 0.00252, "grad_norm": 0.11254110932350159, "kl": 0.8645569384098053, "learning_rate": 7.99998272611138e-06, "loss": -0.0908, "step": 252, "step_time": 11.277536283014342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 4.730769634246826, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.301027461886406, "epoch": 0.00253, "frac_reward_zero_std": 0.0, "grad_norm": 0.2084246128797531, "kl": 0.785998247563839, "learning_rate": 7.999982565797882e-06, "loss": -0.0707, "num_tokens": 5562986.0, "reward": 0.5240208506584167, "reward_std": 0.8669114112854004, "rewards/rollout_reward_func/mean": 0.5240208506584167, "rewards/rollout_reward_func/std": 0.8669114112854004, "sampling/importance_sampling_ratio/max": 1.422410011291504, "sampling/importance_sampling_ratio/mean": 0.7295976877212524, "sampling/importance_sampling_ratio/min": 5.2803416110691614e-06, "sampling/sampling_logp_difference/max": 2.3979668617248535, "sampling/sampling_logp_difference/mean": 0.2620103359222412, "step": 253, "step_time": 21.846426080999663 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.3076932430267334, "epoch": 0.00254, "grad_norm": 0.18450218439102173, "kl": 0.7455997783690691, "learning_rate": 7.999982404743908e-06, "loss": -0.0713, "step": 254, "step_time": 11.182301219028886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.670066386461258, "epoch": 0.00255, "frac_reward_zero_std": 0.25, "grad_norm": 0.10716277360916138, "kl": 0.30228714272379875, "learning_rate": 7.999982242949461e-06, "loss": -0.0559, "num_tokens": 5614549.0, "reward": 0.5551193952560425, "reward_std": 0.81169593334198, "rewards/rollout_reward_func/mean": 0.5551193952560425, "rewards/rollout_reward_func/std": 0.8116959929466248, "sampling/importance_sampling_ratio/max": 1.50832200050354, "sampling/importance_sampling_ratio/mean": 0.7850338220596313, "sampling/importance_sampling_ratio/min": 1.1954417459492106e-05, "sampling/sampling_logp_difference/max": 1.5697600841522217, "sampling/sampling_logp_difference/mean": 0.31551387906074524, "step": 255, "step_time": 21.40334231700399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.6731275767087936, "epoch": 0.00256, "grad_norm": 0.10551176965236664, "kl": 0.2927218182012439, "learning_rate": 7.999982080414539e-06, "loss": -0.0559, "step": 256, "step_time": 11.782716275018174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.956521987915039, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6457756124436855, "epoch": 0.00257, "frac_reward_zero_std": 0.0, "grad_norm": 0.1076393872499466, "kl": 0.3559545185416937, "learning_rate": 7.999981917139141e-06, "loss": -0.0973, "num_tokens": 5660379.0, "reward": 0.3700043857097626, "reward_std": 0.9465427398681641, "rewards/rollout_reward_func/mean": 0.3700043857097626, "rewards/rollout_reward_func/std": 0.9465426802635193, "sampling/importance_sampling_ratio/max": 1.4949893951416016, "sampling/importance_sampling_ratio/mean": 0.616479754447937, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.752347707748413, "sampling/sampling_logp_difference/mean": 0.2721058130264282, "step": 257, "step_time": 22.172610992973205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6497405879199505, "epoch": 0.00258, "grad_norm": 0.10305430740118027, "kl": 0.3494549673050642, "learning_rate": 7.999981753123268e-06, "loss": -0.0973, "step": 258, "step_time": 11.822623250976903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9662086702883244, "epoch": 0.00259, "frac_reward_zero_std": 0.0, "grad_norm": 0.08633647859096527, "kl": 0.23451714031398296, "learning_rate": 7.999981588366921e-06, "loss": -0.0984, "num_tokens": 5712567.0, "reward": 0.3275708258152008, "reward_std": 0.859987199306488, "rewards/rollout_reward_func/mean": 0.3275708258152008, "rewards/rollout_reward_func/std": 0.8599872589111328, "sampling/importance_sampling_ratio/max": 1.8534828424453735, "sampling/importance_sampling_ratio/mean": 0.707016110420227, "sampling/importance_sampling_ratio/min": 4.745493242808152e-08, "sampling/sampling_logp_difference/max": 1.7935163974761963, "sampling/sampling_logp_difference/mean": 0.40351128578186035, "step": 259, "step_time": 21.072297790000448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9638269655406475, "epoch": 0.0026, "grad_norm": 0.08209118247032166, "kl": 0.24017092771828175, "learning_rate": 7.999981422870099e-06, "loss": -0.0986, "step": 260, "step_time": 11.074962780927308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 5.159999847412109, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6749202571809292, "epoch": 0.00261, "frac_reward_zero_std": 0.0, "grad_norm": 0.08987349271774292, "kl": 0.45816043205559254, "learning_rate": 7.999981256632802e-06, "loss": -0.0981, "num_tokens": 5751861.0, "reward": 0.6275615692138672, "reward_std": 1.0250756740570068, "rewards/rollout_reward_func/mean": 0.6275615692138672, "rewards/rollout_reward_func/std": 1.0250755548477173, "sampling/importance_sampling_ratio/max": 1.3153082132339478, "sampling/importance_sampling_ratio/mean": 0.7035003900527954, "sampling/importance_sampling_ratio/min": 1.5147437807172537e-05, "sampling/sampling_logp_difference/max": 1.7161669731140137, "sampling/sampling_logp_difference/mean": 0.2959117591381073, "step": 261, "step_time": 19.948101020010654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.670870490372181, "epoch": 0.00262, "grad_norm": 0.08747339248657227, "kl": 0.4923686580732465, "learning_rate": 7.999981089655028e-06, "loss": -0.0983, "step": 262, "step_time": 10.257171539997216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 4.559999942779541, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.468731351196766, "epoch": 0.00263, "frac_reward_zero_std": 0.0, "grad_norm": 0.12233254313468933, "kl": 0.3637142479419708, "learning_rate": 7.999980921936782e-06, "loss": -0.0845, "num_tokens": 5803901.0, "reward": 0.23876245319843292, "reward_std": 0.6738964319229126, "rewards/rollout_reward_func/mean": 0.23876245319843292, "rewards/rollout_reward_func/std": 0.6738964915275574, "sampling/importance_sampling_ratio/max": 1.7137179374694824, "sampling/importance_sampling_ratio/mean": 0.8197791576385498, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.2288808822631836, "sampling/sampling_logp_difference/mean": 0.32079145312309265, "step": 263, "step_time": 21.38267619596445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4567311331629753, "epoch": 0.00264, "grad_norm": 0.12014400959014893, "kl": 0.3752524070441723, "learning_rate": 7.999980753478058e-06, "loss": -0.085, "step": 264, "step_time": 11.149427192984149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 4.90625, "completions/mean_terminated_length": 4.548387050628662, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7804286405444145, "epoch": 0.00265, "frac_reward_zero_std": 0.0, "grad_norm": 0.140690878033638, "kl": 0.7160997912287712, "learning_rate": 7.999980584278861e-06, "loss": -0.0601, "num_tokens": 5849837.0, "reward": 0.9101780652999878, "reward_std": 0.7291182279586792, "rewards/rollout_reward_func/mean": 0.9101780652999878, "rewards/rollout_reward_func/std": 0.7291182279586792, "sampling/importance_sampling_ratio/max": 1.3770910501480103, "sampling/importance_sampling_ratio/mean": 0.8591554760932922, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.596135377883911, "sampling/sampling_logp_difference/mean": 0.26362743973731995, "step": 265, "step_time": 19.510385691042757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7720104232430458, "epoch": 0.00266, "grad_norm": 0.1309816688299179, "kl": 0.7604759708046913, "learning_rate": 7.999980414339192e-06, "loss": -0.0607, "step": 266, "step_time": 10.644415464019403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.6875, "completions/mean_terminated_length": 3.9333336353302, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7103093322366476, "epoch": 0.00267, "frac_reward_zero_std": 0.0, "grad_norm": 0.12117563933134079, "kl": 1.1297475695610046, "learning_rate": 7.999980243659046e-06, "loss": -0.0804, "num_tokens": 5897378.0, "reward": 0.5981010794639587, "reward_std": 0.6675661206245422, "rewards/rollout_reward_func/mean": 0.5981010794639587, "rewards/rollout_reward_func/std": 0.667566180229187, "sampling/importance_sampling_ratio/max": 1.4800089597702026, "sampling/importance_sampling_ratio/mean": 0.906640887260437, "sampling/importance_sampling_ratio/min": 7.961957453517243e-05, "sampling/sampling_logp_difference/max": 2.7290401458740234, "sampling/sampling_logp_difference/mean": 0.21517308056354523, "step": 267, "step_time": 20.031396393984323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6960298512130976, "epoch": 0.00268, "grad_norm": 0.11882397532463074, "kl": 1.1956007592380047, "learning_rate": 7.999980072238424e-06, "loss": -0.0808, "step": 268, "step_time": 11.313386516005266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 3.95652174949646, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.536557413637638, "epoch": 0.00269, "frac_reward_zero_std": 0.0, "grad_norm": 0.16258880496025085, "kl": 0.32862814515829086, "learning_rate": 7.999979900077329e-06, "loss": -0.1133, "num_tokens": 5943400.0, "reward": 0.39520180225372314, "reward_std": 0.9539192318916321, "rewards/rollout_reward_func/mean": 0.39520180225372314, "rewards/rollout_reward_func/std": 0.9539191722869873, "sampling/importance_sampling_ratio/max": 2.3154714107513428, "sampling/importance_sampling_ratio/mean": 0.831212043762207, "sampling/importance_sampling_ratio/min": 4.436617189185199e-07, "sampling/sampling_logp_difference/max": 1.8825368881225586, "sampling/sampling_logp_difference/mean": 0.336548775434494, "step": 269, "step_time": 20.289385943004163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5325338616967201, "epoch": 0.0027, "grad_norm": 0.14802901446819305, "kl": 0.33379140868782997, "learning_rate": 7.99997972717576e-06, "loss": -0.114, "step": 270, "step_time": 11.112429987959331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0503577888011932, "epoch": 0.00271, "frac_reward_zero_std": 0.25, "grad_norm": 0.08829144388437271, "kl": 0.5850465651601553, "learning_rate": 7.999979553533716e-06, "loss": -0.0646, "num_tokens": 5986543.0, "reward": 1.0248606204986572, "reward_std": 0.7608184814453125, "rewards/rollout_reward_func/mean": 1.0248606204986572, "rewards/rollout_reward_func/std": 0.7608184814453125, "sampling/importance_sampling_ratio/max": 1.3479506969451904, "sampling/importance_sampling_ratio/mean": 0.9037001132965088, "sampling/importance_sampling_ratio/min": 1.5112244966530852e-07, "sampling/sampling_logp_difference/max": 2.710369348526001, "sampling/sampling_logp_difference/mean": 0.23576819896697998, "step": 271, "step_time": 20.70910641597584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0408258978277445, "epoch": 0.00272, "grad_norm": 0.08275598287582397, "kl": 0.5948857516050339, "learning_rate": 7.999979379151197e-06, "loss": -0.0648, "step": 272, "step_time": 11.040282770991325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3060974664986134, "epoch": 0.00273, "frac_reward_zero_std": 0.0, "grad_norm": 0.15007519721984863, "kl": 0.6708042826503515, "learning_rate": 7.999979204028205e-06, "loss": -0.0848, "num_tokens": 6038906.0, "reward": 0.7003864049911499, "reward_std": 0.7996504306793213, "rewards/rollout_reward_func/mean": 0.7003864049911499, "rewards/rollout_reward_func/std": 0.7996503710746765, "sampling/importance_sampling_ratio/max": 1.493873119354248, "sampling/importance_sampling_ratio/mean": 0.775837779045105, "sampling/importance_sampling_ratio/min": 8.98579173735925e-07, "sampling/sampling_logp_difference/max": 2.0015323162078857, "sampling/sampling_logp_difference/mean": 0.3535838723182678, "step": 273, "step_time": 22.20565792598063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013494318351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "entropy": 1.2976928818970919, "epoch": 0.00274, "grad_norm": 0.14877095818519592, "kl": 0.7402702532708645, "learning_rate": 7.999979028164737e-06, "loss": -0.0849, "step": 274, "step_time": 11.614104147913167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.892857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.066578719764948, "epoch": 0.00275, "frac_reward_zero_std": 0.0, "grad_norm": 0.40296483039855957, "kl": 2.6778607042506337, "learning_rate": 7.999978851560795e-06, "loss": -0.0767, "num_tokens": 6092930.0, "reward": 0.7073312997817993, "reward_std": 0.829425036907196, "rewards/rollout_reward_func/mean": 0.7073312997817993, "rewards/rollout_reward_func/std": 0.829425036907196, "sampling/importance_sampling_ratio/max": 2.184953451156616, "sampling/importance_sampling_ratio/mean": 0.7644087076187134, "sampling/importance_sampling_ratio/min": 1.6328510099583582e-08, "sampling/sampling_logp_difference/max": 3.7151150703430176, "sampling/sampling_logp_difference/mean": 0.36722710728645325, "step": 275, "step_time": 20.072545646951767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.0627276999875903, "epoch": 0.00276, "grad_norm": 0.3097776174545288, "kl": 2.1506357081234455, "learning_rate": 7.999978674216379e-06, "loss": -0.0789, "step": 276, "step_time": 10.57557684398489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.961538791656494, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5703632682561874, "epoch": 0.00277, "frac_reward_zero_std": 0.0, "grad_norm": 0.161489799618721, "kl": 0.659636503085494, "learning_rate": 7.99997849613149e-06, "loss": -0.0849, "num_tokens": 6144471.0, "reward": 0.6764302849769592, "reward_std": 0.7912848591804504, "rewards/rollout_reward_func/mean": 0.6764302849769592, "rewards/rollout_reward_func/std": 0.7912848591804504, "sampling/importance_sampling_ratio/max": 1.3539730310440063, "sampling/importance_sampling_ratio/mean": 0.6478421688079834, "sampling/importance_sampling_ratio/min": 3.3210892524948576e-07, "sampling/sampling_logp_difference/max": 2.882441520690918, "sampling/sampling_logp_difference/mean": 0.3726401925086975, "step": 277, "step_time": 21.46793370199157 }, { "clip_ratio/high_max": 0.02220394741743803, "clip_ratio/high_mean": 0.011101973708719015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011101973708719015, "entropy": 1.5762766860425472, "epoch": 0.00278, "grad_norm": 0.07127073407173157, "kl": 0.5658635683357716, "learning_rate": 7.999978317306126e-06, "loss": -0.0858, "step": 278, "step_time": 11.58583503094269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.079999923706055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2290668431669474, "epoch": 0.00279, "frac_reward_zero_std": 0.0, "grad_norm": 0.2218388468027115, "kl": 0.6252709561958909, "learning_rate": 7.999978137740288e-06, "loss": -0.0595, "num_tokens": 6196127.0, "reward": 0.4431094527244568, "reward_std": 0.846124529838562, "rewards/rollout_reward_func/mean": 0.4431094527244568, "rewards/rollout_reward_func/std": 0.8461244702339172, "sampling/importance_sampling_ratio/max": 2.540842056274414, "sampling/importance_sampling_ratio/mean": 0.8009768128395081, "sampling/importance_sampling_ratio/min": 1.3342202009880566e-08, "sampling/sampling_logp_difference/max": 2.4203615188598633, "sampling/sampling_logp_difference/mean": 0.3210080862045288, "step": 279, "step_time": 20.076103511004476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2320856600999832, "epoch": 0.0028, "grad_norm": 0.2255999892950058, "kl": 0.5809416212141514, "learning_rate": 7.999977957433975e-06, "loss": -0.0606, "step": 280, "step_time": 11.093879232008476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6120819505304098, "epoch": 0.00281, "frac_reward_zero_std": 0.0, "grad_norm": 0.12043202668428421, "kl": 1.6511937081813812, "learning_rate": 7.999977776387188e-06, "loss": -0.0954, "num_tokens": 6246957.0, "reward": 0.3148505687713623, "reward_std": 0.799569845199585, "rewards/rollout_reward_func/mean": 0.3148505687713623, "rewards/rollout_reward_func/std": 0.799569845199585, "sampling/importance_sampling_ratio/max": 2.2531378269195557, "sampling/importance_sampling_ratio/mean": 0.8028172254562378, "sampling/importance_sampling_ratio/min": 4.677373908634763e-06, "sampling/sampling_logp_difference/max": 2.4058618545532227, "sampling/sampling_logp_difference/mean": 0.4245345890522003, "step": 281, "step_time": 19.40228394800215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.61861751973629, "epoch": 0.00282, "grad_norm": 0.14798246324062347, "kl": 1.437572667375207, "learning_rate": 7.999977594599927e-06, "loss": -0.0957, "step": 282, "step_time": 10.914999172993703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.46875, "completions/mean_terminated_length": 4.379310131072998, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9918881747871637, "epoch": 0.00283, "frac_reward_zero_std": 0.0, "grad_norm": 0.14549410343170166, "kl": 0.7538736220449209, "learning_rate": 7.999977412072193e-06, "loss": -0.0808, "num_tokens": 6298949.0, "reward": 0.6496074199676514, "reward_std": 0.8452650904655457, "rewards/rollout_reward_func/mean": 0.6496074199676514, "rewards/rollout_reward_func/std": 0.8452650308609009, "sampling/importance_sampling_ratio/max": 1.9862332344055176, "sampling/importance_sampling_ratio/mean": 0.945911169052124, "sampling/importance_sampling_ratio/min": 3.4170065191574395e-05, "sampling/sampling_logp_difference/max": 2.234065055847168, "sampling/sampling_logp_difference/mean": 0.270762175321579, "step": 283, "step_time": 20.709171458001947 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.9993534125387669, "epoch": 0.00284, "grad_norm": 0.12822285294532776, "kl": 0.7053139973431826, "learning_rate": 7.999977228803984e-06, "loss": -0.0811, "step": 284, "step_time": 11.154356910963543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.034482955932617, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7721588313579559, "epoch": 0.00285, "frac_reward_zero_std": 0.25, "grad_norm": 0.2555839419364929, "kl": 0.5011548064649105, "learning_rate": 7.999977044795302e-06, "loss": -0.0473, "num_tokens": 6342949.0, "reward": 1.0838420391082764, "reward_std": 0.6708931922912598, "rewards/rollout_reward_func/mean": 1.0838420391082764, "rewards/rollout_reward_func/std": 0.6708931922912598, "sampling/importance_sampling_ratio/max": 1.4590001106262207, "sampling/importance_sampling_ratio/mean": 0.9169182777404785, "sampling/importance_sampling_ratio/min": 1.489753140049288e-05, "sampling/sampling_logp_difference/max": 2.0729479789733887, "sampling/sampling_logp_difference/mean": 0.21648597717285156, "step": 285, "step_time": 20.87550245100283 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016741071827709675, "entropy": 0.774247620254755, "epoch": 0.00286, "grad_norm": 0.1643252968788147, "kl": 0.47310788184404373, "learning_rate": 7.999976860046145e-06, "loss": -0.0478, "step": 286, "step_time": 11.116240374016343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.933333396911621, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.853049049153924, "epoch": 0.00287, "frac_reward_zero_std": 0.0, "grad_norm": 0.1002127155661583, "kl": 0.6266347076743841, "learning_rate": 7.999976674556518e-06, "loss": -0.0242, "num_tokens": 6398987.0, "reward": 0.952763557434082, "reward_std": 0.6179376244544983, "rewards/rollout_reward_func/mean": 0.952763557434082, "rewards/rollout_reward_func/std": 0.6179375648498535, "sampling/importance_sampling_ratio/max": 2.138540267944336, "sampling/importance_sampling_ratio/mean": 0.9601125717163086, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.5788886547088623, "sampling/sampling_logp_difference/mean": 0.31887078285217285, "step": 287, "step_time": 19.983831387973623 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 0.8623975664377213, "epoch": 0.00288, "grad_norm": 0.08564019948244095, "kl": 0.5827202536165714, "learning_rate": 7.999976488326414e-06, "loss": -0.0243, "step": 288, "step_time": 10.508593107020715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.884615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3985638618469238, "epoch": 0.00289, "frac_reward_zero_std": 0.0, "grad_norm": 0.20441120862960815, "kl": 0.4565674401819706, "learning_rate": 7.999976301355836e-06, "loss": -0.1046, "num_tokens": 6448570.0, "reward": 0.45749473571777344, "reward_std": 0.8003786206245422, "rewards/rollout_reward_func/mean": 0.45749473571777344, "rewards/rollout_reward_func/std": 0.8003786206245422, "sampling/importance_sampling_ratio/max": 1.7258113622665405, "sampling/importance_sampling_ratio/mean": 0.7407617568969727, "sampling/importance_sampling_ratio/min": 3.634887616499327e-05, "sampling/sampling_logp_difference/max": 1.8762295246124268, "sampling/sampling_logp_difference/mean": 0.3126108646392822, "step": 289, "step_time": 19.459422364016064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4095554016530514, "epoch": 0.0029, "grad_norm": 0.2038791924715042, "kl": 0.4425636399537325, "learning_rate": 7.999976113644787e-06, "loss": -0.1047, "step": 290, "step_time": 11.077289240056416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 5.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3112877812236547, "epoch": 0.00291, "frac_reward_zero_std": 0.25, "grad_norm": 0.0599183551967144, "kl": 0.49396422132849693, "learning_rate": 7.999975925193261e-06, "loss": -0.0768, "num_tokens": 6496241.0, "reward": 0.6392303109169006, "reward_std": 0.842979371547699, "rewards/rollout_reward_func/mean": 0.6392303109169006, "rewards/rollout_reward_func/std": 0.8429793119430542, "sampling/importance_sampling_ratio/max": 1.5394551753997803, "sampling/importance_sampling_ratio/mean": 0.9086623191833496, "sampling/importance_sampling_ratio/min": 3.793024006881751e-05, "sampling/sampling_logp_difference/max": 1.962430477142334, "sampling/sampling_logp_difference/mean": 0.30889007449150085, "step": 291, "step_time": 21.087456504959846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3155537340790033, "epoch": 0.00292, "grad_norm": 0.05645548552274704, "kl": 0.5096674039959908, "learning_rate": 7.999975736001263e-06, "loss": -0.0768, "step": 292, "step_time": 11.492900188022759 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 4.958333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.50377307087183, "epoch": 0.00293, "frac_reward_zero_std": 0.25, "grad_norm": 0.03814258426427841, "kl": 0.3651180360466242, "learning_rate": 7.999975546068793e-06, "loss": -0.072, "num_tokens": 6544168.0, "reward": 0.5302066802978516, "reward_std": 0.8400141596794128, "rewards/rollout_reward_func/mean": 0.5302066802978516, "rewards/rollout_reward_func/std": 0.8400141596794128, "sampling/importance_sampling_ratio/max": 1.3830687999725342, "sampling/importance_sampling_ratio/mean": 0.6972088813781738, "sampling/importance_sampling_ratio/min": 1.039257199408894e-06, "sampling/sampling_logp_difference/max": 2.5225443840026855, "sampling/sampling_logp_difference/mean": 0.416025847196579, "step": 293, "step_time": 19.68529384402791 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 1.503676287829876, "epoch": 0.00294, "grad_norm": 0.03684553503990173, "kl": 0.37165962159633636, "learning_rate": 7.999975355395847e-06, "loss": -0.0722, "step": 294, "step_time": 10.265274826961104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.592592716217041, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9105174168944359, "epoch": 0.00295, "frac_reward_zero_std": 0.0, "grad_norm": 0.139046773314476, "kl": 0.6324383616447449, "learning_rate": 7.999975163982429e-06, "loss": -0.065, "num_tokens": 6604562.0, "reward": 0.5363563299179077, "reward_std": 0.5998390913009644, "rewards/rollout_reward_func/mean": 0.5363563299179077, "rewards/rollout_reward_func/std": 0.5998390316963196, "sampling/importance_sampling_ratio/max": 2.7474663257598877, "sampling/importance_sampling_ratio/mean": 0.8324763774871826, "sampling/importance_sampling_ratio/min": 1.3896498103349586e-06, "sampling/sampling_logp_difference/max": 2.2808785438537598, "sampling/sampling_logp_difference/mean": 0.44260746240615845, "step": 295, "step_time": 22.076303765992634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.9016889110207558, "epoch": 0.00296, "grad_norm": 0.13390617072582245, "kl": 0.6648191548883915, "learning_rate": 7.999974971828538e-06, "loss": -0.0656, "step": 296, "step_time": 11.53050755604636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.620689868927002, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8264171611517668, "epoch": 0.00297, "frac_reward_zero_std": 0.0, "grad_norm": 0.07115798443555832, "kl": 0.5258244145661592, "learning_rate": 7.999974778934173e-06, "loss": -0.0528, "num_tokens": 6651158.0, "reward": 0.8000921010971069, "reward_std": 0.8016495108604431, "rewards/rollout_reward_func/mean": 0.8000921010971069, "rewards/rollout_reward_func/std": 0.8016494512557983, "sampling/importance_sampling_ratio/max": 1.7591389417648315, "sampling/importance_sampling_ratio/mean": 0.9527103900909424, "sampling/importance_sampling_ratio/min": 9.46537111303769e-05, "sampling/sampling_logp_difference/max": 2.009105682373047, "sampling/sampling_logp_difference/mean": 0.20832335948944092, "step": 297, "step_time": 20.840669668017654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8174246717244387, "epoch": 0.00298, "grad_norm": 0.06787428259849548, "kl": 0.5615541823208332, "learning_rate": 7.999974585299335e-06, "loss": -0.053, "step": 298, "step_time": 11.207426185981603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.28125, "completions/mean_terminated_length": 4.1724138259887695, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9210375137627125, "epoch": 0.00299, "frac_reward_zero_std": 0.25, "grad_norm": 0.18618902564048767, "kl": 0.988595730625093, "learning_rate": 7.999974390924023e-06, "loss": -0.0473, "num_tokens": 6691921.0, "reward": 0.765068531036377, "reward_std": 0.9068189263343811, "rewards/rollout_reward_func/mean": 0.765068531036377, "rewards/rollout_reward_func/std": 0.9068188667297363, "sampling/importance_sampling_ratio/max": 1.420623779296875, "sampling/importance_sampling_ratio/mean": 0.9743796586990356, "sampling/importance_sampling_ratio/min": 0.0007124625262804329, "sampling/sampling_logp_difference/max": 2.1524198055267334, "sampling/sampling_logp_difference/mean": 0.20401570200920105, "step": 299, "step_time": 18.19266767197405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9184136521071196, "epoch": 0.003, "grad_norm": 0.1840122640132904, "kl": 0.9263806138187647, "learning_rate": 7.999974195808239e-06, "loss": -0.0478, "step": 300, "step_time": 10.342282969038934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.88796023465693, "epoch": 0.00301, "frac_reward_zero_std": 0.25, "grad_norm": 0.04838502034544945, "kl": 0.8756701480597258, "learning_rate": 7.999973999951982e-06, "loss": -0.062, "num_tokens": 6737419.0, "reward": 0.38171711564064026, "reward_std": 0.8717496395111084, "rewards/rollout_reward_func/mean": 0.38171711564064026, "rewards/rollout_reward_func/std": 0.8717496395111084, "sampling/importance_sampling_ratio/max": 2.160731792449951, "sampling/importance_sampling_ratio/mean": 0.9122055768966675, "sampling/importance_sampling_ratio/min": 1.6445603250758722e-05, "sampling/sampling_logp_difference/max": 1.7680459022521973, "sampling/sampling_logp_difference/mean": 0.21715016663074493, "step": 301, "step_time": 18.32818744899123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8894128724932671, "epoch": 0.00302, "grad_norm": 0.04730788618326187, "kl": 0.856092307716608, "learning_rate": 7.99997380335525e-06, "loss": -0.0622, "step": 302, "step_time": 11.4120562489843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4250960387289524, "epoch": 0.00303, "frac_reward_zero_std": 0.0, "grad_norm": 0.10657092183828354, "kl": 1.0041336035355926, "learning_rate": 7.999973606018048e-06, "loss": -0.0777, "num_tokens": 6786587.0, "reward": 0.688970148563385, "reward_std": 0.8491772413253784, "rewards/rollout_reward_func/mean": 0.688970148563385, "rewards/rollout_reward_func/std": 0.8491772413253784, "sampling/importance_sampling_ratio/max": 1.9048044681549072, "sampling/importance_sampling_ratio/mean": 0.8483980298042297, "sampling/importance_sampling_ratio/min": 5.889699536965054e-07, "sampling/sampling_logp_difference/max": 2.5133285522460938, "sampling/sampling_logp_difference/mean": 0.32538866996765137, "step": 303, "step_time": 20.305015835998347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4211549926549196, "epoch": 0.00304, "grad_norm": 0.11265221983194351, "kl": 1.0038509089499712, "learning_rate": 7.99997340794037e-06, "loss": -0.078, "step": 304, "step_time": 11.025695603049826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.960000038146973, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.712400995194912, "epoch": 0.00305, "frac_reward_zero_std": 0.0, "grad_norm": 0.13494667410850525, "kl": 0.8592776386067271, "learning_rate": 7.999973209122222e-06, "loss": -0.0702, "num_tokens": 6835699.0, "reward": 0.5261030793190002, "reward_std": 0.8460960388183594, "rewards/rollout_reward_func/mean": 0.5261030793190002, "rewards/rollout_reward_func/std": 0.8460960388183594, "sampling/importance_sampling_ratio/max": 1.7596521377563477, "sampling/importance_sampling_ratio/mean": 0.6805351972579956, "sampling/importance_sampling_ratio/min": 6.509171157631499e-07, "sampling/sampling_logp_difference/max": 2.2987349033355713, "sampling/sampling_logp_difference/mean": 0.37155306339263916, "step": 305, "step_time": 20.5676530150522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.7084819078445435, "epoch": 0.00306, "grad_norm": 0.07730847597122192, "kl": 0.8282017605379224, "learning_rate": 7.999973009563599e-06, "loss": -0.0703, "step": 306, "step_time": 10.940653318044497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 6.500000476837158, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9811301305890083, "epoch": 0.00307, "frac_reward_zero_std": 0.0, "grad_norm": 0.14230331778526306, "kl": 0.5170724093914032, "learning_rate": 7.999972809264505e-06, "loss": -0.0782, "num_tokens": 6892644.0, "reward": 0.36106371879577637, "reward_std": 0.7703182101249695, "rewards/rollout_reward_func/mean": 0.36106371879577637, "rewards/rollout_reward_func/std": 0.7703182697296143, "sampling/importance_sampling_ratio/max": 1.7292735576629639, "sampling/importance_sampling_ratio/mean": 0.5718689560890198, "sampling/importance_sampling_ratio/min": 1.094076651497744e-07, "sampling/sampling_logp_difference/max": 2.063917636871338, "sampling/sampling_logp_difference/mean": 0.4055893123149872, "step": 307, "step_time": 22.27313311994658 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.9824384562671185, "epoch": 0.00308, "grad_norm": 0.14941421151161194, "kl": 0.47620439529418945, "learning_rate": 7.999972608224937e-06, "loss": -0.0788, "step": 308, "step_time": 11.535335189953912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0645287036895752, "epoch": 0.00309, "frac_reward_zero_std": 0.0, "grad_norm": 0.14341942965984344, "kl": 0.3320480678230524, "learning_rate": 7.999972406444895e-06, "loss": -0.0604, "num_tokens": 6941675.0, "reward": 0.39420372247695923, "reward_std": 0.8461920022964478, "rewards/rollout_reward_func/mean": 0.39420372247695923, "rewards/rollout_reward_func/std": 0.8461920022964478, "sampling/importance_sampling_ratio/max": 1.5122586488723755, "sampling/importance_sampling_ratio/mean": 0.9112703800201416, "sampling/importance_sampling_ratio/min": 4.205403456580825e-05, "sampling/sampling_logp_difference/max": 1.7511687278747559, "sampling/sampling_logp_difference/mean": 0.24474884569644928, "step": 309, "step_time": 19.10516217703116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.0761490650475025, "epoch": 0.0031, "grad_norm": 0.11426017433404922, "kl": 0.33359216898679733, "learning_rate": 7.999972203924383e-06, "loss": -0.0613, "step": 310, "step_time": 11.124095395032782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.983849510550499, "epoch": 0.00311, "frac_reward_zero_std": 0.0, "grad_norm": 0.10268435627222061, "kl": 0.5395564436912537, "learning_rate": 7.999972000663396e-06, "loss": -0.0835, "num_tokens": 6991152.0, "reward": 0.5990117192268372, "reward_std": 0.8220037221908569, "rewards/rollout_reward_func/mean": 0.5990117192268372, "rewards/rollout_reward_func/std": 0.8220036625862122, "sampling/importance_sampling_ratio/max": 1.4152512550354004, "sampling/importance_sampling_ratio/mean": 0.9019641876220703, "sampling/importance_sampling_ratio/min": 0.0004103636892978102, "sampling/sampling_logp_difference/max": 1.769596815109253, "sampling/sampling_logp_difference/mean": 0.24650108814239502, "step": 311, "step_time": 20.92588528804481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012620192486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012620192486792803, "entropy": 0.9903932362794876, "epoch": 0.00312, "grad_norm": 0.09459347277879715, "kl": 0.5134493634104729, "learning_rate": 7.999971796661938e-06, "loss": -0.0839, "step": 312, "step_time": 11.485043476975989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7551972940564156, "epoch": 0.00313, "frac_reward_zero_std": 0.0, "grad_norm": 0.2520640790462494, "kl": 0.3928054329007864, "learning_rate": 7.999971591920007e-06, "loss": -0.1145, "num_tokens": 7046857.0, "reward": 0.36106032133102417, "reward_std": 0.7728433609008789, "rewards/rollout_reward_func/mean": 0.36106032133102417, "rewards/rollout_reward_func/std": 0.7728433609008789, "sampling/importance_sampling_ratio/max": 2.6180598735809326, "sampling/importance_sampling_ratio/mean": 0.7810627222061157, "sampling/importance_sampling_ratio/min": 3.6261727132114174e-07, "sampling/sampling_logp_difference/max": 2.159611463546753, "sampling/sampling_logp_difference/mean": 0.3983301520347595, "step": 313, "step_time": 21.00857235401054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.7576342895627022, "epoch": 0.00314, "grad_norm": 0.18928174674510956, "kl": 0.41377419233322144, "learning_rate": 7.999971386437603e-06, "loss": -0.1155, "step": 314, "step_time": 11.178758996044053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 4.636363983154297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8078610002994537, "epoch": 0.00315, "frac_reward_zero_std": 0.0, "grad_norm": 0.11324567347764969, "kl": 0.7004276737570763, "learning_rate": 7.999971180214728e-06, "loss": -0.0656, "num_tokens": 7095055.0, "reward": 0.2907942831516266, "reward_std": 0.9029518365859985, "rewards/rollout_reward_func/mean": 0.2907942831516266, "rewards/rollout_reward_func/std": 0.9029517769813538, "sampling/importance_sampling_ratio/max": 1.476210355758667, "sampling/importance_sampling_ratio/mean": 0.5475497245788574, "sampling/importance_sampling_ratio/min": 2.61201489593077e-06, "sampling/sampling_logp_difference/max": 2.3905391693115234, "sampling/sampling_logp_difference/mean": 0.38446104526519775, "step": 315, "step_time": 21.263326979998965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010937500046566129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 1.8087711930274963, "epoch": 0.00316, "grad_norm": 0.10660441964864731, "kl": 0.7613504398614168, "learning_rate": 7.99997097325138e-06, "loss": -0.0656, "step": 316, "step_time": 10.53346616998897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 5.035714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4373903349041939, "epoch": 0.00317, "frac_reward_zero_std": 0.0, "grad_norm": 0.18676811456680298, "kl": 0.35486934147775173, "learning_rate": 7.999970765547559e-06, "loss": -0.0889, "num_tokens": 7139078.0, "reward": 1.0026299953460693, "reward_std": 0.729559063911438, "rewards/rollout_reward_func/mean": 1.0026299953460693, "rewards/rollout_reward_func/std": 0.729559063911438, "sampling/importance_sampling_ratio/max": 2.1809182167053223, "sampling/importance_sampling_ratio/mean": 0.9332095980644226, "sampling/importance_sampling_ratio/min": 4.024361288657019e-08, "sampling/sampling_logp_difference/max": 1.8922698497772217, "sampling/sampling_logp_difference/mean": 0.3231009840965271, "step": 317, "step_time": 19.525769140949706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4360970109701157, "epoch": 0.00318, "grad_norm": 0.1573973298072815, "kl": 0.36027507297694683, "learning_rate": 7.999970557103267e-06, "loss": -0.0886, "step": 318, "step_time": 9.720611731056124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1760541647672653, "epoch": 0.00319, "frac_reward_zero_std": 0.0, "grad_norm": 0.10974867641925812, "kl": 0.2746696462854743, "learning_rate": 7.999970347918501e-06, "loss": -0.0752, "num_tokens": 7192935.0, "reward": 0.473551481962204, "reward_std": 0.7616981863975525, "rewards/rollout_reward_func/mean": 0.473551481962204, "rewards/rollout_reward_func/std": 0.7616981863975525, "sampling/importance_sampling_ratio/max": 1.8115979433059692, "sampling/importance_sampling_ratio/mean": 0.6918237209320068, "sampling/importance_sampling_ratio/min": 3.6946823911421234e-07, "sampling/sampling_logp_difference/max": 1.5767873525619507, "sampling/sampling_logp_difference/mean": 0.39890578389167786, "step": 319, "step_time": 21.66380350801046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1735713481903076, "epoch": 0.0032, "grad_norm": 0.1110939010977745, "kl": 0.27786706760525703, "learning_rate": 7.999970137993264e-06, "loss": -0.075, "step": 320, "step_time": 11.713789084984455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 4.6666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.690819576382637, "epoch": 0.00321, "frac_reward_zero_std": 0.0, "grad_norm": 0.10267122089862823, "kl": 0.5617436217144132, "learning_rate": 7.999969927327556e-06, "loss": -0.0887, "num_tokens": 7250303.0, "reward": 0.3803492784500122, "reward_std": 0.8736156821250916, "rewards/rollout_reward_func/mean": 0.3803492784500122, "rewards/rollout_reward_func/std": 0.8736156225204468, "sampling/importance_sampling_ratio/max": 1.7379915714263916, "sampling/importance_sampling_ratio/mean": 0.6848784685134888, "sampling/importance_sampling_ratio/min": 2.088286919388338e-06, "sampling/sampling_logp_difference/max": 2.8619890213012695, "sampling/sampling_logp_difference/mean": 0.3484604060649872, "step": 321, "step_time": 27.841807257995242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6855051182210445, "epoch": 0.00322, "grad_norm": 0.09828139841556549, "kl": 0.5682674646377563, "learning_rate": 7.999969715921373e-06, "loss": -0.089, "step": 322, "step_time": 14.40477370203007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 5.269230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2678697491064668, "epoch": 0.00323, "frac_reward_zero_std": 0.0, "grad_norm": 0.13043789565563202, "kl": 0.2513395585119724, "learning_rate": 7.999969503774719e-06, "loss": -0.1048, "num_tokens": 7303777.0, "reward": 0.7495739459991455, "reward_std": 0.9245485067367554, "rewards/rollout_reward_func/mean": 0.7495739459991455, "rewards/rollout_reward_func/std": 0.9245485067367554, "sampling/importance_sampling_ratio/max": 1.6385332345962524, "sampling/importance_sampling_ratio/mean": 0.8632112145423889, "sampling/importance_sampling_ratio/min": 0.0005636096466332674, "sampling/sampling_logp_difference/max": 1.559370756149292, "sampling/sampling_logp_difference/mean": 0.21859359741210938, "step": 323, "step_time": 26.25595317396801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.264827013015747, "epoch": 0.00324, "grad_norm": 0.0884489044547081, "kl": 0.26470763236284256, "learning_rate": 7.999969290887594e-06, "loss": -0.1054, "step": 324, "step_time": 14.041030566004338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.433333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.553485163487494, "epoch": 0.00325, "frac_reward_zero_std": 0.0, "grad_norm": 0.12008398026227951, "kl": 0.7304491326212883, "learning_rate": 7.999969077259998e-06, "loss": -0.0669, "num_tokens": 7351586.0, "reward": 1.1545937061309814, "reward_std": 0.6263909935951233, "rewards/rollout_reward_func/mean": 1.1545937061309814, "rewards/rollout_reward_func/std": 0.6263910531997681, "sampling/importance_sampling_ratio/max": 1.554728627204895, "sampling/importance_sampling_ratio/mean": 0.9868518710136414, "sampling/importance_sampling_ratio/min": 0.0021195129957050085, "sampling/sampling_logp_difference/max": 1.5320885181427002, "sampling/sampling_logp_difference/mean": 0.1473540961742401, "step": 325, "step_time": 24.17474319896428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.5430580424144864, "epoch": 0.00326, "grad_norm": 0.10873936116695404, "kl": 0.7882031835615635, "learning_rate": 7.999968862891929e-06, "loss": -0.0672, "step": 326, "step_time": 13.789510613016319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 5.319999694824219, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4117853492498398, "epoch": 0.00327, "frac_reward_zero_std": 0.0, "grad_norm": 0.20493188500404358, "kl": 0.27006671112030745, "learning_rate": 7.999968647783389e-06, "loss": -0.0962, "num_tokens": 7406530.0, "reward": 0.5755738615989685, "reward_std": 0.9149075150489807, "rewards/rollout_reward_func/mean": 0.5755738615989685, "rewards/rollout_reward_func/std": 0.9149075150489807, "sampling/importance_sampling_ratio/max": 1.52251398563385, "sampling/importance_sampling_ratio/mean": 0.7342977523803711, "sampling/importance_sampling_ratio/min": 1.450039007977466e-06, "sampling/sampling_logp_difference/max": 1.9359246492385864, "sampling/sampling_logp_difference/mean": 0.28675705194473267, "step": 327, "step_time": 27.069894926971756 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.007589285960420966, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015401785960420966, "entropy": 1.3982956521213055, "epoch": 0.00328, "grad_norm": 0.07931061834096909, "kl": 0.2924676360562444, "learning_rate": 7.999968431934376e-06, "loss": -0.0969, "step": 328, "step_time": 13.982979836058803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 4.851851940155029, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.079586305655539, "epoch": 0.00329, "frac_reward_zero_std": 0.0, "grad_norm": 0.14362050592899323, "kl": 1.0159885380417109, "learning_rate": 7.999968215344892e-06, "loss": -0.0847, "num_tokens": 7449808.0, "reward": 0.6058323383331299, "reward_std": 0.9314640164375305, "rewards/rollout_reward_func/mean": 0.6058323383331299, "rewards/rollout_reward_func/std": 0.9314640164375305, "sampling/importance_sampling_ratio/max": 1.3736491203308105, "sampling/importance_sampling_ratio/mean": 0.8417930603027344, "sampling/importance_sampling_ratio/min": 3.502625258988701e-05, "sampling/sampling_logp_difference/max": 1.9686713218688965, "sampling/sampling_logp_difference/mean": 0.24294555187225342, "step": 329, "step_time": 21.92649314898881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.076845558360219, "epoch": 0.0033, "grad_norm": 0.14986270666122437, "kl": 1.115026619285345, "learning_rate": 7.999967998014936e-06, "loss": -0.0847, "step": 330, "step_time": 11.905174215993611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5643737334758043, "epoch": 0.00331, "frac_reward_zero_std": 0.25, "grad_norm": 0.0856223851442337, "kl": 0.27422606386244297, "learning_rate": 7.999967779944508e-06, "loss": -0.0538, "num_tokens": 7497716.0, "reward": 0.6869587302207947, "reward_std": 0.8886538743972778, "rewards/rollout_reward_func/mean": 0.6869587302207947, "rewards/rollout_reward_func/std": 0.8886539340019226, "sampling/importance_sampling_ratio/max": 1.5649458169937134, "sampling/importance_sampling_ratio/mean": 0.9713465571403503, "sampling/importance_sampling_ratio/min": 0.0006439309800043702, "sampling/sampling_logp_difference/max": 1.2053170204162598, "sampling/sampling_logp_difference/mean": 0.1444166600704193, "step": 331, "step_time": 23.33025651203934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5633808327838778, "epoch": 0.00332, "grad_norm": 0.08555532991886139, "kl": 0.2748487163335085, "learning_rate": 7.99996756113361e-06, "loss": -0.0538, "step": 332, "step_time": 12.53303838998545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 5.115384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6156879179179668, "epoch": 0.00333, "frac_reward_zero_std": 0.25, "grad_norm": 0.1732545644044876, "kl": 1.0901740603148937, "learning_rate": 7.999967341582239e-06, "loss": -0.0222, "num_tokens": 7550107.0, "reward": 0.44312193989753723, "reward_std": 0.8604814410209656, "rewards/rollout_reward_func/mean": 0.44312193989753723, "rewards/rollout_reward_func/std": 0.8604813814163208, "sampling/importance_sampling_ratio/max": 1.4729104042053223, "sampling/importance_sampling_ratio/mean": 0.8183609843254089, "sampling/importance_sampling_ratio/min": 5.694114335597078e-08, "sampling/sampling_logp_difference/max": 2.221254348754883, "sampling/sampling_logp_difference/mean": 0.3166811466217041, "step": 333, "step_time": 25.024228571011918 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.6224227156490088, "epoch": 0.00334, "grad_norm": 0.16668294370174408, "kl": 1.0011125216260552, "learning_rate": 7.999967121290396e-06, "loss": -0.0229, "step": 334, "step_time": 14.210977787035517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.579034298658371, "epoch": 0.00335, "frac_reward_zero_std": 0.0, "grad_norm": 0.09257244318723679, "kl": 0.21538274083286524, "learning_rate": 7.999966900258084e-06, "loss": -0.084, "num_tokens": 7602017.0, "reward": -0.13822582364082336, "reward_std": 0.8124115467071533, "rewards/rollout_reward_func/mean": -0.13822582364082336, "rewards/rollout_reward_func/std": 0.8124115467071533, "sampling/importance_sampling_ratio/max": 1.391462802886963, "sampling/importance_sampling_ratio/mean": 0.6452391147613525, "sampling/importance_sampling_ratio/min": 0.0003895623085554689, "sampling/sampling_logp_difference/max": 1.471158504486084, "sampling/sampling_logp_difference/mean": 0.23049688339233398, "step": 335, "step_time": 26.817398724000668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.5886496603488922, "epoch": 0.00336, "grad_norm": 0.0674738958477974, "kl": 0.21389038115739822, "learning_rate": 7.9999666784853e-06, "loss": -0.0841, "step": 336, "step_time": 13.877422043995466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1931117046624422, "epoch": 0.00337, "frac_reward_zero_std": 0.0, "grad_norm": 0.1221897304058075, "kl": 0.3287980780005455, "learning_rate": 7.999966455972044e-06, "loss": -0.0647, "num_tokens": 7661918.0, "reward": 0.3660988211631775, "reward_std": 0.7284073233604431, "rewards/rollout_reward_func/mean": 0.3660988211631775, "rewards/rollout_reward_func/std": 0.7284073233604431, "sampling/importance_sampling_ratio/max": 1.3744608163833618, "sampling/importance_sampling_ratio/mean": 0.790483295917511, "sampling/importance_sampling_ratio/min": 5.4376905609387904e-06, "sampling/sampling_logp_difference/max": 3.1955323219299316, "sampling/sampling_logp_difference/mean": 0.2696722447872162, "step": 337, "step_time": 26.097341447981307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1993792289867997, "epoch": 0.00338, "grad_norm": 0.11709107458591461, "kl": 0.3134032003581524, "learning_rate": 7.999966232718316e-06, "loss": -0.0648, "step": 338, "step_time": 13.719494445016608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.40625, "completions/mean_terminated_length": 4.8947367668151855, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7138329073786736, "epoch": 0.00339, "frac_reward_zero_std": 0.0, "grad_norm": 0.12877310812473297, "kl": 0.3502679606899619, "learning_rate": 7.999966008724119e-06, "loss": -0.1128, "num_tokens": 7717490.0, "reward": 0.15261310338974, "reward_std": 0.9190265536308289, "rewards/rollout_reward_func/mean": 0.15261310338974, "rewards/rollout_reward_func/std": 0.9190264940261841, "sampling/importance_sampling_ratio/max": 2.0830066204071045, "sampling/importance_sampling_ratio/mean": 0.6396558284759521, "sampling/importance_sampling_ratio/min": 7.146297775761923e-08, "sampling/sampling_logp_difference/max": 2.1402759552001953, "sampling/sampling_logp_difference/mean": 0.281146764755249, "step": 339, "step_time": 28.15850969296298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7198672890663147, "epoch": 0.0034, "grad_norm": 0.12377113848924637, "kl": 0.36423757765442133, "learning_rate": 7.99996578398945e-06, "loss": -0.1128, "step": 340, "step_time": 13.620451793918619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 5.222222328186035, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2395672928541899, "epoch": 0.00341, "frac_reward_zero_std": 0.0, "grad_norm": 0.18205414712429047, "kl": 0.2917056083679199, "learning_rate": 7.99996555851431e-06, "loss": -0.0695, "num_tokens": 7769995.0, "reward": 0.4104541540145874, "reward_std": 0.8614048361778259, "rewards/rollout_reward_func/mean": 0.4104541540145874, "rewards/rollout_reward_func/std": 0.8614048361778259, "sampling/importance_sampling_ratio/max": 1.460852861404419, "sampling/importance_sampling_ratio/mean": 0.7647489309310913, "sampling/importance_sampling_ratio/min": 0.001517620636150241, "sampling/sampling_logp_difference/max": 1.778564214706421, "sampling/sampling_logp_difference/mean": 0.23904946446418762, "step": 341, "step_time": 23.83645990898367 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2284070365130901, "epoch": 0.00342, "grad_norm": 0.15684078633785248, "kl": 0.30116187781095505, "learning_rate": 7.999965332298698e-06, "loss": -0.0705, "step": 342, "step_time": 11.990718526998535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 4.7916669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6977001689374447, "epoch": 0.00343, "frac_reward_zero_std": 0.0, "grad_norm": 0.07506777346134186, "kl": 0.9484614618122578, "learning_rate": 7.999965105342615e-06, "loss": -0.0924, "num_tokens": 7825522.0, "reward": 0.3002226948738098, "reward_std": 0.8815671801567078, "rewards/rollout_reward_func/mean": 0.3002226948738098, "rewards/rollout_reward_func/std": 0.8815671801567078, "sampling/importance_sampling_ratio/max": 1.265163779258728, "sampling/importance_sampling_ratio/mean": 0.6436908841133118, "sampling/importance_sampling_ratio/min": 3.325767465867102e-07, "sampling/sampling_logp_difference/max": 2.6636719703674316, "sampling/sampling_logp_difference/mean": 0.32129743695259094, "step": 343, "step_time": 26.014876821020152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7055855877697468, "epoch": 0.00344, "grad_norm": 0.07328447699546814, "kl": 0.8837700849398971, "learning_rate": 7.999964877646064e-06, "loss": -0.0925, "step": 344, "step_time": 14.141718180995667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 4.851851940155029, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.030381316319108, "epoch": 0.00345, "frac_reward_zero_std": 0.0, "grad_norm": 0.11938215047121048, "kl": 0.9868023060262203, "learning_rate": 7.99996464920904e-06, "loss": -0.0961, "num_tokens": 7878492.0, "reward": 0.7981066107749939, "reward_std": 0.9546862840652466, "rewards/rollout_reward_func/mean": 0.7981066107749939, "rewards/rollout_reward_func/std": 0.9546862244606018, "sampling/importance_sampling_ratio/max": 1.2293952703475952, "sampling/importance_sampling_ratio/mean": 0.7555244565010071, "sampling/importance_sampling_ratio/min": 2.835558689184836e-06, "sampling/sampling_logp_difference/max": 2.204038619995117, "sampling/sampling_logp_difference/mean": 0.22118940949440002, "step": 345, "step_time": 21.666100433969405 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.030540863983333, "epoch": 0.00346, "grad_norm": 0.10351387411355972, "kl": 0.9534068051725626, "learning_rate": 7.999964420031546e-06, "loss": -0.0964, "step": 346, "step_time": 11.831680586998118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3312581535428762, "epoch": 0.00347, "frac_reward_zero_std": 0.0, "grad_norm": 0.07113005965948105, "kl": 0.48725752998143435, "learning_rate": 7.99996419011358e-06, "loss": -0.0859, "num_tokens": 7934116.0, "reward": 0.5659735202789307, "reward_std": 0.9309138655662537, "rewards/rollout_reward_func/mean": 0.5659735202789307, "rewards/rollout_reward_func/std": 0.9309138655662537, "sampling/importance_sampling_ratio/max": 2.3880512714385986, "sampling/importance_sampling_ratio/mean": 0.7264465093612671, "sampling/importance_sampling_ratio/min": 1.262475564089982e-07, "sampling/sampling_logp_difference/max": 2.5245308876037598, "sampling/sampling_logp_difference/mean": 0.3080235421657562, "step": 347, "step_time": 27.41153142094845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3371150083839893, "epoch": 0.00348, "grad_norm": 0.05538462847471237, "kl": 0.4331676932051778, "learning_rate": 7.999963959455145e-06, "loss": -0.086, "step": 348, "step_time": 14.087515629042173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 5.0740742683410645, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3813710510730743, "epoch": 0.00349, "frac_reward_zero_std": 0.0, "grad_norm": 0.07432521879673004, "kl": 0.6162668485194445, "learning_rate": 7.999963728056238e-06, "loss": -0.0702, "num_tokens": 7992933.0, "reward": 0.5558145642280579, "reward_std": 0.7315611839294434, "rewards/rollout_reward_func/mean": 0.5558145642280579, "rewards/rollout_reward_func/std": 0.7315612435340881, "sampling/importance_sampling_ratio/max": 1.4575819969177246, "sampling/importance_sampling_ratio/mean": 0.793278694152832, "sampling/importance_sampling_ratio/min": 4.906129561277339e-06, "sampling/sampling_logp_difference/max": 2.6581740379333496, "sampling/sampling_logp_difference/mean": 0.30014854669570923, "step": 349, "step_time": 26.62812172897975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3847943423315883, "epoch": 0.0035, "grad_norm": 0.07496560364961624, "kl": 0.5880699437111616, "learning_rate": 7.99996349591686e-06, "loss": -0.0706, "step": 350, "step_time": 14.137814821006032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.625, "completions/mean_terminated_length": 4.258064270019531, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.45472484827041626, "epoch": 0.00351, "frac_reward_zero_std": 0.25, "grad_norm": 0.19620099663734436, "kl": 0.5773568022996187, "learning_rate": 7.999963263037014e-06, "loss": 0.0059, "num_tokens": 8046168.0, "reward": 0.49303197860717773, "reward_std": 0.7971338629722595, "rewards/rollout_reward_func/mean": 0.49303197860717773, "rewards/rollout_reward_func/std": 0.7971338629722595, "sampling/importance_sampling_ratio/max": 1.4669671058654785, "sampling/importance_sampling_ratio/mean": 1.0185784101486206, "sampling/importance_sampling_ratio/min": 0.026735378429293633, "sampling/sampling_logp_difference/max": 1.3940675258636475, "sampling/sampling_logp_difference/mean": 0.11983118206262589, "step": 351, "step_time": 23.448320005991263 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.45726427249610424, "epoch": 0.00352, "grad_norm": 0.2012617290019989, "kl": 0.49645596370100975, "learning_rate": 7.999963029416695e-06, "loss": 0.0056, "step": 352, "step_time": 12.82007256190991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0092080924659967, "epoch": 0.00353, "frac_reward_zero_std": 0.0, "grad_norm": 0.09066823869943619, "kl": 1.263651049695909, "learning_rate": 7.999962795055906e-06, "loss": -0.0789, "num_tokens": 8094509.0, "reward": 0.7720896005630493, "reward_std": 0.8186599016189575, "rewards/rollout_reward_func/mean": 0.7720896005630493, "rewards/rollout_reward_func/std": 0.8186598420143127, "sampling/importance_sampling_ratio/max": 1.3298190832138062, "sampling/importance_sampling_ratio/mean": 0.8216202259063721, "sampling/importance_sampling_ratio/min": 0.00011695465218508616, "sampling/sampling_logp_difference/max": 1.6942646503448486, "sampling/sampling_logp_difference/mean": 0.23975196480751038, "step": 353, "step_time": 23.043397235043813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0173163479194045, "epoch": 0.00354, "grad_norm": 0.08138295263051987, "kl": 1.2148776887916028, "learning_rate": 7.99996255995465e-06, "loss": -0.0791, "step": 354, "step_time": 13.24871151309344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.896551609039307, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8425217010080814, "epoch": 0.00355, "frac_reward_zero_std": 0.0, "grad_norm": 0.15460896492004395, "kl": 1.624922949820757, "learning_rate": 7.99996232411292e-06, "loss": -0.0675, "num_tokens": 8143075.0, "reward": 0.8713451623916626, "reward_std": 0.7617069482803345, "rewards/rollout_reward_func/mean": 0.8713451623916626, "rewards/rollout_reward_func/std": 0.7617068886756897, "sampling/importance_sampling_ratio/max": 1.4283170700073242, "sampling/importance_sampling_ratio/mean": 0.8244152069091797, "sampling/importance_sampling_ratio/min": 3.0680848794872873e-06, "sampling/sampling_logp_difference/max": 3.2570455074310303, "sampling/sampling_logp_difference/mean": 0.2573392987251282, "step": 355, "step_time": 22.672555947996443 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.8470705635845661, "epoch": 0.00356, "grad_norm": 0.1190284788608551, "kl": 1.25016950070858, "learning_rate": 7.999962087530722e-06, "loss": -0.0683, "step": 356, "step_time": 12.99472063803114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.774193286895752, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9531486574560404, "epoch": 0.00357, "frac_reward_zero_std": 0.0, "grad_norm": 0.07629624009132385, "kl": 0.9415738433599472, "learning_rate": 7.999961850208053e-06, "loss": -0.0667, "num_tokens": 8188765.0, "reward": 0.9179764986038208, "reward_std": 0.7455434203147888, "rewards/rollout_reward_func/mean": 0.9179764986038208, "rewards/rollout_reward_func/std": 0.745543360710144, "sampling/importance_sampling_ratio/max": 1.4095871448516846, "sampling/importance_sampling_ratio/mean": 0.9101988673210144, "sampling/importance_sampling_ratio/min": 5.2758433177757524e-09, "sampling/sampling_logp_difference/max": 2.1854984760284424, "sampling/sampling_logp_difference/mean": 0.2739848494529724, "step": 357, "step_time": 21.956031873996835 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.9666181672364473, "epoch": 0.00358, "grad_norm": 0.07826729863882065, "kl": 0.8680874407291412, "learning_rate": 7.999961612144914e-06, "loss": -0.0667, "step": 358, "step_time": 11.652976225013845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.680332887917757, "epoch": 0.00359, "frac_reward_zero_std": 0.25, "grad_norm": 0.05414646118879318, "kl": 0.32367041520774364, "learning_rate": 7.999961373341304e-06, "loss": -0.0535, "num_tokens": 8240585.0, "reward": 0.7715781331062317, "reward_std": 0.7740218639373779, "rewards/rollout_reward_func/mean": 0.7715781331062317, "rewards/rollout_reward_func/std": 0.7740218639373779, "sampling/importance_sampling_ratio/max": 1.7897855043411255, "sampling/importance_sampling_ratio/mean": 1.0004444122314453, "sampling/importance_sampling_ratio/min": 0.0033616768196225166, "sampling/sampling_logp_difference/max": 1.5021097660064697, "sampling/sampling_logp_difference/mean": 0.15294788777828217, "step": 359, "step_time": 24.816772212012438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6884859353303909, "epoch": 0.0036, "grad_norm": 0.05320476368069649, "kl": 0.3191019147634506, "learning_rate": 7.999961133797226e-06, "loss": -0.0533, "step": 360, "step_time": 13.411138291994575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0205447282642126, "epoch": 0.00361, "frac_reward_zero_std": 0.0, "grad_norm": 0.05277977138757706, "kl": 0.6755544068291783, "learning_rate": 7.999960893512676e-06, "loss": -0.0922, "num_tokens": 8294864.0, "reward": 0.9073903560638428, "reward_std": 0.7812069058418274, "rewards/rollout_reward_func/mean": 0.9073903560638428, "rewards/rollout_reward_func/std": 0.7812069058418274, "sampling/importance_sampling_ratio/max": 1.5353492498397827, "sampling/importance_sampling_ratio/mean": 0.8387053608894348, "sampling/importance_sampling_ratio/min": 0.001014054287225008, "sampling/sampling_logp_difference/max": 1.582627296447754, "sampling/sampling_logp_difference/mean": 0.22757013142108917, "step": 361, "step_time": 23.71619789599208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.026074692606926, "epoch": 0.00362, "grad_norm": 0.05447545647621155, "kl": 0.6606953283771873, "learning_rate": 7.999960652487659e-06, "loss": -0.0923, "step": 362, "step_time": 12.292589897057042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.290997078642249, "epoch": 0.00363, "frac_reward_zero_std": 0.0, "grad_norm": 0.08383062481880188, "kl": 0.5829834695905447, "learning_rate": 7.99996041072217e-06, "loss": -0.0898, "num_tokens": 8348084.0, "reward": 0.6407698392868042, "reward_std": 0.7330180406570435, "rewards/rollout_reward_func/mean": 0.6407698392868042, "rewards/rollout_reward_func/std": 0.7330180406570435, "sampling/importance_sampling_ratio/max": 1.3699625730514526, "sampling/importance_sampling_ratio/mean": 0.8168528079986572, "sampling/importance_sampling_ratio/min": 7.341802166216382e-10, "sampling/sampling_logp_difference/max": 2.3543453216552734, "sampling/sampling_logp_difference/mean": 0.35090911388397217, "step": 363, "step_time": 24.666935082001146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2944549024105072, "epoch": 0.00364, "grad_norm": 0.0825154259800911, "kl": 0.5995782092213631, "learning_rate": 7.999960168216212e-06, "loss": -0.0898, "step": 364, "step_time": 14.005133093975019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9078547395765781, "epoch": 0.00365, "frac_reward_zero_std": 0.0, "grad_norm": 0.09349767863750458, "kl": 0.5538847129791975, "learning_rate": 7.999959924969784e-06, "loss": -0.0786, "num_tokens": 8396903.0, "reward": 1.0005215406417847, "reward_std": 0.7477485537528992, "rewards/rollout_reward_func/mean": 1.0005215406417847, "rewards/rollout_reward_func/std": 0.7477485537528992, "sampling/importance_sampling_ratio/max": 1.3415378332138062, "sampling/importance_sampling_ratio/mean": 0.8967766761779785, "sampling/importance_sampling_ratio/min": 2.9154545700293966e-05, "sampling/sampling_logp_difference/max": 2.2546708583831787, "sampling/sampling_logp_difference/mean": 0.21606436371803284, "step": 365, "step_time": 23.844705161987804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9068107381463051, "epoch": 0.00366, "grad_norm": 0.09764689952135086, "kl": 0.5183828473091125, "learning_rate": 7.999959680982886e-06, "loss": -0.0788, "step": 366, "step_time": 13.02033629201469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.576923370361328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1835842374712229, "epoch": 0.00367, "frac_reward_zero_std": 0.0, "grad_norm": 0.0636894479393959, "kl": 0.5345521233975887, "learning_rate": 7.99995943625552e-06, "loss": -0.0672, "num_tokens": 8449777.0, "reward": 0.7913302779197693, "reward_std": 0.7749124765396118, "rewards/rollout_reward_func/mean": 0.7913302779197693, "rewards/rollout_reward_func/std": 0.7749124765396118, "sampling/importance_sampling_ratio/max": 1.3756171464920044, "sampling/importance_sampling_ratio/mean": 0.7677508592605591, "sampling/importance_sampling_ratio/min": 6.843772553111194e-07, "sampling/sampling_logp_difference/max": 2.2980499267578125, "sampling/sampling_logp_difference/mean": 0.31612855195999146, "step": 367, "step_time": 23.49037945602322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1815327685326338, "epoch": 0.00368, "grad_norm": 0.06316441297531128, "kl": 0.5299080274999142, "learning_rate": 7.999959190787684e-06, "loss": -0.0673, "step": 368, "step_time": 12.70907362102298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1944713070988655, "epoch": 0.00369, "frac_reward_zero_std": 0.25, "grad_norm": 0.0646536573767662, "kl": 0.22344722226262093, "learning_rate": 7.999958944579377e-06, "loss": -0.0632, "num_tokens": 8500223.0, "reward": 0.6293580532073975, "reward_std": 0.8725279569625854, "rewards/rollout_reward_func/mean": 0.6293580532073975, "rewards/rollout_reward_func/std": 0.8725279569625854, "sampling/importance_sampling_ratio/max": 1.4233001470565796, "sampling/importance_sampling_ratio/mean": 0.7636790871620178, "sampling/importance_sampling_ratio/min": 0.001563201192766428, "sampling/sampling_logp_difference/max": 1.4377095699310303, "sampling/sampling_logp_difference/mean": 0.20824596285820007, "step": 369, "step_time": 20.435480166052002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1914158537983894, "epoch": 0.0037, "grad_norm": 0.05802124738693237, "kl": 0.23201606050133705, "learning_rate": 7.999958697630603e-06, "loss": -0.0633, "step": 370, "step_time": 10.58519958195393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0623938664793968, "epoch": 0.00371, "frac_reward_zero_std": 0.0, "grad_norm": 0.09370057284832001, "kl": 0.26749021373689175, "learning_rate": 7.999958449941359e-06, "loss": -0.0969, "num_tokens": 8553049.0, "reward": 0.6077262163162231, "reward_std": 0.939456582069397, "rewards/rollout_reward_func/mean": 0.6077262163162231, "rewards/rollout_reward_func/std": 0.939456582069397, "sampling/importance_sampling_ratio/max": 1.6082345247268677, "sampling/importance_sampling_ratio/mean": 0.8838446140289307, "sampling/importance_sampling_ratio/min": 1.7474317104415604e-08, "sampling/sampling_logp_difference/max": 3.2189931869506836, "sampling/sampling_logp_difference/mean": 0.28474050760269165, "step": 371, "step_time": 25.743057002022397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0564960855990648, "epoch": 0.00372, "grad_norm": 0.12126820534467697, "kl": 0.27359381690621376, "learning_rate": 7.999958201511645e-06, "loss": -0.0971, "step": 372, "step_time": 13.491778152965708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.518518447875977, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1082474756985903, "epoch": 0.00373, "frac_reward_zero_std": 0.0, "grad_norm": 0.29934224486351013, "kl": 0.7544006053358316, "learning_rate": 7.999957952341462e-06, "loss": -0.0733, "num_tokens": 8603141.0, "reward": 0.8280901908874512, "reward_std": 0.829933226108551, "rewards/rollout_reward_func/mean": 0.8280901908874512, "rewards/rollout_reward_func/std": 0.829933226108551, "sampling/importance_sampling_ratio/max": 1.843432903289795, "sampling/importance_sampling_ratio/mean": 0.849862277507782, "sampling/importance_sampling_ratio/min": 0.0003984655486419797, "sampling/sampling_logp_difference/max": 1.5210316181182861, "sampling/sampling_logp_difference/mean": 0.20991511642932892, "step": 373, "step_time": 25.768735998979537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1072499491274357, "epoch": 0.00374, "grad_norm": 0.3340734839439392, "kl": 0.8388786353170872, "learning_rate": 7.99995770243081e-06, "loss": -0.0741, "step": 374, "step_time": 13.54401409701677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.440000057220459, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4538812171667814, "epoch": 0.00375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2117272913455963, "kl": 0.5002050511538982, "learning_rate": 7.999957451779688e-06, "loss": -0.0726, "num_tokens": 8650601.0, "reward": 0.2057700753211975, "reward_std": 0.8208404183387756, "rewards/rollout_reward_func/mean": 0.2057700753211975, "rewards/rollout_reward_func/std": 0.8208404183387756, "sampling/importance_sampling_ratio/max": 1.329992413520813, "sampling/importance_sampling_ratio/mean": 0.6939770579338074, "sampling/importance_sampling_ratio/min": 2.284511024797098e-09, "sampling/sampling_logp_difference/max": 2.1204490661621094, "sampling/sampling_logp_difference/mean": 0.33914655447006226, "step": 375, "step_time": 23.75850726698991 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.012620192486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01574519253335893, "entropy": 1.4548140689730644, "epoch": 0.00376, "grad_norm": 0.08394403010606766, "kl": 0.5418041963130236, "learning_rate": 7.9999572003881e-06, "loss": -0.0741, "step": 376, "step_time": 12.446618828078499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.998291902244091, "epoch": 0.00377, "frac_reward_zero_std": 0.0, "grad_norm": 0.0957830622792244, "kl": 1.438781037926674, "learning_rate": 7.99995694825604e-06, "loss": -0.0775, "num_tokens": 8707037.0, "reward": 0.7983344793319702, "reward_std": 0.7615047693252563, "rewards/rollout_reward_func/mean": 0.7983344793319702, "rewards/rollout_reward_func/std": 0.7615048289299011, "sampling/importance_sampling_ratio/max": 1.4950755834579468, "sampling/importance_sampling_ratio/mean": 0.8268066048622131, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.2729220390319824, "sampling/sampling_logp_difference/mean": 0.2676367461681366, "step": 377, "step_time": 24.465777506004088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0000526197254658, "epoch": 0.00378, "grad_norm": 0.1023310199379921, "kl": 1.452454935759306, "learning_rate": 7.999956695383513e-06, "loss": -0.0776, "step": 378, "step_time": 13.512239406001754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.517241477966309, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0373698808252811, "epoch": 0.00379, "frac_reward_zero_std": 0.0, "grad_norm": 0.09718213975429535, "kl": 0.6196195101365447, "learning_rate": 7.999956441770516e-06, "loss": -0.0786, "num_tokens": 8757420.0, "reward": 0.6634821891784668, "reward_std": 0.9056584239006042, "rewards/rollout_reward_func/mean": 0.6634821891784668, "rewards/rollout_reward_func/std": 0.9056584239006042, "sampling/importance_sampling_ratio/max": 1.4262349605560303, "sampling/importance_sampling_ratio/mean": 0.9045270681381226, "sampling/importance_sampling_ratio/min": 1.7102303900173865e-05, "sampling/sampling_logp_difference/max": 1.7436060905456543, "sampling/sampling_logp_difference/mean": 0.2094709277153015, "step": 379, "step_time": 24.181202748935902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0424365047365427, "epoch": 0.0038, "grad_norm": 0.08767680078744888, "kl": 0.6263941447250545, "learning_rate": 7.999956187417052e-06, "loss": -0.0787, "step": 380, "step_time": 13.89202653898974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.296296119689941, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0549693871289492, "epoch": 0.00381, "frac_reward_zero_std": 0.25, "grad_norm": 0.05992303416132927, "kl": 0.5186265911906958, "learning_rate": 7.999955932323117e-06, "loss": -0.0783, "num_tokens": 8813169.0, "reward": 0.887982189655304, "reward_std": 0.8328939080238342, "rewards/rollout_reward_func/mean": 0.887982189655304, "rewards/rollout_reward_func/std": 0.8328940272331238, "sampling/importance_sampling_ratio/max": 1.415076494216919, "sampling/importance_sampling_ratio/mean": 0.8602277636528015, "sampling/importance_sampling_ratio/min": 2.590683436665131e-07, "sampling/sampling_logp_difference/max": 3.2846972942352295, "sampling/sampling_logp_difference/mean": 0.2537146210670471, "step": 381, "step_time": 24.599854269006755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.051557069644332, "epoch": 0.00382, "grad_norm": 0.055801115930080414, "kl": 0.5326721668243408, "learning_rate": 7.999955676488715e-06, "loss": -0.0783, "step": 382, "step_time": 12.810626295016846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2758464086800814, "epoch": 0.00383, "frac_reward_zero_std": 0.0, "grad_norm": 0.048292141407728195, "kl": 0.7302441634237766, "learning_rate": 7.999955419913844e-06, "loss": -0.0467, "num_tokens": 8867285.0, "reward": 0.6718636751174927, "reward_std": 0.8362178206443787, "rewards/rollout_reward_func/mean": 0.6718636751174927, "rewards/rollout_reward_func/std": 0.8362177610397339, "sampling/importance_sampling_ratio/max": 1.9238306283950806, "sampling/importance_sampling_ratio/mean": 0.8151154518127441, "sampling/importance_sampling_ratio/min": 1.3850686627847608e-05, "sampling/sampling_logp_difference/max": 2.472620964050293, "sampling/sampling_logp_difference/mean": 0.3076561689376831, "step": 383, "step_time": 25.239236074034125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2731224680319428, "epoch": 0.00384, "grad_norm": 0.04779981076717377, "kl": 0.7673952393233776, "learning_rate": 7.999955162598504e-06, "loss": -0.0466, "step": 384, "step_time": 13.603929500997765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0058060819283128, "epoch": 0.00385, "frac_reward_zero_std": 0.0, "grad_norm": 0.04009459167718887, "kl": 0.3953485954552889, "learning_rate": 7.999954904542697e-06, "loss": -0.0859, "num_tokens": 8918068.0, "reward": 0.753899335861206, "reward_std": 0.8987948298454285, "rewards/rollout_reward_func/mean": 0.753899335861206, "rewards/rollout_reward_func/std": 0.8987948298454285, "sampling/importance_sampling_ratio/max": 1.4695438146591187, "sampling/importance_sampling_ratio/mean": 0.8974989652633667, "sampling/importance_sampling_ratio/min": 1.3405534446064848e-05, "sampling/sampling_logp_difference/max": 1.664438247680664, "sampling/sampling_logp_difference/mean": 0.2545661926269531, "step": 385, "step_time": 22.86367527107359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0043721813708544, "epoch": 0.00386, "grad_norm": 0.04081485792994499, "kl": 0.42175344470888376, "learning_rate": 7.999954645746422e-06, "loss": -0.0858, "step": 386, "step_time": 12.249511568021262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9078443879261613, "epoch": 0.00387, "frac_reward_zero_std": 0.0, "grad_norm": 0.12616467475891113, "kl": 0.8437868170440197, "learning_rate": 7.999954386209677e-06, "loss": -0.0493, "num_tokens": 8977260.0, "reward": 0.8113418817520142, "reward_std": 0.7680149078369141, "rewards/rollout_reward_func/mean": 0.8113418817520142, "rewards/rollout_reward_func/std": 0.7680148482322693, "sampling/importance_sampling_ratio/max": 1.537463665008545, "sampling/importance_sampling_ratio/mean": 0.8289440274238586, "sampling/importance_sampling_ratio/min": 3.111409750999883e-05, "sampling/sampling_logp_difference/max": 1.522384762763977, "sampling/sampling_logp_difference/mean": 0.19962090253829956, "step": 387, "step_time": 23.771737719973316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9034461230039597, "epoch": 0.00388, "grad_norm": 0.12985466420650482, "kl": 0.8166744150221348, "learning_rate": 7.999954125932465e-06, "loss": -0.0498, "step": 388, "step_time": 13.391169528971659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.53125, "completions/mean_terminated_length": 4.161290168762207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6038425602018833, "epoch": 0.00389, "frac_reward_zero_std": 0.0, "grad_norm": 0.12813211977481842, "kl": 0.4672217331826687, "learning_rate": 7.999953864914783e-06, "loss": -0.0467, "num_tokens": 9033173.0, "reward": 0.7783325910568237, "reward_std": 0.691533088684082, "rewards/rollout_reward_func/mean": 0.7783325910568237, "rewards/rollout_reward_func/std": 0.691533088684082, "sampling/importance_sampling_ratio/max": 1.3999277353286743, "sampling/importance_sampling_ratio/mean": 0.9893268942832947, "sampling/importance_sampling_ratio/min": 6.454932736232877e-05, "sampling/sampling_logp_difference/max": 1.7020831108093262, "sampling/sampling_logp_difference/mean": 0.16682572662830353, "step": 389, "step_time": 24.876497352088336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 0.6048170682042837, "epoch": 0.0039, "grad_norm": 0.10125811398029327, "kl": 0.466050211340189, "learning_rate": 7.999953603156633e-06, "loss": -0.0468, "step": 390, "step_time": 13.98095569992438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.793103218078613, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1407796638086438, "epoch": 0.00391, "frac_reward_zero_std": 0.0, "grad_norm": 0.05779409408569336, "kl": 0.6512171449139714, "learning_rate": 7.999953340658018e-06, "loss": -0.0825, "num_tokens": 9078024.0, "reward": 0.7913057804107666, "reward_std": 0.8911302089691162, "rewards/rollout_reward_func/mean": 0.7913057804107666, "rewards/rollout_reward_func/std": 0.8911302089691162, "sampling/importance_sampling_ratio/max": 1.2301932573318481, "sampling/importance_sampling_ratio/mean": 0.8413695096969604, "sampling/importance_sampling_ratio/min": 1.430242718925001e-06, "sampling/sampling_logp_difference/max": 2.042909622192383, "sampling/sampling_logp_difference/mean": 0.2746858596801758, "step": 391, "step_time": 22.0577185570437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1487870458513498, "epoch": 0.00392, "grad_norm": 0.06371783465147018, "kl": 0.6409559100866318, "learning_rate": 7.999953077418933e-06, "loss": -0.0827, "step": 392, "step_time": 12.24397256798693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.961538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.501237003132701, "epoch": 0.00393, "frac_reward_zero_std": 0.0, "grad_norm": 0.08097249269485474, "kl": 0.5347633026540279, "learning_rate": 7.99995281343938e-06, "loss": -0.0749, "num_tokens": 9125775.0, "reward": 0.5445797443389893, "reward_std": 0.9025285243988037, "rewards/rollout_reward_func/mean": 0.5445797443389893, "rewards/rollout_reward_func/std": 0.9025284647941589, "sampling/importance_sampling_ratio/max": 1.645660161972046, "sampling/importance_sampling_ratio/mean": 0.763586163520813, "sampling/importance_sampling_ratio/min": 3.5022860629396746e-06, "sampling/sampling_logp_difference/max": 1.965435266494751, "sampling/sampling_logp_difference/mean": 0.3349838852882385, "step": 393, "step_time": 19.885794762987643 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.5067116357386112, "epoch": 0.00394, "grad_norm": 0.07570907473564148, "kl": 0.49182742089033127, "learning_rate": 7.99995254871936e-06, "loss": -0.0752, "step": 394, "step_time": 10.210016845056089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.464285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8159753428772092, "epoch": 0.00395, "frac_reward_zero_std": 0.0, "grad_norm": 0.13683238625526428, "kl": 0.33482627011835575, "learning_rate": 7.999952283258871e-06, "loss": -0.067, "num_tokens": 9174670.0, "reward": 0.5586168766021729, "reward_std": 0.915533185005188, "rewards/rollout_reward_func/mean": 0.5586168766021729, "rewards/rollout_reward_func/std": 0.9155331254005432, "sampling/importance_sampling_ratio/max": 1.3975480794906616, "sampling/importance_sampling_ratio/mean": 0.9326298236846924, "sampling/importance_sampling_ratio/min": 0.0015488278586417437, "sampling/sampling_logp_difference/max": 1.6552093029022217, "sampling/sampling_logp_difference/mean": 0.16384834051132202, "step": 395, "step_time": 23.562412134953775 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 0.8236191011965275, "epoch": 0.00396, "grad_norm": 0.1299774944782257, "kl": 0.3193251471966505, "learning_rate": 7.999952017057914e-06, "loss": -0.0676, "step": 396, "step_time": 12.32233847497264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9946891106665134, "epoch": 0.00397, "frac_reward_zero_std": 0.25, "grad_norm": 0.052238985896110535, "kl": 0.5066568236798048, "learning_rate": 7.99995175011649e-06, "loss": -0.0406, "num_tokens": 9217703.0, "reward": 0.86269211769104, "reward_std": 0.758620023727417, "rewards/rollout_reward_func/mean": 0.86269211769104, "rewards/rollout_reward_func/std": 0.758620023727417, "sampling/importance_sampling_ratio/max": 1.2760823965072632, "sampling/importance_sampling_ratio/mean": 0.9090371131896973, "sampling/importance_sampling_ratio/min": 1.2145028449594975e-06, "sampling/sampling_logp_difference/max": 2.0877249240875244, "sampling/sampling_logp_difference/mean": 0.22790497541427612, "step": 397, "step_time": 19.526525515015237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9982662480324507, "epoch": 0.00398, "grad_norm": 0.04990943521261215, "kl": 0.47844006307423115, "learning_rate": 7.9999514824346e-06, "loss": -0.0407, "step": 398, "step_time": 10.784963148966199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 4.708333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5114845670759678, "epoch": 0.00399, "frac_reward_zero_std": 0.0, "grad_norm": 0.1044623851776123, "kl": 0.7151965722441673, "learning_rate": 7.999951214012241e-06, "loss": -0.0902, "num_tokens": 9265832.0, "reward": 0.5737147927284241, "reward_std": 0.9466375112533569, "rewards/rollout_reward_func/mean": 0.5737147927284241, "rewards/rollout_reward_func/std": 0.9466375112533569, "sampling/importance_sampling_ratio/max": 1.4181602001190186, "sampling/importance_sampling_ratio/mean": 0.7383992671966553, "sampling/importance_sampling_ratio/min": 4.984715815226082e-06, "sampling/sampling_logp_difference/max": 2.455230236053467, "sampling/sampling_logp_difference/mean": 0.30062729120254517, "step": 399, "step_time": 23.474347551062237 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.007925724610686302, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007925724610686302, "entropy": 1.515154829248786, "epoch": 0.004, "grad_norm": 0.08349961787462234, "kl": 0.6339620985090733, "learning_rate": 7.999950944849416e-06, "loss": -0.0906, "step": 400, "step_time": 12.835298585006967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.148148059844971, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0741925928741693, "epoch": 0.00401, "frac_reward_zero_std": 0.25, "grad_norm": 0.027165349572896957, "kl": 0.4012613147497177, "learning_rate": 7.999950674946121e-06, "loss": -0.0521, "num_tokens": 9317649.0, "reward": 0.8400805592536926, "reward_std": 0.8230050206184387, "rewards/rollout_reward_func/mean": 0.8400805592536926, "rewards/rollout_reward_func/std": 0.8230050206184387, "sampling/importance_sampling_ratio/max": 1.2967562675476074, "sampling/importance_sampling_ratio/mean": 0.9088068008422852, "sampling/importance_sampling_ratio/min": 3.592750263958777e-10, "sampling/sampling_logp_difference/max": 2.393176555633545, "sampling/sampling_logp_difference/mean": 0.2764817178249359, "step": 401, "step_time": 23.673808438004926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0775948241353035, "epoch": 0.00402, "grad_norm": 0.027421114966273308, "kl": 0.40454868227243423, "learning_rate": 7.99995040430236e-06, "loss": -0.0521, "step": 402, "step_time": 12.563576710032066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0395708438009024, "epoch": 0.00403, "frac_reward_zero_std": 0.25, "grad_norm": 0.030510390177369118, "kl": 0.31272025406360626, "learning_rate": 7.999950132918132e-06, "loss": -0.0578, "num_tokens": 9370386.0, "reward": 0.7701454162597656, "reward_std": 0.8735735416412354, "rewards/rollout_reward_func/mean": 0.7701454162597656, "rewards/rollout_reward_func/std": 0.8735736608505249, "sampling/importance_sampling_ratio/max": 1.297706961631775, "sampling/importance_sampling_ratio/mean": 0.8638167977333069, "sampling/importance_sampling_ratio/min": 8.409459042013623e-06, "sampling/sampling_logp_difference/max": 1.7644693851470947, "sampling/sampling_logp_difference/mean": 0.23505407571792603, "step": 403, "step_time": 25.318404807010666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0402543917298317, "epoch": 0.00404, "grad_norm": 0.029907837510108948, "kl": 0.305375587195158, "learning_rate": 7.999949860793436e-06, "loss": -0.0577, "step": 404, "step_time": 12.787500503036426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.78125, "completions/mean_terminated_length": 4.033333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6658486537635326, "epoch": 0.00405, "frac_reward_zero_std": 0.25, "grad_norm": 0.052308425307273865, "kl": 1.0152396019548178, "learning_rate": 7.999949587928276e-06, "loss": -0.0638, "num_tokens": 9411645.0, "reward": 1.0738770961761475, "reward_std": 0.7650061249732971, "rewards/rollout_reward_func/mean": 1.0738770961761475, "rewards/rollout_reward_func/std": 0.7650061249732971, "sampling/importance_sampling_ratio/max": 1.3035898208618164, "sampling/importance_sampling_ratio/mean": 0.9230852127075195, "sampling/importance_sampling_ratio/min": 0.0012829304905608296, "sampling/sampling_logp_difference/max": 1.526928424835205, "sampling/sampling_logp_difference/mean": 0.1453637033700943, "step": 405, "step_time": 22.596890756016364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.66778664290905, "epoch": 0.00406, "grad_norm": 0.05151323974132538, "kl": 1.0252370461821556, "learning_rate": 7.999949314322646e-06, "loss": -0.0638, "step": 406, "step_time": 12.082583676994545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.185185432434082, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2881220430135727, "epoch": 0.00407, "frac_reward_zero_std": 0.0, "grad_norm": 0.123403400182724, "kl": 0.36667309794574976, "learning_rate": 7.999949039976548e-06, "loss": -0.0635, "num_tokens": 9458346.0, "reward": 0.9557647705078125, "reward_std": 0.7630170583724976, "rewards/rollout_reward_func/mean": 0.9557647705078125, "rewards/rollout_reward_func/std": 0.7630170583724976, "sampling/importance_sampling_ratio/max": 2.4481518268585205, "sampling/importance_sampling_ratio/mean": 0.9207555055618286, "sampling/importance_sampling_ratio/min": 6.410352284547116e-07, "sampling/sampling_logp_difference/max": 2.633133888244629, "sampling/sampling_logp_difference/mean": 0.28725773096084595, "step": 407, "step_time": 25.14300957202795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2888096421957016, "epoch": 0.00408, "grad_norm": 0.1305839717388153, "kl": 0.3683302104473114, "learning_rate": 7.999948764889987e-06, "loss": -0.0639, "step": 408, "step_time": 13.388555101962993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.107142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7566611003130674, "epoch": 0.00409, "frac_reward_zero_std": 0.0, "grad_norm": 0.06466685980558395, "kl": 0.29092769883573055, "learning_rate": 7.999948489062955e-06, "loss": -0.0701, "num_tokens": 9503344.0, "reward": 0.22832335531711578, "reward_std": 0.8431484699249268, "rewards/rollout_reward_func/mean": 0.22832335531711578, "rewards/rollout_reward_func/std": 0.8431484699249268, "sampling/importance_sampling_ratio/max": 1.601158618927002, "sampling/importance_sampling_ratio/mean": 0.9730607271194458, "sampling/importance_sampling_ratio/min": 0.002676282776519656, "sampling/sampling_logp_difference/max": 1.0960980653762817, "sampling/sampling_logp_difference/mean": 0.14870372414588928, "step": 409, "step_time": 22.31595805697725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7521941438317299, "epoch": 0.0041, "grad_norm": 0.058444760739803314, "kl": 0.2912544496357441, "learning_rate": 7.99994821249546e-06, "loss": -0.0703, "step": 410, "step_time": 12.67445758197573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2036710940301418, "epoch": 0.00411, "frac_reward_zero_std": 0.25, "grad_norm": 0.07628532499074936, "kl": 0.40507348719984293, "learning_rate": 7.999947935187496e-06, "loss": -0.0735, "num_tokens": 9556932.0, "reward": 0.687767505645752, "reward_std": 0.8163596987724304, "rewards/rollout_reward_func/mean": 0.687767505645752, "rewards/rollout_reward_func/std": 0.8163597583770752, "sampling/importance_sampling_ratio/max": 1.5429061651229858, "sampling/importance_sampling_ratio/mean": 0.8370476961135864, "sampling/importance_sampling_ratio/min": 0.0015227055409923196, "sampling/sampling_logp_difference/max": 1.5418283939361572, "sampling/sampling_logp_difference/mean": 0.21634629368782043, "step": 411, "step_time": 24.929647823009873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1989325173199177, "epoch": 0.00412, "grad_norm": 0.07598745822906494, "kl": 0.4203215930610895, "learning_rate": 7.999947657139067e-06, "loss": -0.0738, "step": 412, "step_time": 13.798840748000657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 5.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3197643272578716, "epoch": 0.00413, "frac_reward_zero_std": 0.0, "grad_norm": 0.10054398328065872, "kl": 0.8467299938201904, "learning_rate": 7.99994737835017e-06, "loss": -0.0632, "num_tokens": 9612043.0, "reward": 0.748711109161377, "reward_std": 0.7917857766151428, "rewards/rollout_reward_func/mean": 0.748711109161377, "rewards/rollout_reward_func/std": 0.7917857766151428, "sampling/importance_sampling_ratio/max": 1.5048890113830566, "sampling/importance_sampling_ratio/mean": 0.7132800817489624, "sampling/importance_sampling_ratio/min": 1.133326577473781e-06, "sampling/sampling_logp_difference/max": 2.3461432456970215, "sampling/sampling_logp_difference/mean": 0.30249232053756714, "step": 413, "step_time": 26.155911732988898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.315633973106742, "epoch": 0.00414, "grad_norm": 0.10230891406536102, "kl": 0.877124760299921, "learning_rate": 7.999947098820806e-06, "loss": -0.0631, "step": 414, "step_time": 14.268861923948862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.133333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5444227643311024, "epoch": 0.00415, "frac_reward_zero_std": 0.0, "grad_norm": 0.23019617795944214, "kl": 2.134033642709255, "learning_rate": 7.999946818550977e-06, "loss": -0.0625, "num_tokens": 9658917.0, "reward": 0.787929356098175, "reward_std": 0.7678807377815247, "rewards/rollout_reward_func/mean": 0.787929356098175, "rewards/rollout_reward_func/std": 0.7678807377815247, "sampling/importance_sampling_ratio/max": 1.314962387084961, "sampling/importance_sampling_ratio/mean": 0.8919274210929871, "sampling/importance_sampling_ratio/min": 0.014437221921980381, "sampling/sampling_logp_difference/max": 1.7343335151672363, "sampling/sampling_logp_difference/mean": 0.13787348568439484, "step": 415, "step_time": 18.707480162993306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5497943228110671, "epoch": 0.00416, "grad_norm": 0.1623130440711975, "kl": 1.6740547623485327, "learning_rate": 7.99994653754068e-06, "loss": -0.0636, "step": 416, "step_time": 10.356442133983364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.90625, "completions/mean_terminated_length": 4.548387050628662, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7014513239264488, "epoch": 0.00417, "frac_reward_zero_std": 0.25, "grad_norm": 0.04980778321623802, "kl": 0.8554966598749161, "learning_rate": 7.999946255789918e-06, "loss": -0.041, "num_tokens": 9696485.0, "reward": 0.8666555881500244, "reward_std": 0.8755183815956116, "rewards/rollout_reward_func/mean": 0.8666555881500244, "rewards/rollout_reward_func/std": 0.8755183815956116, "sampling/importance_sampling_ratio/max": 1.190915822982788, "sampling/importance_sampling_ratio/mean": 0.8890873789787292, "sampling/importance_sampling_ratio/min": 4.203178104944527e-05, "sampling/sampling_logp_difference/max": 2.130462408065796, "sampling/sampling_logp_difference/mean": 0.1969638466835022, "step": 417, "step_time": 16.976363141991897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7084386926144361, "epoch": 0.00418, "grad_norm": 0.04949556663632393, "kl": 0.860804196447134, "learning_rate": 7.99994597329869e-06, "loss": -0.0411, "step": 418, "step_time": 10.251055111992173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.65625, "completions/mean_terminated_length": 4.586206912994385, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7709817532449961, "epoch": 0.00419, "frac_reward_zero_std": 0.0, "grad_norm": 12.053033828735352, "kl": 34.11589650250971, "learning_rate": 7.999945690066996e-06, "loss": 0.0053, "num_tokens": 9743842.0, "reward": 0.9392554759979248, "reward_std": 0.7727034091949463, "rewards/rollout_reward_func/mean": 0.9392554759979248, "rewards/rollout_reward_func/std": 0.7727034091949463, "sampling/importance_sampling_ratio/max": 1.1810015439987183, "sampling/importance_sampling_ratio/mean": 0.8526148200035095, "sampling/importance_sampling_ratio/min": 6.261462840484455e-05, "sampling/sampling_logp_difference/max": 4.950056076049805, "sampling/sampling_logp_difference/mean": 0.23111699521541595, "step": 419, "step_time": 21.11911726204562 }, { "clip_ratio/high_max": 0.014727011788636446, "clip_ratio/high_mean": 0.007363505894318223, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007363505894318223, "entropy": 0.787291930988431, "epoch": 0.0042, "grad_norm": 1.1348159313201904, "kl": 2.437524400651455, "learning_rate": 7.999945406094835e-06, "loss": -0.0735, "step": 420, "step_time": 11.358216327003902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 5.068965435028076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8941318839788437, "epoch": 0.00421, "frac_reward_zero_std": 0.0, "grad_norm": 0.05743320286273956, "kl": 0.341397050768137, "learning_rate": 7.999945121382207e-06, "loss": -0.0596, "num_tokens": 9797291.0, "reward": 0.9493669271469116, "reward_std": 0.7283130288124084, "rewards/rollout_reward_func/mean": 0.9493669271469116, "rewards/rollout_reward_func/std": 0.7283130288124084, "sampling/importance_sampling_ratio/max": 1.6341075897216797, "sampling/importance_sampling_ratio/mean": 0.816868245601654, "sampling/importance_sampling_ratio/min": 7.256078242789954e-05, "sampling/sampling_logp_difference/max": 1.4957844018936157, "sampling/sampling_logp_difference/mean": 0.19687753915786743, "step": 421, "step_time": 25.86495052301325 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.9127088692039251, "epoch": 0.00422, "grad_norm": 0.0713726207613945, "kl": 0.3017097022384405, "learning_rate": 7.999944835929116e-06, "loss": -0.0594, "step": 422, "step_time": 13.627120669960277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.620689868927002, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0078777633607388, "epoch": 0.00423, "frac_reward_zero_std": 0.25, "grad_norm": 0.06953392177820206, "kl": 0.3037557229399681, "learning_rate": 7.999944549735557e-06, "loss": -0.0445, "num_tokens": 9838434.0, "reward": 0.986161470413208, "reward_std": 0.6894993782043457, "rewards/rollout_reward_func/mean": 0.986161470413208, "rewards/rollout_reward_func/std": 0.6894993185997009, "sampling/importance_sampling_ratio/max": 1.4387962818145752, "sampling/importance_sampling_ratio/mean": 0.8362661600112915, "sampling/importance_sampling_ratio/min": 4.941530733049149e-06, "sampling/sampling_logp_difference/max": 2.251340627670288, "sampling/sampling_logp_difference/mean": 0.20533576607704163, "step": 423, "step_time": 18.713772529998096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0176112800836563, "epoch": 0.00424, "grad_norm": 0.07309430837631226, "kl": 0.2958119176328182, "learning_rate": 7.999944262801533e-06, "loss": -0.0444, "step": 424, "step_time": 10.854071523033781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.96875, "completions/mean_terminated_length": 4.233333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6041507795453072, "epoch": 0.00425, "frac_reward_zero_std": 0.25, "grad_norm": 0.05576584115624428, "kl": 0.3064746465533972, "learning_rate": 7.999943975127043e-06, "loss": -0.032, "num_tokens": 9890826.0, "reward": 0.9138927459716797, "reward_std": 0.7086270451545715, "rewards/rollout_reward_func/mean": 0.9138927459716797, "rewards/rollout_reward_func/std": 0.7086270451545715, "sampling/importance_sampling_ratio/max": 1.8442261219024658, "sampling/importance_sampling_ratio/mean": 0.9979860186576843, "sampling/importance_sampling_ratio/min": 5.901440090383403e-05, "sampling/sampling_logp_difference/max": 2.076197624206543, "sampling/sampling_logp_difference/mean": 0.1426524817943573, "step": 425, "step_time": 22.393034399021417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6113379783928394, "epoch": 0.00426, "grad_norm": 0.05685051158070564, "kl": 0.305101839825511, "learning_rate": 7.999943686712088e-06, "loss": -0.032, "step": 426, "step_time": 12.490566568973009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.8275861740112305, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.31564911454916, "epoch": 0.00427, "frac_reward_zero_std": 0.0, "grad_norm": 0.07715099304914474, "kl": 0.5486567616462708, "learning_rate": 7.999943397556666e-06, "loss": -0.0717, "num_tokens": 9930172.0, "reward": 1.1436715126037598, "reward_std": 0.7249509692192078, "rewards/rollout_reward_func/mean": 1.1436715126037598, "rewards/rollout_reward_func/std": 0.7249509692192078, "sampling/importance_sampling_ratio/max": 1.321905493736267, "sampling/importance_sampling_ratio/mean": 0.8582471609115601, "sampling/importance_sampling_ratio/min": 3.8588463979749577e-08, "sampling/sampling_logp_difference/max": 2.0831518173217773, "sampling/sampling_logp_difference/mean": 0.2968835234642029, "step": 427, "step_time": 17.97744349797722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3147878907620907, "epoch": 0.00428, "grad_norm": 0.07733435183763504, "kl": 0.5582096166908741, "learning_rate": 7.99994310766078e-06, "loss": -0.0715, "step": 428, "step_time": 10.374921944981907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 4.259259223937988, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5525561347603798, "epoch": 0.00429, "frac_reward_zero_std": 0.0, "grad_norm": 0.04358163848519325, "kl": 0.8072533402591944, "learning_rate": 7.999942817024428e-06, "loss": -0.0809, "num_tokens": 9978012.0, "reward": 0.5558370351791382, "reward_std": 0.8866031765937805, "rewards/rollout_reward_func/mean": 0.5558370351791382, "rewards/rollout_reward_func/std": 0.8866031765937805, "sampling/importance_sampling_ratio/max": 1.2641386985778809, "sampling/importance_sampling_ratio/mean": 0.7120373249053955, "sampling/importance_sampling_ratio/min": 8.796452988235615e-08, "sampling/sampling_logp_difference/max": 1.9315682649612427, "sampling/sampling_logp_difference/mean": 0.3271472752094269, "step": 429, "step_time": 24.480102455971064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.551437422633171, "epoch": 0.0043, "grad_norm": 0.04190121218562126, "kl": 0.8577346932142973, "learning_rate": 7.99994252564761e-06, "loss": -0.081, "step": 430, "step_time": 13.457015213003615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.220441646873951, "epoch": 0.00431, "frac_reward_zero_std": 0.0, "grad_norm": 0.05914371833205223, "kl": 0.43658007122576237, "learning_rate": 7.999942233530327e-06, "loss": -0.074, "num_tokens": 10028176.0, "reward": 0.49083253741264343, "reward_std": 0.7564385533332825, "rewards/rollout_reward_func/mean": 0.49083253741264343, "rewards/rollout_reward_func/std": 0.7564385533332825, "sampling/importance_sampling_ratio/max": 1.4037858247756958, "sampling/importance_sampling_ratio/mean": 0.8222168684005737, "sampling/importance_sampling_ratio/min": 0.0015871624927967787, "sampling/sampling_logp_difference/max": 1.7039592266082764, "sampling/sampling_logp_difference/mean": 0.21238358318805695, "step": 431, "step_time": 20.26823779309052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2108271531760693, "epoch": 0.00432, "grad_norm": 0.05301377549767494, "kl": 0.4556863587349653, "learning_rate": 7.999941940672578e-06, "loss": -0.074, "step": 432, "step_time": 11.008747112035053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.296296119689941, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2428869660943747, "epoch": 0.00433, "frac_reward_zero_std": 0.0, "grad_norm": 0.07406355440616608, "kl": 0.3677310384809971, "learning_rate": 7.999941647074366e-06, "loss": -0.0679, "num_tokens": 10079605.0, "reward": 0.38076817989349365, "reward_std": 0.7797383069992065, "rewards/rollout_reward_func/mean": 0.38076817989349365, "rewards/rollout_reward_func/std": 0.7797383666038513, "sampling/importance_sampling_ratio/max": 1.619674563407898, "sampling/importance_sampling_ratio/mean": 0.8351776599884033, "sampling/importance_sampling_ratio/min": 2.2785712872064323e-07, "sampling/sampling_logp_difference/max": 2.050109386444092, "sampling/sampling_logp_difference/mean": 0.2711907923221588, "step": 433, "step_time": 25.67403205501614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2385822664946318, "epoch": 0.00434, "grad_norm": 0.07353239506483078, "kl": 0.3750270865857601, "learning_rate": 7.999941352735688e-06, "loss": -0.0679, "step": 434, "step_time": 14.076423360034823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 5.100000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.282910943031311, "epoch": 0.00435, "frac_reward_zero_std": 0.0, "grad_norm": 0.04898824170231819, "kl": 0.6150991432368755, "learning_rate": 7.999941057656543e-06, "loss": -0.0861, "num_tokens": 10119727.0, "reward": 1.0145399570465088, "reward_std": 0.6562444567680359, "rewards/rollout_reward_func/mean": 1.0145399570465088, "rewards/rollout_reward_func/std": 0.6562444567680359, "sampling/importance_sampling_ratio/max": 1.2796550989151, "sampling/importance_sampling_ratio/mean": 0.7547486424446106, "sampling/importance_sampling_ratio/min": 4.783091753779445e-06, "sampling/sampling_logp_difference/max": 2.1289281845092773, "sampling/sampling_logp_difference/mean": 0.257715106010437, "step": 435, "step_time": 19.79769752197899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2764445394277573, "epoch": 0.00436, "grad_norm": 0.045514751225709915, "kl": 0.6331002656370401, "learning_rate": 7.999940761836937e-06, "loss": -0.0863, "step": 436, "step_time": 10.939296626020223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.600000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9559417255222797, "epoch": 0.00437, "frac_reward_zero_std": 0.0, "grad_norm": 0.10249693691730499, "kl": 0.5899664256721735, "learning_rate": 7.999940465276862e-06, "loss": -0.0486, "num_tokens": 10165351.0, "reward": 0.9921737313270569, "reward_std": 0.6132720708847046, "rewards/rollout_reward_func/mean": 0.9921737313270569, "rewards/rollout_reward_func/std": 0.6132720708847046, "sampling/importance_sampling_ratio/max": 1.2658427953720093, "sampling/importance_sampling_ratio/mean": 0.9128040075302124, "sampling/importance_sampling_ratio/min": 1.1729628567991313e-05, "sampling/sampling_logp_difference/max": 2.0558977127075195, "sampling/sampling_logp_difference/mean": 0.2166210412979126, "step": 437, "step_time": 18.60990653000772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9407771285623312, "epoch": 0.00438, "grad_norm": 0.10027054697275162, "kl": 0.6499553360044956, "learning_rate": 7.999940167976326e-06, "loss": -0.0486, "step": 438, "step_time": 10.652584114955971 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9393727704882622, "epoch": 0.00439, "frac_reward_zero_std": 0.0, "grad_norm": 0.16807715594768524, "kl": 0.7448949441313744, "learning_rate": 7.999939869935323e-06, "loss": -0.0672, "num_tokens": 10223406.0, "reward": 0.6257652640342712, "reward_std": 0.8476064205169678, "rewards/rollout_reward_func/mean": 0.6257652640342712, "rewards/rollout_reward_func/std": 0.847606360912323, "sampling/importance_sampling_ratio/max": 1.2960060834884644, "sampling/importance_sampling_ratio/mean": 0.8532721400260925, "sampling/importance_sampling_ratio/min": 0.001135560218244791, "sampling/sampling_logp_difference/max": 1.8688627481460571, "sampling/sampling_logp_difference/mean": 0.19936782121658325, "step": 439, "step_time": 25.170635411981493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9423157274723053, "epoch": 0.0044, "grad_norm": 0.14979654550552368, "kl": 0.8712356761097908, "learning_rate": 7.999939571153855e-06, "loss": -0.0675, "step": 440, "step_time": 13.574412953981664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0837529338896275, "epoch": 0.00441, "frac_reward_zero_std": 0.0, "grad_norm": 0.07876934111118317, "kl": 0.3638165593147278, "learning_rate": 7.999939271631924e-06, "loss": -0.0618, "num_tokens": 10272895.0, "reward": 0.27090224623680115, "reward_std": 0.8251415491104126, "rewards/rollout_reward_func/mean": 0.27090224623680115, "rewards/rollout_reward_func/std": 0.8251414895057678, "sampling/importance_sampling_ratio/max": 1.3803131580352783, "sampling/importance_sampling_ratio/mean": 0.8154420852661133, "sampling/importance_sampling_ratio/min": 4.116978379897773e-05, "sampling/sampling_logp_difference/max": 1.575681447982788, "sampling/sampling_logp_difference/mean": 0.21357107162475586, "step": 441, "step_time": 22.181711821031058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.08418745175004, "epoch": 0.00442, "grad_norm": 0.08007293194532394, "kl": 0.36593049205839634, "learning_rate": 7.999938971369529e-06, "loss": -0.0619, "step": 442, "step_time": 11.797082222998142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0466924719512463, "epoch": 0.00443, "frac_reward_zero_std": 0.0, "grad_norm": 0.18582257628440857, "kl": 0.6493821330368519, "learning_rate": 7.999938670366667e-06, "loss": -0.0423, "num_tokens": 10326829.0, "reward": 0.17459148168563843, "reward_std": 0.618916392326355, "rewards/rollout_reward_func/mean": 0.17459148168563843, "rewards/rollout_reward_func/std": 0.6189164519309998, "sampling/importance_sampling_ratio/max": 1.443799614906311, "sampling/importance_sampling_ratio/mean": 0.8473423719406128, "sampling/importance_sampling_ratio/min": 1.624089782126248e-05, "sampling/sampling_logp_difference/max": 1.892994999885559, "sampling/sampling_logp_difference/mean": 0.24218276143074036, "step": 443, "step_time": 26.020916777051752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0458623804152012, "epoch": 0.00444, "grad_norm": 0.18714037537574768, "kl": 0.6430862043052912, "learning_rate": 7.999938368623343e-06, "loss": -0.0427, "step": 444, "step_time": 14.114819738984806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 4.708333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5890394151210785, "epoch": 0.00445, "frac_reward_zero_std": 0.25, "grad_norm": 0.08531899750232697, "kl": 0.4912784416228533, "learning_rate": 7.999938066139555e-06, "loss": -0.0558, "num_tokens": 10382073.0, "reward": 0.10221173614263535, "reward_std": 0.7688279151916504, "rewards/rollout_reward_func/mean": 0.10221173614263535, "rewards/rollout_reward_func/std": 0.7688279151916504, "sampling/importance_sampling_ratio/max": 1.645116925239563, "sampling/importance_sampling_ratio/mean": 0.7742736339569092, "sampling/importance_sampling_ratio/min": 6.575196209723799e-08, "sampling/sampling_logp_difference/max": 2.544339179992676, "sampling/sampling_logp_difference/mean": 0.3077351152896881, "step": 445, "step_time": 26.610449236963177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.590175874531269, "epoch": 0.00446, "grad_norm": 0.08146919310092926, "kl": 0.47211611829698086, "learning_rate": 7.9999377629153e-06, "loss": -0.0559, "step": 446, "step_time": 14.111866127001122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.310344696044922, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8793988283723593, "epoch": 0.00447, "frac_reward_zero_std": 0.25, "grad_norm": 0.10646211355924606, "kl": 0.3899300266057253, "learning_rate": 7.999937458950583e-06, "loss": -0.0456, "num_tokens": 10428446.0, "reward": 0.8470406532287598, "reward_std": 0.7630069851875305, "rewards/rollout_reward_func/mean": 0.8470406532287598, "rewards/rollout_reward_func/std": 0.7630070447921753, "sampling/importance_sampling_ratio/max": 1.4384384155273438, "sampling/importance_sampling_ratio/mean": 0.9562561511993408, "sampling/importance_sampling_ratio/min": 1.3362163372221403e-05, "sampling/sampling_logp_difference/max": 1.7991650104522705, "sampling/sampling_logp_difference/mean": 0.19184672832489014, "step": 447, "step_time": 22.431525175983552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8797408137470484, "epoch": 0.00448, "grad_norm": 0.09731656312942505, "kl": 0.3885291861370206, "learning_rate": 7.999937154245402e-06, "loss": -0.046, "step": 448, "step_time": 11.776013932045316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.392857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0751467738300562, "epoch": 0.00449, "frac_reward_zero_std": 0.0, "grad_norm": 0.07652750611305237, "kl": 0.46287139412015676, "learning_rate": 7.999936848799756e-06, "loss": -0.0793, "num_tokens": 10484265.0, "reward": 0.7176253795623779, "reward_std": 0.8316388726234436, "rewards/rollout_reward_func/mean": 0.7176253795623779, "rewards/rollout_reward_func/std": 0.8316388130187988, "sampling/importance_sampling_ratio/max": 1.576878309249878, "sampling/importance_sampling_ratio/mean": 0.8620915412902832, "sampling/importance_sampling_ratio/min": 8.626886847196147e-05, "sampling/sampling_logp_difference/max": 1.4657572507858276, "sampling/sampling_logp_difference/mean": 0.22579360008239746, "step": 449, "step_time": 26.19698761496693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0729845762252808, "epoch": 0.0045, "grad_norm": 0.076454758644104, "kl": 0.4605646822601557, "learning_rate": 7.999936542613647e-06, "loss": -0.0796, "step": 450, "step_time": 13.595407003012951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9855628591030836, "epoch": 0.00451, "frac_reward_zero_std": 0.0, "grad_norm": 0.07250168919563293, "kl": 0.5746791362762451, "learning_rate": 7.999936235687075e-06, "loss": -0.0977, "num_tokens": 10536142.0, "reward": 0.8903732299804688, "reward_std": 0.753035843372345, "rewards/rollout_reward_func/mean": 0.8903732299804688, "rewards/rollout_reward_func/std": 0.753035843372345, "sampling/importance_sampling_ratio/max": 1.9105591773986816, "sampling/importance_sampling_ratio/mean": 0.8446760773658752, "sampling/importance_sampling_ratio/min": 0.0004431135894265026, "sampling/sampling_logp_difference/max": 2.4694108963012695, "sampling/sampling_logp_difference/mean": 0.22520487010478973, "step": 451, "step_time": 27.096145650051767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.9825783316046, "epoch": 0.00452, "grad_norm": 0.05896155163645744, "kl": 0.6176212951540947, "learning_rate": 7.999935928020036e-06, "loss": -0.0978, "step": 452, "step_time": 15.796823458978906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9099730355665088, "epoch": 0.00453, "frac_reward_zero_std": 0.0, "grad_norm": 0.11652425676584244, "kl": 0.48894188553094864, "learning_rate": 7.999935619612536e-06, "loss": -0.069, "num_tokens": 10584947.0, "reward": 0.7086681723594666, "reward_std": 0.8101405501365662, "rewards/rollout_reward_func/mean": 0.7086681723594666, "rewards/rollout_reward_func/std": 0.8101404905319214, "sampling/importance_sampling_ratio/max": 1.3000679016113281, "sampling/importance_sampling_ratio/mean": 0.8647677302360535, "sampling/importance_sampling_ratio/min": 1.4982855418566032e-06, "sampling/sampling_logp_difference/max": 1.8548905849456787, "sampling/sampling_logp_difference/mean": 0.2125306874513626, "step": 453, "step_time": 22.907551970041823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9030332947149873, "epoch": 0.00454, "grad_norm": 0.10838805884122849, "kl": 0.4974936991930008, "learning_rate": 7.999935310464572e-06, "loss": -0.0696, "step": 454, "step_time": 13.232060546026332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 4.678571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1372667476534843, "epoch": 0.00455, "frac_reward_zero_std": 0.0, "grad_norm": 0.07876982539892197, "kl": 0.863012969493866, "learning_rate": 7.999935000576144e-06, "loss": -0.063, "num_tokens": 10643017.0, "reward": 0.6924740672111511, "reward_std": 0.7644205689430237, "rewards/rollout_reward_func/mean": 0.6924740672111511, "rewards/rollout_reward_func/std": 0.7644206285476685, "sampling/importance_sampling_ratio/max": 1.3528345823287964, "sampling/importance_sampling_ratio/mean": 0.8058620095252991, "sampling/importance_sampling_ratio/min": 0.0009413784719072282, "sampling/sampling_logp_difference/max": 2.2015206813812256, "sampling/sampling_logp_difference/mean": 0.26382967829704285, "step": 455, "step_time": 24.720982278027805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1374682672321796, "epoch": 0.00456, "grad_norm": 0.08480861037969589, "kl": 0.9459604360163212, "learning_rate": 7.999934689947254e-06, "loss": -0.0631, "step": 456, "step_time": 12.626596341026016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3735424391925335, "epoch": 0.00457, "frac_reward_zero_std": 0.0, "grad_norm": 0.13616310060024261, "kl": 1.2195007614791393, "learning_rate": 7.999934378577899e-06, "loss": -0.0564, "num_tokens": 10703786.0, "reward": 0.45222529768943787, "reward_std": 0.8714003562927246, "rewards/rollout_reward_func/mean": 0.45222529768943787, "rewards/rollout_reward_func/std": 0.8714004158973694, "sampling/importance_sampling_ratio/max": 1.561670184135437, "sampling/importance_sampling_ratio/mean": 0.7985560894012451, "sampling/importance_sampling_ratio/min": 6.716194178579826e-08, "sampling/sampling_logp_difference/max": 1.924910068511963, "sampling/sampling_logp_difference/mean": 0.27390938997268677, "step": 457, "step_time": 30.594134671031497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3797258473932743, "epoch": 0.00458, "grad_norm": 0.12352892756462097, "kl": 1.1614382322877645, "learning_rate": 7.999934066468082e-06, "loss": -0.0566, "step": 458, "step_time": 15.74204209400341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.65625, "completions/mean_terminated_length": 4.586206912994385, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0338002778589725, "epoch": 0.00459, "frac_reward_zero_std": 0.0, "grad_norm": 0.10277114063501358, "kl": 0.9848314560949802, "learning_rate": 7.9999337536178e-06, "loss": -0.0691, "num_tokens": 10758031.0, "reward": 0.6495634913444519, "reward_std": 0.7857602834701538, "rewards/rollout_reward_func/mean": 0.6495634913444519, "rewards/rollout_reward_func/std": 0.7857602834701538, "sampling/importance_sampling_ratio/max": 1.3315683603286743, "sampling/importance_sampling_ratio/mean": 0.8146376609802246, "sampling/importance_sampling_ratio/min": 1.3611239069177827e-07, "sampling/sampling_logp_difference/max": 2.1361520290374756, "sampling/sampling_logp_difference/mean": 0.2658076286315918, "step": 459, "step_time": 25.37204743194161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0461148619651794, "epoch": 0.0046, "grad_norm": 0.0961964800953865, "kl": 0.9245952889323235, "learning_rate": 7.999933440027056e-06, "loss": -0.0691, "step": 460, "step_time": 13.651452165009687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4008104223757982, "epoch": 0.00461, "frac_reward_zero_std": 0.0, "grad_norm": 0.06183402240276337, "kl": 0.41466501727700233, "learning_rate": 7.999933125695849e-06, "loss": -0.0721, "num_tokens": 10816746.0, "reward": 0.6473506689071655, "reward_std": 0.8328753113746643, "rewards/rollout_reward_func/mean": 0.6473506689071655, "rewards/rollout_reward_func/std": 0.8328751921653748, "sampling/importance_sampling_ratio/max": 2.4470326900482178, "sampling/importance_sampling_ratio/mean": 0.8484361171722412, "sampling/importance_sampling_ratio/min": 1.634335831113276e-06, "sampling/sampling_logp_difference/max": 2.397919178009033, "sampling/sampling_logp_difference/mean": 0.333204060792923, "step": 461, "step_time": 23.60391228404478 }, { "clip_ratio/high_max": 0.016666667070239782, "clip_ratio/high_mean": 0.008333333535119891, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333535119891, "entropy": 1.4060474280267954, "epoch": 0.00462, "grad_norm": 0.05580917000770569, "kl": 0.39964898489415646, "learning_rate": 7.99993281062418e-06, "loss": -0.0723, "step": 462, "step_time": 12.025075980025576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 4.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5638075470924377, "epoch": 0.00463, "frac_reward_zero_std": 0.0, "grad_norm": 0.05276424065232277, "kl": 0.49182462785393, "learning_rate": 7.999932494812047e-06, "loss": -0.0781, "num_tokens": 10868443.0, "reward": 0.396838903427124, "reward_std": 0.8137326836585999, "rewards/rollout_reward_func/mean": 0.396838903427124, "rewards/rollout_reward_func/std": 0.8137326836585999, "sampling/importance_sampling_ratio/max": 1.380657434463501, "sampling/importance_sampling_ratio/mean": 0.7321645021438599, "sampling/importance_sampling_ratio/min": 6.9114712459850125e-06, "sampling/sampling_logp_difference/max": 2.1312780380249023, "sampling/sampling_logp_difference/mean": 0.3377552628517151, "step": 463, "step_time": 24.27645565205603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5695242658257484, "epoch": 0.00464, "grad_norm": 0.052316054701805115, "kl": 0.46498769894242287, "learning_rate": 7.999932178259451e-06, "loss": -0.078, "step": 464, "step_time": 12.75783608702477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.71875, "completions/mean_terminated_length": 4.904761791229248, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.526876538991928, "epoch": 0.00465, "frac_reward_zero_std": 0.0, "grad_norm": 0.052536413073539734, "kl": 0.48471412481740117, "learning_rate": 7.999931860966393e-06, "loss": -0.0911, "num_tokens": 10924730.0, "reward": 0.23743462562561035, "reward_std": 0.886920154094696, "rewards/rollout_reward_func/mean": 0.23743462562561035, "rewards/rollout_reward_func/std": 0.8869200944900513, "sampling/importance_sampling_ratio/max": 1.4251806735992432, "sampling/importance_sampling_ratio/mean": 0.641629695892334, "sampling/importance_sampling_ratio/min": 2.5964734806649403e-09, "sampling/sampling_logp_difference/max": 2.070052146911621, "sampling/sampling_logp_difference/mean": 0.4645833373069763, "step": 465, "step_time": 30.51392663194565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.530018672347069, "epoch": 0.00466, "grad_norm": 0.047440044581890106, "kl": 0.43191104056313634, "learning_rate": 7.999931542932872e-06, "loss": -0.0913, "step": 466, "step_time": 15.521323367982404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 5.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2971075735986233, "epoch": 0.00467, "frac_reward_zero_std": 0.0, "grad_norm": 0.3189053237438202, "kl": 0.29294630512595177, "learning_rate": 7.999931224158886e-06, "loss": -0.0431, "num_tokens": 10982335.0, "reward": 0.5054774880409241, "reward_std": 0.8056851029396057, "rewards/rollout_reward_func/mean": 0.5054774880409241, "rewards/rollout_reward_func/std": 0.8056851625442505, "sampling/importance_sampling_ratio/max": 1.9120877981185913, "sampling/importance_sampling_ratio/mean": 0.8148833513259888, "sampling/importance_sampling_ratio/min": 1.9064356138187577e-07, "sampling/sampling_logp_difference/max": 2.099660873413086, "sampling/sampling_logp_difference/mean": 0.24574926495552063, "step": 467, "step_time": 32.178547023941064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.300191966816783, "epoch": 0.00468, "grad_norm": 0.34490975737571716, "kl": 0.29593408294022083, "learning_rate": 7.999930904644442e-06, "loss": -0.0445, "step": 468, "step_time": 16.96833751205122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.454545497894287, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.939660757780075, "epoch": 0.00469, "frac_reward_zero_std": 0.0, "grad_norm": 0.04702865704894066, "kl": 0.2340615764260292, "learning_rate": 7.999930584389531e-06, "loss": -0.0809, "num_tokens": 11032185.0, "reward": 0.6182172298431396, "reward_std": 0.9419757127761841, "rewards/rollout_reward_func/mean": 0.6182172298431396, "rewards/rollout_reward_func/std": 0.9419757127761841, "sampling/importance_sampling_ratio/max": 1.2224148511886597, "sampling/importance_sampling_ratio/mean": 0.6371061205863953, "sampling/importance_sampling_ratio/min": 1.6921627548072138e-06, "sampling/sampling_logp_difference/max": 2.0292139053344727, "sampling/sampling_logp_difference/mean": 0.36639827489852905, "step": 469, "step_time": 24.925958732055733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.940890982747078, "epoch": 0.0047, "grad_norm": 0.04890264943242073, "kl": 0.23242785315960646, "learning_rate": 7.999930263394161e-06, "loss": -0.0808, "step": 470, "step_time": 12.61580475300434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.40625, "completions/mean_terminated_length": 4.954545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.068998582661152, "epoch": 0.00471, "frac_reward_zero_std": 0.0, "grad_norm": 0.05342728644609451, "kl": 0.7258205483667552, "learning_rate": 7.999929941658327e-06, "loss": -0.0755, "num_tokens": 11092721.0, "reward": 0.16865353286266327, "reward_std": 0.7434404492378235, "rewards/rollout_reward_func/mean": 0.16865353286266327, "rewards/rollout_reward_func/std": 0.7434404492378235, "sampling/importance_sampling_ratio/max": 1.4035414457321167, "sampling/importance_sampling_ratio/mean": 0.6635403037071228, "sampling/importance_sampling_ratio/min": 1.1342401684544257e-08, "sampling/sampling_logp_difference/max": 2.023164749145508, "sampling/sampling_logp_difference/mean": 0.39809465408325195, "step": 471, "step_time": 30.101128083013464 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 2.068058021366596, "epoch": 0.00472, "grad_norm": 0.05161934718489647, "kl": 0.634024428203702, "learning_rate": 7.999929619182034e-06, "loss": -0.0757, "step": 472, "step_time": 15.971280163998017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.076923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.080074302852154, "epoch": 0.00473, "frac_reward_zero_std": 0.0, "grad_norm": 0.11290726810693741, "kl": 0.7003677003085613, "learning_rate": 7.999929295965276e-06, "loss": -0.0745, "num_tokens": 11145451.0, "reward": 0.48189395666122437, "reward_std": 0.8950052261352539, "rewards/rollout_reward_func/mean": 0.48189395666122437, "rewards/rollout_reward_func/std": 0.8950051665306091, "sampling/importance_sampling_ratio/max": 1.7308119535446167, "sampling/importance_sampling_ratio/mean": 0.6761693954467773, "sampling/importance_sampling_ratio/min": 1.1303080782454344e-07, "sampling/sampling_logp_difference/max": 2.0327281951904297, "sampling/sampling_logp_difference/mean": 0.4285852909088135, "step": 473, "step_time": 25.421276895009214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.077254109084606, "epoch": 0.00474, "grad_norm": 0.11588740348815918, "kl": 0.6723746135830879, "learning_rate": 7.999928972008055e-06, "loss": -0.0749, "step": 474, "step_time": 13.440731778013287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.185185432434082, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3296767547726631, "epoch": 0.00475, "frac_reward_zero_std": 0.0, "grad_norm": 0.07043924927711487, "kl": 0.44902315316721797, "learning_rate": 7.999928647310374e-06, "loss": -0.0736, "num_tokens": 11199005.0, "reward": 0.8208462595939636, "reward_std": 0.7975001931190491, "rewards/rollout_reward_func/mean": 0.8208462595939636, "rewards/rollout_reward_func/std": 0.7975001931190491, "sampling/importance_sampling_ratio/max": 1.3193655014038086, "sampling/importance_sampling_ratio/mean": 0.8506030440330505, "sampling/importance_sampling_ratio/min": 2.7310983341521933e-07, "sampling/sampling_logp_difference/max": 2.422891855239868, "sampling/sampling_logp_difference/mean": 0.2992197275161743, "step": 475, "step_time": 23.07718611799646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3340415805578232, "epoch": 0.00476, "grad_norm": 0.07597330957651138, "kl": 0.4385577477514744, "learning_rate": 7.99992832187223e-06, "loss": -0.0739, "step": 476, "step_time": 12.385645013971953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6482316181063652, "epoch": 0.00477, "frac_reward_zero_std": 0.0, "grad_norm": 0.08493640273809433, "kl": 0.29578012600541115, "learning_rate": 7.999927995693626e-06, "loss": -0.0901, "num_tokens": 11261611.0, "reward": 0.42266860604286194, "reward_std": 0.8267195224761963, "rewards/rollout_reward_func/mean": 0.42266860604286194, "rewards/rollout_reward_func/std": 0.8267194032669067, "sampling/importance_sampling_ratio/max": 2.0086185932159424, "sampling/importance_sampling_ratio/mean": 0.7898989915847778, "sampling/importance_sampling_ratio/min": 0.00013482969370670617, "sampling/sampling_logp_difference/max": 1.7942328453063965, "sampling/sampling_logp_difference/mean": 0.27514591813087463, "step": 477, "step_time": 29.076624413981335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6474096104502678, "epoch": 0.00478, "grad_norm": 0.08395721763372421, "kl": 0.297436760738492, "learning_rate": 7.999927668774559e-06, "loss": -0.0904, "step": 478, "step_time": 14.936525524011813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.192307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.311167972162366, "epoch": 0.00479, "frac_reward_zero_std": 0.0, "grad_norm": 0.07084281742572784, "kl": 0.35810175351798534, "learning_rate": 7.99992734111503e-06, "loss": -0.1059, "num_tokens": 11305261.0, "reward": 0.9702949523925781, "reward_std": 0.8525658249855042, "rewards/rollout_reward_func/mean": 0.9702949523925781, "rewards/rollout_reward_func/std": 0.8525658249855042, "sampling/importance_sampling_ratio/max": 1.253726601600647, "sampling/importance_sampling_ratio/mean": 0.8165009617805481, "sampling/importance_sampling_ratio/min": 4.850735990657995e-07, "sampling/sampling_logp_difference/max": 2.104079246520996, "sampling/sampling_logp_difference/mean": 0.2811121940612793, "step": 479, "step_time": 21.165911058022175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.303929727524519, "epoch": 0.0048, "grad_norm": 0.06678757071495056, "kl": 0.3699024822562933, "learning_rate": 7.99992701271504e-06, "loss": -0.1061, "step": 480, "step_time": 11.040780980983982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.217391490936279, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8718453794717789, "epoch": 0.00481, "frac_reward_zero_std": 0.0, "grad_norm": 0.14549529552459717, "kl": 0.5210047122091055, "learning_rate": 7.99992668357459e-06, "loss": -0.121, "num_tokens": 11357609.0, "reward": 0.48713254928588867, "reward_std": 0.9487867951393127, "rewards/rollout_reward_func/mean": 0.48713254928588867, "rewards/rollout_reward_func/std": 0.9487867951393127, "sampling/importance_sampling_ratio/max": 2.3518471717834473, "sampling/importance_sampling_ratio/mean": 0.7880797386169434, "sampling/importance_sampling_ratio/min": 4.615037687472068e-05, "sampling/sampling_logp_difference/max": 1.6025046110153198, "sampling/sampling_logp_difference/mean": 0.2936639189720154, "step": 481, "step_time": 27.491737539035967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8580367267131805, "epoch": 0.00482, "grad_norm": 0.1297236680984497, "kl": 0.5755948992446065, "learning_rate": 7.999926353693675e-06, "loss": -0.1215, "step": 482, "step_time": 14.451977048011031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 5.137930870056152, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2283544018864632, "epoch": 0.00483, "frac_reward_zero_std": 0.0, "grad_norm": 0.12779687345027924, "kl": 0.2867805603891611, "learning_rate": 7.999926023072302e-06, "loss": -0.0323, "num_tokens": 11407950.0, "reward": 0.6533698439598083, "reward_std": 0.7851335406303406, "rewards/rollout_reward_func/mean": 0.6533698439598083, "rewards/rollout_reward_func/std": 0.7851335406303406, "sampling/importance_sampling_ratio/max": 1.5123517513275146, "sampling/importance_sampling_ratio/mean": 0.8961306810379028, "sampling/importance_sampling_ratio/min": 1.4671210919914301e-05, "sampling/sampling_logp_difference/max": 2.0707974433898926, "sampling/sampling_logp_difference/mean": 0.24472665786743164, "step": 483, "step_time": 23.71500876097707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2235076054930687, "epoch": 0.00484, "grad_norm": 0.12783628702163696, "kl": 0.29673800989985466, "learning_rate": 7.999925691710467e-06, "loss": -0.0325, "step": 484, "step_time": 12.453829637990566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.518518447875977, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3443979695439339, "epoch": 0.00485, "frac_reward_zero_std": 0.0, "grad_norm": 0.05683717131614685, "kl": 0.4403806999325752, "learning_rate": 7.99992535960817e-06, "loss": -0.0843, "num_tokens": 11459426.0, "reward": 0.6813647747039795, "reward_std": 0.8458127975463867, "rewards/rollout_reward_func/mean": 0.6813647747039795, "rewards/rollout_reward_func/std": 0.8458127975463867, "sampling/importance_sampling_ratio/max": 2.3095085620880127, "sampling/importance_sampling_ratio/mean": 0.8355644345283508, "sampling/importance_sampling_ratio/min": 1.0976687917718664e-05, "sampling/sampling_logp_difference/max": 1.7391624450683594, "sampling/sampling_logp_difference/mean": 0.2757020592689514, "step": 485, "step_time": 25.10814134692191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.34378220140934, "epoch": 0.00486, "grad_norm": 0.05842785909771919, "kl": 0.4436835292726755, "learning_rate": 7.999925026765412e-06, "loss": -0.0842, "step": 486, "step_time": 13.669637045066338 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 5.034482955932617, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1653759628534317, "epoch": 0.00487, "frac_reward_zero_std": 0.0, "grad_norm": 0.09875653684139252, "kl": 0.7571102548390627, "learning_rate": 7.999924693182194e-06, "loss": -0.0908, "num_tokens": 11512851.0, "reward": 0.8903168439865112, "reward_std": 0.7899300456047058, "rewards/rollout_reward_func/mean": 0.8903168439865112, "rewards/rollout_reward_func/std": 0.7899300456047058, "sampling/importance_sampling_ratio/max": 1.3218833208084106, "sampling/importance_sampling_ratio/mean": 0.8547342419624329, "sampling/importance_sampling_ratio/min": 3.842032583634136e-06, "sampling/sampling_logp_difference/max": 2.1148722171783447, "sampling/sampling_logp_difference/mean": 0.24606333673000336, "step": 487, "step_time": 25.0600538700819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1638554073870182, "epoch": 0.00488, "grad_norm": 0.10024292021989822, "kl": 0.7677403837442398, "learning_rate": 7.999924358858514e-06, "loss": -0.0911, "step": 488, "step_time": 12.531735428987304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.392857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.063460006378591, "epoch": 0.00489, "frac_reward_zero_std": 0.0, "grad_norm": 0.093206487596035, "kl": 1.6352518536150455, "learning_rate": 7.999924023794374e-06, "loss": -0.0701, "num_tokens": 11560862.0, "reward": 0.9515632390975952, "reward_std": 0.8922812938690186, "rewards/rollout_reward_func/mean": 0.9515632390975952, "rewards/rollout_reward_func/std": 0.8922812938690186, "sampling/importance_sampling_ratio/max": 1.4983619451522827, "sampling/importance_sampling_ratio/mean": 0.8903762102127075, "sampling/importance_sampling_ratio/min": 0.00017759727779775858, "sampling/sampling_logp_difference/max": 1.7576837539672852, "sampling/sampling_logp_difference/mean": 0.2114764302968979, "step": 489, "step_time": 28.200102223985596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0655948631465435, "epoch": 0.0049, "grad_norm": 0.08360596746206284, "kl": 1.5129771940410137, "learning_rate": 7.999923687989774e-06, "loss": -0.0704, "step": 490, "step_time": 15.195641595986672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2555901389569044, "epoch": 0.00491, "frac_reward_zero_std": 0.0, "grad_norm": 0.04914679378271103, "kl": 1.0411898791790009, "learning_rate": 7.999923351444713e-06, "loss": -0.0997, "num_tokens": 11605948.0, "reward": 1.0549850463867188, "reward_std": 0.7377558350563049, "rewards/rollout_reward_func/mean": 1.0549850463867188, "rewards/rollout_reward_func/std": 0.7377558350563049, "sampling/importance_sampling_ratio/max": 1.4070483446121216, "sampling/importance_sampling_ratio/mean": 0.8655003309249878, "sampling/importance_sampling_ratio/min": 4.895463916909648e-06, "sampling/sampling_logp_difference/max": 2.4657840728759766, "sampling/sampling_logp_difference/mean": 0.3030361533164978, "step": 491, "step_time": 20.216751045023557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2572268098592758, "epoch": 0.00492, "grad_norm": 0.04707992821931839, "kl": 0.9903748482465744, "learning_rate": 7.99992301415919e-06, "loss": -0.0998, "step": 492, "step_time": 11.597842251998372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7816320806741714, "epoch": 0.00493, "frac_reward_zero_std": 0.0, "grad_norm": 0.14163434505462646, "kl": 0.3549789562821388, "learning_rate": 7.999922676133208e-06, "loss": -0.076, "num_tokens": 11659277.0, "reward": 0.3619038760662079, "reward_std": 0.8807392120361328, "rewards/rollout_reward_func/mean": 0.3619038760662079, "rewards/rollout_reward_func/std": 0.880739152431488, "sampling/importance_sampling_ratio/max": 1.5560569763183594, "sampling/importance_sampling_ratio/mean": 0.7546484470367432, "sampling/importance_sampling_ratio/min": 1.1279455065960065e-05, "sampling/sampling_logp_difference/max": 2.423860549926758, "sampling/sampling_logp_difference/mean": 0.346530020236969, "step": 493, "step_time": 23.529007861972786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.781023744493723, "epoch": 0.00494, "grad_norm": 0.12203740328550339, "kl": 0.3782849069684744, "learning_rate": 7.999922337366765e-06, "loss": -0.0764, "step": 494, "step_time": 12.678790523001226 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.90625, "completions/mean_terminated_length": 5.208333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.869752362370491, "epoch": 0.00495, "frac_reward_zero_std": 0.0, "grad_norm": 0.15871509909629822, "kl": 0.40076558850705624, "learning_rate": 7.99992199785986e-06, "loss": -0.0972, "num_tokens": 11723974.0, "reward": 0.1816389560699463, "reward_std": 0.7926532030105591, "rewards/rollout_reward_func/mean": 0.1816389560699463, "rewards/rollout_reward_func/std": 0.7926532030105591, "sampling/importance_sampling_ratio/max": 1.655285358428955, "sampling/importance_sampling_ratio/mean": 0.6995508670806885, "sampling/importance_sampling_ratio/min": 2.1297337298165075e-05, "sampling/sampling_logp_difference/max": 1.7356127500534058, "sampling/sampling_logp_difference/mean": 0.3247535824775696, "step": 495, "step_time": 28.100377726979787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.86977881193161, "epoch": 0.00496, "grad_norm": 0.15292279422283173, "kl": 0.4015046311542392, "learning_rate": 7.999921657612498e-06, "loss": -0.0971, "step": 496, "step_time": 14.25287095902604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 4.703703880310059, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5802722927182913, "epoch": 0.00497, "frac_reward_zero_std": 0.0, "grad_norm": 0.11027173697948456, "kl": 0.6935782618820667, "learning_rate": 7.999921316624673e-06, "loss": -0.0798, "num_tokens": 11778074.0, "reward": 0.6680933833122253, "reward_std": 0.8768616914749146, "rewards/rollout_reward_func/mean": 0.6680933833122253, "rewards/rollout_reward_func/std": 0.8768617510795593, "sampling/importance_sampling_ratio/max": 1.6111619472503662, "sampling/importance_sampling_ratio/mean": 0.8161230683326721, "sampling/importance_sampling_ratio/min": 1.336664240625396e-06, "sampling/sampling_logp_difference/max": 2.695322036743164, "sampling/sampling_logp_difference/mean": 0.415325403213501, "step": 497, "step_time": 24.76455704800901 }, { "clip_ratio/high_max": 0.010282258037477732, "clip_ratio/high_mean": 0.005141129018738866, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005141129018738866, "entropy": 1.5777900321409106, "epoch": 0.00498, "grad_norm": 0.09713035821914673, "kl": 0.6771973334252834, "learning_rate": 7.99992097489639e-06, "loss": -0.0802, "step": 498, "step_time": 12.498095312999794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6364540047943592, "epoch": 0.00499, "frac_reward_zero_std": 0.0, "grad_norm": 0.08753769844770432, "kl": 0.35495161823928356, "learning_rate": 7.999920632427647e-06, "loss": -0.088, "num_tokens": 11830207.0, "reward": 0.4619121849536896, "reward_std": 0.8955438137054443, "rewards/rollout_reward_func/mean": 0.4619121849536896, "rewards/rollout_reward_func/std": 0.8955437541007996, "sampling/importance_sampling_ratio/max": 1.6088495254516602, "sampling/importance_sampling_ratio/mean": 0.888256311416626, "sampling/importance_sampling_ratio/min": 1.1377019326630489e-09, "sampling/sampling_logp_difference/max": 2.467972755432129, "sampling/sampling_logp_difference/mean": 0.3614148497581482, "step": 499, "step_time": 28.796263925993117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6374413892626762, "epoch": 0.005, "grad_norm": 0.08706203103065491, "kl": 0.357520604506135, "learning_rate": 7.999920289218444e-06, "loss": -0.0879, "step": 500, "step_time": 15.667656258010538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 4.730769634246826, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5965379336848855, "epoch": 0.00501, "frac_reward_zero_std": 0.0, "grad_norm": 0.048755280673503876, "kl": 0.5415244102478027, "learning_rate": 7.999919945268779e-06, "loss": -0.1171, "num_tokens": 11876472.0, "reward": 0.8024928569793701, "reward_std": 0.7241319417953491, "rewards/rollout_reward_func/mean": 0.8024928569793701, "rewards/rollout_reward_func/std": 0.7241318821907043, "sampling/importance_sampling_ratio/max": 1.3529950380325317, "sampling/importance_sampling_ratio/mean": 0.811984658241272, "sampling/importance_sampling_ratio/min": 1.37580036252416e-09, "sampling/sampling_logp_difference/max": 2.343559980392456, "sampling/sampling_logp_difference/mean": 0.43410933017730713, "step": 501, "step_time": 25.27420777900261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.593473106622696, "epoch": 0.00502, "grad_norm": 0.046851277351379395, "kl": 0.5331760831177235, "learning_rate": 7.999919600578657e-06, "loss": -0.1172, "step": 502, "step_time": 13.141047605051426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 5.304347991943359, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0013074837625027, "epoch": 0.00503, "frac_reward_zero_std": 0.0, "grad_norm": 0.09583243727684021, "kl": 0.3380113928578794, "learning_rate": 7.999919255148074e-06, "loss": -0.1089, "num_tokens": 11944567.0, "reward": 0.4117680788040161, "reward_std": 0.8683053255081177, "rewards/rollout_reward_func/mean": 0.4117680788040161, "rewards/rollout_reward_func/std": 0.8683053851127625, "sampling/importance_sampling_ratio/max": 1.7371031045913696, "sampling/importance_sampling_ratio/mean": 0.6540837287902832, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.819533109664917, "sampling/sampling_logp_difference/mean": 0.3476356267929077, "step": 503, "step_time": 31.1811569169804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9982973728328943, "epoch": 0.00504, "grad_norm": 0.09736309200525284, "kl": 0.3341761357150972, "learning_rate": 7.999918908977031e-06, "loss": -0.1091, "step": 504, "step_time": 15.984268450032687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9891350269317627, "epoch": 0.00505, "frac_reward_zero_std": 0.0, "grad_norm": 0.06150316447019577, "kl": 0.41652421560138464, "learning_rate": 7.999918562065531e-06, "loss": -0.0872, "num_tokens": 12001275.0, "reward": 0.43384939432144165, "reward_std": 0.8373721837997437, "rewards/rollout_reward_func/mean": 0.43384939432144165, "rewards/rollout_reward_func/std": 0.8373721837997437, "sampling/importance_sampling_ratio/max": 1.7669775485992432, "sampling/importance_sampling_ratio/mean": 0.7122617959976196, "sampling/importance_sampling_ratio/min": 1.3252735016067163e-06, "sampling/sampling_logp_difference/max": 1.8902157545089722, "sampling/sampling_logp_difference/mean": 0.3797786235809326, "step": 505, "step_time": 25.098746016970836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9894132167100906, "epoch": 0.00506, "grad_norm": 0.05630103498697281, "kl": 0.4196521509438753, "learning_rate": 7.999918214413569e-06, "loss": -0.0876, "step": 506, "step_time": 12.550135936966399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9622571412473917, "epoch": 0.00507, "frac_reward_zero_std": 0.0, "grad_norm": 0.1442531943321228, "kl": 0.4858885984867811, "learning_rate": 7.999917866021148e-06, "loss": -0.08, "num_tokens": 12061192.0, "reward": 0.662356972694397, "reward_std": 0.8135550618171692, "rewards/rollout_reward_func/mean": 0.662356972694397, "rewards/rollout_reward_func/std": 0.8135550618171692, "sampling/importance_sampling_ratio/max": 1.670366644859314, "sampling/importance_sampling_ratio/mean": 0.8502553105354309, "sampling/importance_sampling_ratio/min": 0.0002864185662474483, "sampling/sampling_logp_difference/max": 3.2509078979492188, "sampling/sampling_logp_difference/mean": 0.23525381088256836, "step": 507, "step_time": 27.080904845963232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.9614807590842247, "epoch": 0.00508, "grad_norm": 0.11061251163482666, "kl": 0.4954367559403181, "learning_rate": 7.999917516888269e-06, "loss": -0.0809, "step": 508, "step_time": 13.963921949005453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2599854990839958, "epoch": 0.00509, "frac_reward_zero_std": 0.0, "grad_norm": 0.07914985716342926, "kl": 0.5808066856116056, "learning_rate": 7.99991716701493e-06, "loss": -0.0622, "num_tokens": 12112834.0, "reward": 0.48642951250076294, "reward_std": 0.955630898475647, "rewards/rollout_reward_func/mean": 0.48642951250076294, "rewards/rollout_reward_func/std": 0.9556308388710022, "sampling/importance_sampling_ratio/max": 1.3985434770584106, "sampling/importance_sampling_ratio/mean": 0.8343710899353027, "sampling/importance_sampling_ratio/min": 0.00010222351556876674, "sampling/sampling_logp_difference/max": 1.7398903369903564, "sampling/sampling_logp_difference/mean": 0.23502269387245178, "step": 509, "step_time": 25.04268270294415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2666093669831753, "epoch": 0.0051, "grad_norm": 0.07837681472301483, "kl": 0.5625295676290989, "learning_rate": 7.999916816401132e-06, "loss": -0.0625, "step": 510, "step_time": 12.659923191007692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 5.199999809265137, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4701437894254923, "epoch": 0.00511, "frac_reward_zero_std": 0.0, "grad_norm": 0.10110102593898773, "kl": 0.39355877321213484, "learning_rate": 7.999916465046877e-06, "loss": -0.1017, "num_tokens": 12160450.0, "reward": 0.6184738874435425, "reward_std": 0.9935376048088074, "rewards/rollout_reward_func/mean": 0.6184738874435425, "rewards/rollout_reward_func/std": 0.9935376048088074, "sampling/importance_sampling_ratio/max": 1.3633346557617188, "sampling/importance_sampling_ratio/mean": 0.713800311088562, "sampling/importance_sampling_ratio/min": 1.1895557463503792e-06, "sampling/sampling_logp_difference/max": 1.7737793922424316, "sampling/sampling_logp_difference/mean": 0.31079983711242676, "step": 511, "step_time": 20.886436811939348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 1.4748801533132792, "epoch": 0.00512, "grad_norm": 0.1115521565079689, "kl": 0.39872945100069046, "learning_rate": 7.99991611295216e-06, "loss": -0.1013, "step": 512, "step_time": 11.295200268970802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 5.148148059844971, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2768788877874613, "epoch": 0.00513, "frac_reward_zero_std": 0.25, "grad_norm": 0.1417359858751297, "kl": 0.8412035247310996, "learning_rate": 7.999915760116986e-06, "loss": -0.0636, "num_tokens": 12204023.0, "reward": 0.562293529510498, "reward_std": 1.0126851797103882, "rewards/rollout_reward_func/mean": 0.562293529510498, "rewards/rollout_reward_func/std": 1.0126851797103882, "sampling/importance_sampling_ratio/max": 1.2086769342422485, "sampling/importance_sampling_ratio/mean": 0.7469204664230347, "sampling/importance_sampling_ratio/min": 3.607350663514808e-05, "sampling/sampling_logp_difference/max": 1.7910923957824707, "sampling/sampling_logp_difference/mean": 0.2681092619895935, "step": 513, "step_time": 23.801520036970032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2736190557479858, "epoch": 0.00514, "grad_norm": 0.13422919809818268, "kl": 0.803531626239419, "learning_rate": 7.999915406541353e-06, "loss": -0.0643, "step": 514, "step_time": 12.471562442049617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.09375, "completions/mean_terminated_length": 6.052631855010986, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9921011310070753, "epoch": 0.00515, "frac_reward_zero_std": 0.0, "grad_norm": 0.1082802340388298, "kl": 0.2602963495301083, "learning_rate": 7.999915052225262e-06, "loss": -0.1108, "num_tokens": 12263534.0, "reward": 0.3467492461204529, "reward_std": 1.0019515752792358, "rewards/rollout_reward_func/mean": 0.3467492461204529, "rewards/rollout_reward_func/std": 1.0019515752792358, "sampling/importance_sampling_ratio/max": 1.3722543716430664, "sampling/importance_sampling_ratio/mean": 0.5337336659431458, "sampling/importance_sampling_ratio/min": 1.1367687875463162e-06, "sampling/sampling_logp_difference/max": 2.018385410308838, "sampling/sampling_logp_difference/mean": 0.36868858337402344, "step": 515, "step_time": 28.993856632965617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9926512194797397, "epoch": 0.00516, "grad_norm": 0.10506637394428253, "kl": 0.2541355218272656, "learning_rate": 7.999914697168712e-06, "loss": -0.111, "step": 516, "step_time": 13.932876264036167 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.546830277889967, "epoch": 0.00517, "frac_reward_zero_std": 0.25, "grad_norm": 0.07775261998176575, "kl": 0.3075137631967664, "learning_rate": 7.999914341371702e-06, "loss": -0.0699, "num_tokens": 12318394.0, "reward": 0.1641930192708969, "reward_std": 0.7705875039100647, "rewards/rollout_reward_func/mean": 0.1641930192708969, "rewards/rollout_reward_func/std": 0.7705875039100647, "sampling/importance_sampling_ratio/max": 1.6078953742980957, "sampling/importance_sampling_ratio/mean": 0.7968416213989258, "sampling/importance_sampling_ratio/min": 9.278355719288811e-07, "sampling/sampling_logp_difference/max": 1.9927393198013306, "sampling/sampling_logp_difference/mean": 0.31352701783180237, "step": 517, "step_time": 26.717655587999616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5527107305824757, "epoch": 0.00518, "grad_norm": 0.07469692081212997, "kl": 0.2845326126553118, "learning_rate": 7.999913984834237e-06, "loss": -0.0701, "step": 518, "step_time": 12.42239460896235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 4.5652174949646, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6648273295722902, "epoch": 0.00519, "frac_reward_zero_std": 0.0, "grad_norm": 0.10197988897562027, "kl": 0.25719961151480675, "learning_rate": 7.99991362755631e-06, "loss": -0.1098, "num_tokens": 12375811.0, "reward": 0.45007336139678955, "reward_std": 0.8938204646110535, "rewards/rollout_reward_func/mean": 0.45007336139678955, "rewards/rollout_reward_func/std": 0.8938204050064087, "sampling/importance_sampling_ratio/max": 1.291629672050476, "sampling/importance_sampling_ratio/mean": 0.7118154764175415, "sampling/importance_sampling_ratio/min": 1.3860801573173376e-06, "sampling/sampling_logp_difference/max": 2.109290599822998, "sampling/sampling_logp_difference/mean": 0.3188841938972473, "step": 519, "step_time": 27.611994497012347 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.6646395903080702, "epoch": 0.0052, "grad_norm": 0.08800600469112396, "kl": 0.2576419413089752, "learning_rate": 7.999913269537927e-06, "loss": -0.11, "step": 520, "step_time": 13.786195025983034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 5.296296119689941, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.497589722275734, "epoch": 0.00521, "frac_reward_zero_std": 0.0, "grad_norm": 0.21764208376407623, "kl": 0.19067962281405926, "learning_rate": 7.999912910779086e-06, "loss": -0.0946, "num_tokens": 12433949.0, "reward": 0.3082420825958252, "reward_std": 0.8906453251838684, "rewards/rollout_reward_func/mean": 0.3082420825958252, "rewards/rollout_reward_func/std": 0.8906453251838684, "sampling/importance_sampling_ratio/max": 1.598819613456726, "sampling/importance_sampling_ratio/mean": 0.7798852324485779, "sampling/importance_sampling_ratio/min": 0.00011206775525351986, "sampling/sampling_logp_difference/max": 1.8308978080749512, "sampling/sampling_logp_difference/mean": 0.2849996089935303, "step": 521, "step_time": 28.324599284969736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.5065000467002392, "epoch": 0.00522, "grad_norm": 0.13669170439243317, "kl": 0.19227438047528267, "learning_rate": 7.999912551279787e-06, "loss": -0.0952, "step": 522, "step_time": 14.17201004899107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.78125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3946482855826616, "epoch": 0.00523, "frac_reward_zero_std": 0.0, "grad_norm": 0.039949432015419006, "kl": 0.2362902071326971, "learning_rate": 7.99991219104003e-06, "loss": -0.1064, "num_tokens": 12494644.0, "reward": 0.392257422208786, "reward_std": 0.894477367401123, "rewards/rollout_reward_func/mean": 0.392257422208786, "rewards/rollout_reward_func/std": 0.8944773077964783, "sampling/importance_sampling_ratio/max": 1.399983286857605, "sampling/importance_sampling_ratio/mean": 0.6034070253372192, "sampling/importance_sampling_ratio/min": 9.56216922531894e-07, "sampling/sampling_logp_difference/max": 1.9714736938476562, "sampling/sampling_logp_difference/mean": 0.4174121022224426, "step": 523, "step_time": 31.43810394598404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.395830391906202, "epoch": 0.00524, "grad_norm": 0.038580458611249924, "kl": 0.23496997728943825, "learning_rate": 7.999911830059816e-06, "loss": -0.1065, "step": 524, "step_time": 15.841362935025245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 5.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3432137947529554, "epoch": 0.00525, "frac_reward_zero_std": 0.0, "grad_norm": 0.05539907515048981, "kl": 0.35462547466158867, "learning_rate": 7.999911468339143e-06, "loss": -0.1047, "num_tokens": 12543792.0, "reward": 0.5783983469009399, "reward_std": 0.9599561095237732, "rewards/rollout_reward_func/mean": 0.5783983469009399, "rewards/rollout_reward_func/std": 0.9599561095237732, "sampling/importance_sampling_ratio/max": 1.3430774211883545, "sampling/importance_sampling_ratio/mean": 0.7464827299118042, "sampling/importance_sampling_ratio/min": 8.41955843497999e-05, "sampling/sampling_logp_difference/max": 1.906831979751587, "sampling/sampling_logp_difference/mean": 0.2432495653629303, "step": 525, "step_time": 25.519464164070087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3352582110092044, "epoch": 0.00526, "grad_norm": 0.05187897011637688, "kl": 0.3476182296872139, "learning_rate": 7.999911105878013e-06, "loss": -0.1048, "step": 526, "step_time": 13.797641293989727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 5.066667079925537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0352946668863297, "epoch": 0.00527, "frac_reward_zero_std": 0.25, "grad_norm": 0.07615797966718674, "kl": 0.28553467616438866, "learning_rate": 7.999910742676423e-06, "loss": -0.0337, "num_tokens": 12593014.0, "reward": 0.5389748811721802, "reward_std": 0.7568817734718323, "rewards/rollout_reward_func/mean": 0.5389748811721802, "rewards/rollout_reward_func/std": 0.7568817734718323, "sampling/importance_sampling_ratio/max": 1.4400990009307861, "sampling/importance_sampling_ratio/mean": 0.9632933139801025, "sampling/importance_sampling_ratio/min": 3.289821688667871e-05, "sampling/sampling_logp_difference/max": 1.7116496562957764, "sampling/sampling_logp_difference/mean": 0.2536059319972992, "step": 527, "step_time": 26.68089255099767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0419083833694458, "epoch": 0.00528, "grad_norm": 0.07782824337482452, "kl": 0.2886368874460459, "learning_rate": 7.999910378734379e-06, "loss": -0.0338, "step": 528, "step_time": 13.603073770034825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.896551609039307, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0383293265476823, "epoch": 0.00529, "frac_reward_zero_std": 0.0, "grad_norm": 0.05347391217947006, "kl": 0.4442136315628886, "learning_rate": 7.999910014051875e-06, "loss": -0.0798, "num_tokens": 12644391.0, "reward": 0.8582420349121094, "reward_std": 0.8897445201873779, "rewards/rollout_reward_func/mean": 0.8582420349121094, "rewards/rollout_reward_func/std": 0.8897445201873779, "sampling/importance_sampling_ratio/max": 1.2699567079544067, "sampling/importance_sampling_ratio/mean": 0.8110530376434326, "sampling/importance_sampling_ratio/min": 7.370963430730626e-05, "sampling/sampling_logp_difference/max": 1.412327766418457, "sampling/sampling_logp_difference/mean": 0.19130295515060425, "step": 529, "step_time": 31.05689930196968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.043142283335328, "epoch": 0.0053, "grad_norm": 0.05127966031432152, "kl": 0.44917613361030817, "learning_rate": 7.999909648628916e-06, "loss": -0.0799, "step": 530, "step_time": 15.827634380984819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 4.5416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.035416018217802, "epoch": 0.00531, "frac_reward_zero_std": 0.25, "grad_norm": 0.09531406313180923, "kl": 0.38543082028627396, "learning_rate": 7.999909282465499e-06, "loss": -0.0875, "num_tokens": 12700077.0, "reward": 0.483659029006958, "reward_std": 0.8819851279258728, "rewards/rollout_reward_func/mean": 0.483659029006958, "rewards/rollout_reward_func/std": 0.8819851279258728, "sampling/importance_sampling_ratio/max": 1.6420327425003052, "sampling/importance_sampling_ratio/mean": 0.7168556451797485, "sampling/importance_sampling_ratio/min": 2.87523789666011e-07, "sampling/sampling_logp_difference/max": 2.6258912086486816, "sampling/sampling_logp_difference/mean": 0.4116600453853607, "step": 531, "step_time": 28.651504558045417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0340306498110294, "epoch": 0.00532, "grad_norm": 0.09513047337532043, "kl": 0.38729854859411716, "learning_rate": 7.999908915561626e-06, "loss": -0.0873, "step": 532, "step_time": 13.704897979070665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7965711336582899, "epoch": 0.00533, "frac_reward_zero_std": 0.5, "grad_norm": 0.06440844386816025, "kl": 0.27922614850103855, "learning_rate": 7.999908547917295e-06, "loss": -0.0385, "num_tokens": 12742181.0, "reward": 1.0610175132751465, "reward_std": 0.7232203483581543, "rewards/rollout_reward_func/mean": 1.0610175132751465, "rewards/rollout_reward_func/std": 0.7232202887535095, "sampling/importance_sampling_ratio/max": 1.7590010166168213, "sampling/importance_sampling_ratio/mean": 0.9733471274375916, "sampling/importance_sampling_ratio/min": 2.15604104596423e-05, "sampling/sampling_logp_difference/max": 2.153876304626465, "sampling/sampling_logp_difference/mean": 0.18151447176933289, "step": 533, "step_time": 21.965444889996434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7939669359475374, "epoch": 0.00534, "grad_norm": 0.05944419279694557, "kl": 0.28825459629297256, "learning_rate": 7.999908179532507e-06, "loss": -0.0385, "step": 534, "step_time": 11.928438470029505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3254659352824092, "epoch": 0.00535, "frac_reward_zero_std": 0.0, "grad_norm": 0.0638042464852333, "kl": 1.1044013360515237, "learning_rate": 7.999907810407261e-06, "loss": -0.0898, "num_tokens": 12787192.0, "reward": 0.8772892951965332, "reward_std": 0.9321323037147522, "rewards/rollout_reward_func/mean": 0.8772892951965332, "rewards/rollout_reward_func/std": 0.9321323037147522, "sampling/importance_sampling_ratio/max": 1.168058156967163, "sampling/importance_sampling_ratio/mean": 0.7664487361907959, "sampling/importance_sampling_ratio/min": 2.710872195166303e-06, "sampling/sampling_logp_difference/max": 2.07433819770813, "sampling/sampling_logp_difference/mean": 0.2681725025177002, "step": 535, "step_time": 23.708943527075462 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01148897036910057, "entropy": 1.326244662515819, "epoch": 0.00536, "grad_norm": 0.055116429924964905, "kl": 1.0116725582629442, "learning_rate": 7.999907440541558e-06, "loss": -0.0901, "step": 536, "step_time": 12.727813726960449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.814815044403076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4145142957568169, "epoch": 0.00537, "frac_reward_zero_std": 0.25, "grad_norm": 0.10219409316778183, "kl": 0.45782945211976767, "learning_rate": 7.999907069935401e-06, "loss": -0.0366, "num_tokens": 12839355.0, "reward": 0.34555715322494507, "reward_std": 0.9161179661750793, "rewards/rollout_reward_func/mean": 0.34555715322494507, "rewards/rollout_reward_func/std": 0.9161179661750793, "sampling/importance_sampling_ratio/max": 1.346991777420044, "sampling/importance_sampling_ratio/mean": 0.7678658962249756, "sampling/importance_sampling_ratio/min": 4.472536238608882e-05, "sampling/sampling_logp_difference/max": 1.661221981048584, "sampling/sampling_logp_difference/mean": 0.27684658765792847, "step": 537, "step_time": 26.004226088058203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4175801649689674, "epoch": 0.00538, "grad_norm": 0.10064797103404999, "kl": 0.43950991332530975, "learning_rate": 7.999906698588786e-06, "loss": -0.0367, "step": 538, "step_time": 12.780433647043537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 5.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.152195394039154, "epoch": 0.00539, "frac_reward_zero_std": 0.0, "grad_norm": 0.5529802441596985, "kl": 0.1626409823074937, "learning_rate": 7.999906326501715e-06, "loss": -0.0868, "num_tokens": 12887457.0, "reward": 0.5816032886505127, "reward_std": 0.9468616247177124, "rewards/rollout_reward_func/mean": 0.5816032886505127, "rewards/rollout_reward_func/std": 0.9468615651130676, "sampling/importance_sampling_ratio/max": 1.2372691631317139, "sampling/importance_sampling_ratio/mean": 0.6448690891265869, "sampling/importance_sampling_ratio/min": 2.127137719298844e-07, "sampling/sampling_logp_difference/max": 1.864038109779358, "sampling/sampling_logp_difference/mean": 0.3667956590652466, "step": 539, "step_time": 23.44140547301504 }, { "clip_ratio/high_max": 0.02297794120386243, "clip_ratio/high_mean": 0.011488970601931214, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027113970601931214, "entropy": 2.12644000351429, "epoch": 0.0054, "grad_norm": 0.2138163298368454, "kl": 0.166935034096241, "learning_rate": 7.999905953674187e-06, "loss": -0.0896, "step": 540, "step_time": 11.815022660011891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.807692527770996, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.540712283924222, "epoch": 0.00541, "frac_reward_zero_std": 0.0, "grad_norm": 0.17369982600212097, "kl": 0.9106902191415429, "learning_rate": 7.999905580106204e-06, "loss": -0.0621, "num_tokens": 12938412.0, "reward": 0.764445424079895, "reward_std": 0.956785261631012, "rewards/rollout_reward_func/mean": 0.764445424079895, "rewards/rollout_reward_func/std": 0.956785261631012, "sampling/importance_sampling_ratio/max": 1.1954444646835327, "sampling/importance_sampling_ratio/mean": 0.7672334313392639, "sampling/importance_sampling_ratio/min": 4.768742201122222e-07, "sampling/sampling_logp_difference/max": 2.458155870437622, "sampling/sampling_logp_difference/mean": 0.2926706373691559, "step": 541, "step_time": 27.183876485010842 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 1.5379258058965206, "epoch": 0.00542, "grad_norm": 0.1302727311849594, "kl": 0.7460326757282019, "learning_rate": 7.999905205797764e-06, "loss": -0.0635, "step": 542, "step_time": 12.601827424980002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8304258361458778, "epoch": 0.00543, "frac_reward_zero_std": 0.0, "grad_norm": 0.10782383382320404, "kl": 0.4199704099446535, "learning_rate": 7.999904830748868e-06, "loss": -0.1067, "num_tokens": 12987957.0, "reward": 0.7256779074668884, "reward_std": 0.9552851915359497, "rewards/rollout_reward_func/mean": 0.7256779074668884, "rewards/rollout_reward_func/std": 0.9552851915359497, "sampling/importance_sampling_ratio/max": 1.724481463432312, "sampling/importance_sampling_ratio/mean": 0.7115721702575684, "sampling/importance_sampling_ratio/min": 9.824390872381628e-06, "sampling/sampling_logp_difference/max": 1.7158105373382568, "sampling/sampling_logp_difference/mean": 0.3684811294078827, "step": 543, "step_time": 29.904528660001233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8314388543367386, "epoch": 0.00544, "grad_norm": 0.1045791432261467, "kl": 0.41562676057219505, "learning_rate": 7.999904454959516e-06, "loss": -0.1067, "step": 544, "step_time": 15.33301510498859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 4.65217399597168, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2928127190098166, "epoch": 0.00545, "frac_reward_zero_std": 0.0, "grad_norm": 0.10885374248027802, "kl": 0.22855547163635492, "learning_rate": 7.999904078429708e-06, "loss": -0.0975, "num_tokens": 13033754.0, "reward": 0.4243926405906677, "reward_std": 0.921672523021698, "rewards/rollout_reward_func/mean": 0.4243926405906677, "rewards/rollout_reward_func/std": 0.921672523021698, "sampling/importance_sampling_ratio/max": 1.3667609691619873, "sampling/importance_sampling_ratio/mean": 0.709100067615509, "sampling/importance_sampling_ratio/min": 7.094307079569262e-07, "sampling/sampling_logp_difference/max": 2.3211731910705566, "sampling/sampling_logp_difference/mean": 0.4215683937072754, "step": 545, "step_time": 22.868928300973494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.295785597525537, "epoch": 0.00546, "grad_norm": 0.128087118268013, "kl": 0.22209205431863666, "learning_rate": 7.999903701159445e-06, "loss": -0.0979, "step": 546, "step_time": 13.112781480012927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.96875, "completions/mean_terminated_length": 5.71999979019165, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5915681160986423, "epoch": 0.00547, "frac_reward_zero_std": 0.0, "grad_norm": 0.09439007192850113, "kl": 0.48231089720502496, "learning_rate": 7.999903323148725e-06, "loss": -0.101, "num_tokens": 13093480.0, "reward": 0.496952623128891, "reward_std": 0.9013419151306152, "rewards/rollout_reward_func/mean": 0.496952623128891, "rewards/rollout_reward_func/std": 0.9013419151306152, "sampling/importance_sampling_ratio/max": 1.462908148765564, "sampling/importance_sampling_ratio/mean": 0.6786519289016724, "sampling/importance_sampling_ratio/min": 0.00044305570190772414, "sampling/sampling_logp_difference/max": 1.7191848754882812, "sampling/sampling_logp_difference/mean": 0.2935703992843628, "step": 547, "step_time": 27.46865358602372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5848494842648506, "epoch": 0.00548, "grad_norm": 0.09302642941474915, "kl": 0.5087404297664762, "learning_rate": 7.99990294439755e-06, "loss": -0.1009, "step": 548, "step_time": 13.708581449958729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 5.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9297618754208088, "epoch": 0.00549, "frac_reward_zero_std": 0.0, "grad_norm": 0.2797854244709015, "kl": 0.7191847460344434, "learning_rate": 7.999902564905919e-06, "loss": -0.0829, "num_tokens": 13142665.0, "reward": 0.35704880952835083, "reward_std": 0.8710498213768005, "rewards/rollout_reward_func/mean": 0.35704880952835083, "rewards/rollout_reward_func/std": 0.8710497617721558, "sampling/importance_sampling_ratio/max": 1.422981858253479, "sampling/importance_sampling_ratio/mean": 0.7876548171043396, "sampling/importance_sampling_ratio/min": 4.455167186279141e-07, "sampling/sampling_logp_difference/max": 2.16402530670166, "sampling/sampling_logp_difference/mean": 0.3658372759819031, "step": 549, "step_time": 25.566924549028045 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011488970601931214, "entropy": 1.9271032251417637, "epoch": 0.0055, "grad_norm": 0.17698456346988678, "kl": 0.5457548527047038, "learning_rate": 7.999902184673833e-06, "loss": -0.0852, "step": 550, "step_time": 13.311080128041795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.129032135009766, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4695922192186117, "epoch": 0.00551, "frac_reward_zero_std": 0.5, "grad_norm": 0.05853649601340294, "kl": 0.5897459276020527, "learning_rate": 7.999901803701292e-06, "loss": -0.0415, "num_tokens": 13181816.0, "reward": 1.3680319786071777, "reward_std": 0.412699431180954, "rewards/rollout_reward_func/mean": 1.3680319786071777, "rewards/rollout_reward_func/std": 0.412699431180954, "sampling/importance_sampling_ratio/max": 1.437813401222229, "sampling/importance_sampling_ratio/mean": 1.103028655052185, "sampling/importance_sampling_ratio/min": 2.682367394868379e-08, "sampling/sampling_logp_difference/max": 2.331829071044922, "sampling/sampling_logp_difference/mean": 0.17170962691307068, "step": 551, "step_time": 19.12496875601937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.46257399674504995, "epoch": 0.00552, "grad_norm": 0.044718027114868164, "kl": 0.5229595825076103, "learning_rate": 7.999901421988296e-06, "loss": -0.0418, "step": 552, "step_time": 9.958009307039902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.409090995788574, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7465232834219933, "epoch": 0.00553, "frac_reward_zero_std": 0.0, "grad_norm": 0.13380643725395203, "kl": 1.032693576067686, "learning_rate": 7.999901039534843e-06, "loss": -0.0944, "num_tokens": 13239582.0, "reward": 0.6114916801452637, "reward_std": 0.8940340876579285, "rewards/rollout_reward_func/mean": 0.6114916801452637, "rewards/rollout_reward_func/std": 0.8940340876579285, "sampling/importance_sampling_ratio/max": 1.3790172338485718, "sampling/importance_sampling_ratio/mean": 0.7050700783729553, "sampling/importance_sampling_ratio/min": 7.424322575388942e-06, "sampling/sampling_logp_difference/max": 1.5491275787353516, "sampling/sampling_logp_difference/mean": 0.2860586941242218, "step": 553, "step_time": 25.64894254799583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7509892955422401, "epoch": 0.00554, "grad_norm": 0.11224152892827988, "kl": 1.0249285390600562, "learning_rate": 7.999900656340938e-06, "loss": -0.0944, "step": 554, "step_time": 11.974039629974868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4577694199979305, "epoch": 0.00555, "frac_reward_zero_std": 0.0, "grad_norm": 0.07925105094909668, "kl": 0.6227205507457256, "learning_rate": 7.999900272406576e-06, "loss": -0.0913, "num_tokens": 13290120.0, "reward": 0.5294680595397949, "reward_std": 0.9067544937133789, "rewards/rollout_reward_func/mean": 0.5294680595397949, "rewards/rollout_reward_func/std": 0.9067544937133789, "sampling/importance_sampling_ratio/max": 1.3646984100341797, "sampling/importance_sampling_ratio/mean": 0.792282223701477, "sampling/importance_sampling_ratio/min": 0.0001624840369913727, "sampling/sampling_logp_difference/max": 1.99066960811615, "sampling/sampling_logp_difference/mean": 0.277706116437912, "step": 555, "step_time": 26.821090920013376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4595013838261366, "epoch": 0.00556, "grad_norm": 0.062247443944215775, "kl": 0.5762322712689638, "learning_rate": 7.999899887731757e-06, "loss": -0.0918, "step": 556, "step_time": 14.502531818958232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.363636493682861, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9133184105157852, "epoch": 0.00557, "frac_reward_zero_std": 0.0, "grad_norm": 0.06414183974266052, "kl": 0.9821929186582565, "learning_rate": 7.999899502316487e-06, "loss": -0.0963, "num_tokens": 13342302.0, "reward": 0.32857203483581543, "reward_std": 0.7442960143089294, "rewards/rollout_reward_func/mean": 0.32857203483581543, "rewards/rollout_reward_func/std": 0.7442960143089294, "sampling/importance_sampling_ratio/max": 1.5535794496536255, "sampling/importance_sampling_ratio/mean": 0.6787868738174438, "sampling/importance_sampling_ratio/min": 5.365354240893794e-07, "sampling/sampling_logp_difference/max": 2.187575578689575, "sampling/sampling_logp_difference/mean": 0.38517120480537415, "step": 557, "step_time": 29.16852268401999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.9071518927812576, "epoch": 0.00558, "grad_norm": 0.0647462010383606, "kl": 1.0185040142387152, "learning_rate": 7.999899116160762e-06, "loss": -0.0962, "step": 558, "step_time": 14.24572175994399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9756274372339249, "epoch": 0.00559, "frac_reward_zero_std": 0.0, "grad_norm": 0.15988172590732574, "kl": 0.8331594783812761, "learning_rate": 7.99989872926458e-06, "loss": -0.0645, "num_tokens": 13396125.0, "reward": 0.16046568751335144, "reward_std": 0.9132995009422302, "rewards/rollout_reward_func/mean": 0.16046568751335144, "rewards/rollout_reward_func/std": 0.9132994413375854, "sampling/importance_sampling_ratio/max": 1.3467495441436768, "sampling/importance_sampling_ratio/mean": 0.7174978256225586, "sampling/importance_sampling_ratio/min": 8.436413190793246e-05, "sampling/sampling_logp_difference/max": 2.2250449657440186, "sampling/sampling_logp_difference/mean": 0.3213084936141968, "step": 559, "step_time": 26.52147514204262 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.9843303561210632, "epoch": 0.0056, "grad_norm": 0.1407298445701599, "kl": 0.7874693991616368, "learning_rate": 7.999898341627945e-06, "loss": -0.0648, "step": 560, "step_time": 13.377844791015377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.137930870056152, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7011589128524065, "epoch": 0.00561, "frac_reward_zero_std": 0.25, "grad_norm": 0.09907400608062744, "kl": 0.731447147205472, "learning_rate": 7.999897953250855e-06, "loss": -0.0412, "num_tokens": 13443450.0, "reward": 0.6676706075668335, "reward_std": 0.9157580733299255, "rewards/rollout_reward_func/mean": 0.6676706075668335, "rewards/rollout_reward_func/std": 0.915757954120636, "sampling/importance_sampling_ratio/max": 1.2830246686935425, "sampling/importance_sampling_ratio/mean": 0.9195060729980469, "sampling/importance_sampling_ratio/min": 0.0006279113003984094, "sampling/sampling_logp_difference/max": 1.4828131198883057, "sampling/sampling_logp_difference/mean": 0.17097041010856628, "step": 561, "step_time": 24.194001266005216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7085807304829359, "epoch": 0.00562, "grad_norm": 0.09200316667556763, "kl": 0.649160472676158, "learning_rate": 7.999897564133312e-06, "loss": -0.0415, "step": 562, "step_time": 12.224458461016184 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 5.192307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5723248198628426, "epoch": 0.00563, "frac_reward_zero_std": 0.0, "grad_norm": 0.05380956456065178, "kl": 0.49949554167687893, "learning_rate": 7.999897174275314e-06, "loss": -0.1061, "num_tokens": 13496601.0, "reward": 0.7927106618881226, "reward_std": 0.881935179233551, "rewards/rollout_reward_func/mean": 0.7927106618881226, "rewards/rollout_reward_func/std": 0.881935179233551, "sampling/importance_sampling_ratio/max": 1.3147486448287964, "sampling/importance_sampling_ratio/mean": 0.745056688785553, "sampling/importance_sampling_ratio/min": 2.645019492319989e-07, "sampling/sampling_logp_difference/max": 2.5468804836273193, "sampling/sampling_logp_difference/mean": 0.3446730971336365, "step": 563, "step_time": 24.993969914037734 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 1.5743779800832272, "epoch": 0.00564, "grad_norm": 0.053950294852256775, "kl": 0.45304802991449833, "learning_rate": 7.999896783676862e-06, "loss": -0.1062, "step": 564, "step_time": 12.765202674054308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.584370993077755, "epoch": 0.00565, "frac_reward_zero_std": 0.0, "grad_norm": 0.061571720987558365, "kl": 0.29686099849641323, "learning_rate": 7.999896392337957e-06, "loss": -0.0858, "num_tokens": 13556284.0, "reward": 0.31189996004104614, "reward_std": 0.7635747194290161, "rewards/rollout_reward_func/mean": 0.31189996004104614, "rewards/rollout_reward_func/std": 0.7635747194290161, "sampling/importance_sampling_ratio/max": 1.3955280780792236, "sampling/importance_sampling_ratio/mean": 0.8352674245834351, "sampling/importance_sampling_ratio/min": 3.186805042787455e-05, "sampling/sampling_logp_difference/max": 2.0516929626464844, "sampling/sampling_logp_difference/mean": 0.30462682247161865, "step": 565, "step_time": 30.72564148597303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.582691565155983, "epoch": 0.00566, "grad_norm": 0.05972716584801674, "kl": 0.3054721560329199, "learning_rate": 7.999896000258596e-06, "loss": -0.0858, "step": 566, "step_time": 16.92241856997134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.84375, "completions/mean_terminated_length": 4.100000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8073283415287733, "epoch": 0.00567, "frac_reward_zero_std": 0.25, "grad_norm": 0.2003570795059204, "kl": 0.512157553806901, "learning_rate": 7.999895607438783e-06, "loss": -0.0162, "num_tokens": 13611643.0, "reward": 0.5707868337631226, "reward_std": 0.7890711426734924, "rewards/rollout_reward_func/mean": 0.5707868337631226, "rewards/rollout_reward_func/std": 0.7890710830688477, "sampling/importance_sampling_ratio/max": 1.742681860923767, "sampling/importance_sampling_ratio/mean": 1.001569151878357, "sampling/importance_sampling_ratio/min": 3.2152470907931274e-07, "sampling/sampling_logp_difference/max": 2.1139237880706787, "sampling/sampling_logp_difference/mean": 0.1927293837070465, "step": 567, "step_time": 23.209512331988662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8139800671488047, "epoch": 0.00568, "grad_norm": 0.20425046980381012, "kl": 0.5047579687088728, "learning_rate": 7.999895213878515e-06, "loss": -0.0168, "step": 568, "step_time": 12.547378166986164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 5.159999847412109, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.128646584227681, "epoch": 0.00569, "frac_reward_zero_std": 0.0, "grad_norm": 0.15376336872577667, "kl": 0.6448492985218763, "learning_rate": 7.999894819577795e-06, "loss": -0.0833, "num_tokens": 13664489.0, "reward": 0.5794107913970947, "reward_std": 0.963969886302948, "rewards/rollout_reward_func/mean": 0.5794107913970947, "rewards/rollout_reward_func/std": 0.963969886302948, "sampling/importance_sampling_ratio/max": 1.8087122440338135, "sampling/importance_sampling_ratio/mean": 0.6969685554504395, "sampling/importance_sampling_ratio/min": 9.728542949005714e-08, "sampling/sampling_logp_difference/max": 2.3220748901367188, "sampling/sampling_logp_difference/mean": 0.41481679677963257, "step": 569, "step_time": 26.5184664830158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.135938785970211, "epoch": 0.0057, "grad_norm": 0.14168879389762878, "kl": 0.6229180991649628, "learning_rate": 7.99989442453662e-06, "loss": -0.0836, "step": 570, "step_time": 13.972492628032342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.888888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5757501684129238, "epoch": 0.00571, "frac_reward_zero_std": 0.0, "grad_norm": 0.04457739368081093, "kl": 1.1511543928645551, "learning_rate": 7.999894028754991e-06, "loss": -0.0892, "num_tokens": 13714322.0, "reward": 0.668411374092102, "reward_std": 0.8856760263442993, "rewards/rollout_reward_func/mean": 0.668411374092102, "rewards/rollout_reward_func/std": 0.8856760263442993, "sampling/importance_sampling_ratio/max": 1.2882452011108398, "sampling/importance_sampling_ratio/mean": 0.7606403827667236, "sampling/importance_sampling_ratio/min": 3.656723990275168e-08, "sampling/sampling_logp_difference/max": 2.1770310401916504, "sampling/sampling_logp_difference/mean": 0.32821187376976013, "step": 571, "step_time": 26.895688221033197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5796822998672724, "epoch": 0.00572, "grad_norm": 0.040116406977176666, "kl": 1.0777154294773936, "learning_rate": 7.99989363223291e-06, "loss": -0.0892, "step": 572, "step_time": 14.245945074013434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.480000019073486, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2573661226779222, "epoch": 0.00573, "frac_reward_zero_std": 0.0, "grad_norm": 0.06903836876153946, "kl": 0.6031562350690365, "learning_rate": 7.999893234970375e-06, "loss": -0.089, "num_tokens": 13774651.0, "reward": 0.7530867457389832, "reward_std": 0.8056772947311401, "rewards/rollout_reward_func/mean": 0.7530867457389832, "rewards/rollout_reward_func/std": 0.8056772947311401, "sampling/importance_sampling_ratio/max": 1.325884461402893, "sampling/importance_sampling_ratio/mean": 0.8566370010375977, "sampling/importance_sampling_ratio/min": 7.265948398504918e-10, "sampling/sampling_logp_difference/max": 1.8750704526901245, "sampling/sampling_logp_difference/mean": 0.3298988342285156, "step": 573, "step_time": 29.34198083897354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2589746117591858, "epoch": 0.00574, "grad_norm": 0.068872831761837, "kl": 0.5779284732416272, "learning_rate": 7.999892836967388e-06, "loss": -0.089, "step": 574, "step_time": 15.67124736995902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.5217390060424805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.006985370069742, "epoch": 0.00575, "frac_reward_zero_std": 0.0, "grad_norm": 0.035163603723049164, "kl": 0.304120073094964, "learning_rate": 7.999892438223946e-06, "loss": -0.0907, "num_tokens": 13827522.0, "reward": 0.3281025290489197, "reward_std": 0.7636971473693848, "rewards/rollout_reward_func/mean": 0.3281025290489197, "rewards/rollout_reward_func/std": 0.7636971473693848, "sampling/importance_sampling_ratio/max": 1.3374629020690918, "sampling/importance_sampling_ratio/mean": 0.7058221101760864, "sampling/importance_sampling_ratio/min": 5.4090008916318766e-08, "sampling/sampling_logp_difference/max": 2.446371078491211, "sampling/sampling_logp_difference/mean": 0.37578362226486206, "step": 575, "step_time": 29.654528470040532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.010117245838046, "epoch": 0.00576, "grad_norm": 0.03503134101629257, "kl": 0.30391597375273705, "learning_rate": 7.999892038740054e-06, "loss": -0.0907, "step": 576, "step_time": 15.808368070051074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 4.653846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5938767958432436, "epoch": 0.00577, "frac_reward_zero_std": 0.0, "grad_norm": 0.07618820667266846, "kl": 0.9459665566682816, "learning_rate": 7.999891638515707e-06, "loss": -0.0925, "num_tokens": 13889347.0, "reward": 0.44966763257980347, "reward_std": 0.8452543616294861, "rewards/rollout_reward_func/mean": 0.44966763257980347, "rewards/rollout_reward_func/std": 0.8452543616294861, "sampling/importance_sampling_ratio/max": 1.4083482027053833, "sampling/importance_sampling_ratio/mean": 0.7489627003669739, "sampling/importance_sampling_ratio/min": 2.5809109502006322e-05, "sampling/sampling_logp_difference/max": 1.58109712600708, "sampling/sampling_logp_difference/mean": 0.32993701100349426, "step": 577, "step_time": 27.499183760985034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5882397834211588, "epoch": 0.00578, "grad_norm": 0.07120843976736069, "kl": 0.8937862310558558, "learning_rate": 7.999891237550908e-06, "loss": -0.0927, "step": 578, "step_time": 13.889780035038712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5853361710906029, "epoch": 0.00579, "frac_reward_zero_std": 0.25, "grad_norm": 0.15014569461345673, "kl": 0.1857196381315589, "learning_rate": 7.999890835845657e-06, "loss": -0.0642, "num_tokens": 13939085.0, "reward": 0.3181857764720917, "reward_std": 0.8228017687797546, "rewards/rollout_reward_func/mean": 0.3181857764720917, "rewards/rollout_reward_func/std": 0.8228017091751099, "sampling/importance_sampling_ratio/max": 1.2421530485153198, "sampling/importance_sampling_ratio/mean": 0.6784012317657471, "sampling/importance_sampling_ratio/min": 4.560463821690064e-06, "sampling/sampling_logp_difference/max": 2.2254421710968018, "sampling/sampling_logp_difference/mean": 0.2654675245285034, "step": 579, "step_time": 32.10093669197522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5796749889850616, "epoch": 0.0058, "grad_norm": 0.1711236834526062, "kl": 0.18851157650351524, "learning_rate": 7.999890433399953e-06, "loss": -0.065, "step": 580, "step_time": 14.364685846987413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.136237733066082, "epoch": 0.00581, "frac_reward_zero_std": 0.0, "grad_norm": 0.09661009907722473, "kl": 0.6962293237447739, "learning_rate": 7.999890030213796e-06, "loss": -0.074, "num_tokens": 13992170.0, "reward": 0.8272750377655029, "reward_std": 0.8009769320487976, "rewards/rollout_reward_func/mean": 0.8272750377655029, "rewards/rollout_reward_func/std": 0.8009769320487976, "sampling/importance_sampling_ratio/max": 1.2752114534378052, "sampling/importance_sampling_ratio/mean": 0.8605276346206665, "sampling/importance_sampling_ratio/min": 7.141586593206739e-07, "sampling/sampling_logp_difference/max": 1.8066917657852173, "sampling/sampling_logp_difference/mean": 0.2818853259086609, "step": 581, "step_time": 26.105576996982563 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.1274949871003628, "epoch": 0.00582, "grad_norm": 0.09561453759670258, "kl": 0.6949430480599403, "learning_rate": 7.999889626287187e-06, "loss": -0.0739, "step": 582, "step_time": 13.69310441595735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5836471412330866, "epoch": 0.00583, "frac_reward_zero_std": 0.0, "grad_norm": 0.04985359311103821, "kl": 0.3184788804501295, "learning_rate": 7.999889221620126e-06, "loss": -0.0935, "num_tokens": 14039002.0, "reward": 0.6859559416770935, "reward_std": 1.0032620429992676, "rewards/rollout_reward_func/mean": 0.6859559416770935, "rewards/rollout_reward_func/std": 1.0032620429992676, "sampling/importance_sampling_ratio/max": 1.2759040594100952, "sampling/importance_sampling_ratio/mean": 0.7383629679679871, "sampling/importance_sampling_ratio/min": 1.318441036346485e-06, "sampling/sampling_logp_difference/max": 2.136033535003662, "sampling/sampling_logp_difference/mean": 0.29708099365234375, "step": 583, "step_time": 29.560413312981836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 1.5804088860750198, "epoch": 0.00584, "grad_norm": 0.047492656856775284, "kl": 0.32934044301509857, "learning_rate": 7.999888816212612e-06, "loss": -0.0936, "step": 584, "step_time": 13.741101477004122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.310344696044922, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2250808514654636, "epoch": 0.00585, "frac_reward_zero_std": 0.0, "grad_norm": 0.15233179926872253, "kl": 0.7312899567186832, "learning_rate": 7.999888410064647e-06, "loss": -0.0486, "num_tokens": 14089195.0, "reward": 0.34338170289993286, "reward_std": 0.7506309747695923, "rewards/rollout_reward_func/mean": 0.34338170289993286, "rewards/rollout_reward_func/std": 0.7506309747695923, "sampling/importance_sampling_ratio/max": 1.2112938165664673, "sampling/importance_sampling_ratio/mean": 0.8282781839370728, "sampling/importance_sampling_ratio/min": 0.0001109595104935579, "sampling/sampling_logp_difference/max": 1.826097011566162, "sampling/sampling_logp_difference/mean": 0.2316676676273346, "step": 585, "step_time": 24.017082281934563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2246467173099518, "epoch": 0.00586, "grad_norm": 0.14470703899860382, "kl": 0.7398058474063873, "learning_rate": 7.99988800317623e-06, "loss": -0.0485, "step": 586, "step_time": 12.91858411906287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.600000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8928221575915813, "epoch": 0.00587, "frac_reward_zero_std": 0.0, "grad_norm": 0.06424601376056671, "kl": 0.7269137352705002, "learning_rate": 7.99988759554736e-06, "loss": -0.0653, "num_tokens": 14135136.0, "reward": 0.8665563464164734, "reward_std": 0.7522395849227905, "rewards/rollout_reward_func/mean": 0.8665563464164734, "rewards/rollout_reward_func/std": 0.7522396445274353, "sampling/importance_sampling_ratio/max": 1.203321933746338, "sampling/importance_sampling_ratio/mean": 0.8631455898284912, "sampling/importance_sampling_ratio/min": 0.0006981155020184815, "sampling/sampling_logp_difference/max": 1.4718799591064453, "sampling/sampling_logp_difference/mean": 0.196748286485672, "step": 587, "step_time": 19.586418827995658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8928107228130102, "epoch": 0.00588, "grad_norm": 0.06761535257101059, "kl": 0.7509621661156416, "learning_rate": 7.999887187178037e-06, "loss": -0.0652, "step": 588, "step_time": 10.661115159949986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.592592716217041, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5447262972593307, "epoch": 0.00589, "frac_reward_zero_std": 0.25, "grad_norm": 0.1705378293991089, "kl": 0.43907404504716396, "learning_rate": 7.999886778068266e-06, "loss": -0.0209, "num_tokens": 14191704.0, "reward": 0.29200655221939087, "reward_std": 0.8364866971969604, "rewards/rollout_reward_func/mean": 0.29200655221939087, "rewards/rollout_reward_func/std": 0.8364866971969604, "sampling/importance_sampling_ratio/max": 1.3736299276351929, "sampling/importance_sampling_ratio/mean": 0.723455548286438, "sampling/importance_sampling_ratio/min": 0.00025929120602086186, "sampling/sampling_logp_difference/max": 1.5489540100097656, "sampling/sampling_logp_difference/mean": 0.2783692479133606, "step": 589, "step_time": 24.808576775947586 }, { "clip_ratio/high_max": 0.028273810632526875, "clip_ratio/high_mean": 0.014136905316263437, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014136905316263437, "entropy": 1.5530180260539055, "epoch": 0.0059, "grad_norm": 0.04654763638973236, "kl": 0.42530455719679594, "learning_rate": 7.999886368218042e-06, "loss": -0.0213, "step": 590, "step_time": 12.977532154036453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.799999713897705, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0391951203346252, "epoch": 0.00591, "frac_reward_zero_std": 0.0, "grad_norm": 0.08861362189054489, "kl": 0.7376598343253136, "learning_rate": 7.999885957627366e-06, "loss": -0.0788, "num_tokens": 14255498.0, "reward": 0.2950173020362854, "reward_std": 0.7688872218132019, "rewards/rollout_reward_func/mean": 0.2950173020362854, "rewards/rollout_reward_func/std": 0.7688871622085571, "sampling/importance_sampling_ratio/max": 1.1860624551773071, "sampling/importance_sampling_ratio/mean": 0.6678770184516907, "sampling/importance_sampling_ratio/min": 9.752593177481117e-10, "sampling/sampling_logp_difference/max": 2.2483739852905273, "sampling/sampling_logp_difference/mean": 0.4980851113796234, "step": 591, "step_time": 32.933476853970205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0381519496440887, "epoch": 0.00592, "grad_norm": 0.07404807209968567, "kl": 0.654971357434988, "learning_rate": 7.99988554629624e-06, "loss": -0.0791, "step": 592, "step_time": 16.361251823982457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.700000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4035434760153294, "epoch": 0.00593, "frac_reward_zero_std": 0.0, "grad_norm": 0.10126062482595444, "kl": 0.9130404125899076, "learning_rate": 7.999885134224663e-06, "loss": -0.0738, "num_tokens": 14313723.0, "reward": 0.4197009205818176, "reward_std": 0.7559742331504822, "rewards/rollout_reward_func/mean": 0.4197009205818176, "rewards/rollout_reward_func/std": 0.7559742331504822, "sampling/importance_sampling_ratio/max": 1.4950647354125977, "sampling/importance_sampling_ratio/mean": 0.7909085750579834, "sampling/importance_sampling_ratio/min": 3.266963517489785e-07, "sampling/sampling_logp_difference/max": 1.8323678970336914, "sampling/sampling_logp_difference/mean": 0.33656230568885803, "step": 593, "step_time": 27.210200922039803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4162170514464378, "epoch": 0.00594, "grad_norm": 0.08957070112228394, "kl": 0.835867278277874, "learning_rate": 7.999884721412632e-06, "loss": -0.074, "step": 594, "step_time": 15.444127213035244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 4.678571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.506237704306841, "epoch": 0.00595, "frac_reward_zero_std": 0.25, "grad_norm": 0.08073657006025314, "kl": 0.3809766471385956, "learning_rate": 7.999884307860151e-06, "loss": -0.0815, "num_tokens": 14363067.0, "reward": 0.4787759482860565, "reward_std": 0.8885853886604309, "rewards/rollout_reward_func/mean": 0.4787759482860565, "rewards/rollout_reward_func/std": 0.8885853886604309, "sampling/importance_sampling_ratio/max": 2.021033763885498, "sampling/importance_sampling_ratio/mean": 0.7800809741020203, "sampling/importance_sampling_ratio/min": 4.118144047993155e-09, "sampling/sampling_logp_difference/max": 2.3632090091705322, "sampling/sampling_logp_difference/mean": 0.3513804078102112, "step": 595, "step_time": 22.262797896983102 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.5212066546082497, "epoch": 0.00596, "grad_norm": 0.08205923438072205, "kl": 0.3718995023518801, "learning_rate": 7.999883893567221e-06, "loss": -0.0816, "step": 596, "step_time": 11.38605077500688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2759842704981565, "epoch": 0.00597, "frac_reward_zero_std": 0.0, "grad_norm": 0.04305852949619293, "kl": 0.34251839481294155, "learning_rate": 7.999883478533838e-06, "loss": -0.0815, "num_tokens": 14407607.0, "reward": 0.794040322303772, "reward_std": 0.8886386752128601, "rewards/rollout_reward_func/mean": 0.794040322303772, "rewards/rollout_reward_func/std": 0.8886386156082153, "sampling/importance_sampling_ratio/max": 1.236463189125061, "sampling/importance_sampling_ratio/mean": 0.824446976184845, "sampling/importance_sampling_ratio/min": 1.3677921018029338e-08, "sampling/sampling_logp_difference/max": 2.0245766639709473, "sampling/sampling_logp_difference/mean": 0.2504385709762573, "step": 597, "step_time": 24.559735114045907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2788099572062492, "epoch": 0.00598, "grad_norm": 0.045469578355550766, "kl": 0.32718458119779825, "learning_rate": 7.999883062760005e-06, "loss": -0.0815, "step": 598, "step_time": 12.527495402056957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.03125, "completions/mean_terminated_length": 4.677419185638428, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8933786004781723, "epoch": 0.00599, "frac_reward_zero_std": 0.25, "grad_norm": 0.2807352840900421, "kl": 0.36446768045425415, "learning_rate": 7.999882646245722e-06, "loss": -0.0328, "num_tokens": 14450231.0, "reward": 0.8585637807846069, "reward_std": 0.6998790502548218, "rewards/rollout_reward_func/mean": 0.8585637807846069, "rewards/rollout_reward_func/std": 0.6998790502548218, "sampling/importance_sampling_ratio/max": 1.5451250076293945, "sampling/importance_sampling_ratio/mean": 0.907063364982605, "sampling/importance_sampling_ratio/min": 0.0005254809511825442, "sampling/sampling_logp_difference/max": 1.8660669326782227, "sampling/sampling_logp_difference/mean": 0.1725243180990219, "step": 599, "step_time": 21.946393718011677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.8975747041404247, "epoch": 0.006, "grad_norm": 0.05173974856734276, "kl": 0.35700137726962566, "learning_rate": 7.999882228990988e-06, "loss": -0.0344, "step": 600, "step_time": 12.405745060998015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 5.142857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5379819199442863, "epoch": 0.00601, "frac_reward_zero_std": 0.0, "grad_norm": 0.13478054106235504, "kl": 0.4123996999114752, "learning_rate": 7.999881810995804e-06, "loss": -0.0657, "num_tokens": 14502985.0, "reward": 0.2879352569580078, "reward_std": 0.8082134127616882, "rewards/rollout_reward_func/mean": 0.2879352569580078, "rewards/rollout_reward_func/std": 0.8082134127616882, "sampling/importance_sampling_ratio/max": 1.3803198337554932, "sampling/importance_sampling_ratio/mean": 0.7880194187164307, "sampling/importance_sampling_ratio/min": 2.1878769018712774e-07, "sampling/sampling_logp_difference/max": 2.098687171936035, "sampling/sampling_logp_difference/mean": 0.2911539673805237, "step": 601, "step_time": 22.94814335097908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.546728953719139, "epoch": 0.00602, "grad_norm": 0.14114612340927124, "kl": 0.4070178512483835, "learning_rate": 7.99988139226017e-06, "loss": -0.0658, "step": 602, "step_time": 11.756811709026806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5317136496305466, "epoch": 0.00603, "frac_reward_zero_std": 0.25, "grad_norm": 0.03958636894822121, "kl": 0.3266781996935606, "learning_rate": 7.999880972784085e-06, "loss": -0.072, "num_tokens": 14554161.0, "reward": 0.7125345468521118, "reward_std": 0.9513958692550659, "rewards/rollout_reward_func/mean": 0.7125345468521118, "rewards/rollout_reward_func/std": 0.9513958692550659, "sampling/importance_sampling_ratio/max": 1.7535752058029175, "sampling/importance_sampling_ratio/mean": 0.7892395257949829, "sampling/importance_sampling_ratio/min": 2.9297373416170558e-08, "sampling/sampling_logp_difference/max": 2.33821964263916, "sampling/sampling_logp_difference/mean": 0.30011439323425293, "step": 603, "step_time": 24.366040187014733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5304265972226858, "epoch": 0.00604, "grad_norm": 0.0390588715672493, "kl": 0.3342397939413786, "learning_rate": 7.99988055256755e-06, "loss": -0.0719, "step": 604, "step_time": 12.423268738057232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 4.039999961853027, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6463601998984814, "epoch": 0.00605, "frac_reward_zero_std": 0.0, "grad_norm": 0.0439196452498436, "kl": 0.31661967746913433, "learning_rate": 7.999880131610566e-06, "loss": -0.0709, "num_tokens": 14610842.0, "reward": 0.7034456729888916, "reward_std": 0.8407367467880249, "rewards/rollout_reward_func/mean": 0.7034456729888916, "rewards/rollout_reward_func/std": 0.8407366871833801, "sampling/importance_sampling_ratio/max": 1.3344361782073975, "sampling/importance_sampling_ratio/mean": 0.8543407917022705, "sampling/importance_sampling_ratio/min": 5.695436122721276e-09, "sampling/sampling_logp_difference/max": 1.999410629272461, "sampling/sampling_logp_difference/mean": 0.3778407573699951, "step": 605, "step_time": 25.442453880008543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6464902237057686, "epoch": 0.00606, "grad_norm": 0.04388470947742462, "kl": 0.31819187477231026, "learning_rate": 7.99987970991313e-06, "loss": -0.0709, "step": 606, "step_time": 12.623503528040601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.854657843708992, "epoch": 0.00607, "frac_reward_zero_std": 0.0, "grad_norm": 0.10117188841104507, "kl": 0.5813536457717419, "learning_rate": 7.999879287475248e-06, "loss": -0.0657, "num_tokens": 14666163.0, "reward": 0.5361707806587219, "reward_std": 0.8305320143699646, "rewards/rollout_reward_func/mean": 0.5361707806587219, "rewards/rollout_reward_func/std": 0.8305320739746094, "sampling/importance_sampling_ratio/max": 1.2811822891235352, "sampling/importance_sampling_ratio/mean": 0.7305300235748291, "sampling/importance_sampling_ratio/min": 9.475404567638179e-07, "sampling/sampling_logp_difference/max": 2.2976603507995605, "sampling/sampling_logp_difference/mean": 0.3545718491077423, "step": 607, "step_time": 25.090159716957714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8683657795190811, "epoch": 0.00608, "grad_norm": 0.10660681873559952, "kl": 0.5780824981629848, "learning_rate": 7.999878864296913e-06, "loss": -0.066, "step": 608, "step_time": 12.679628214944387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 5.296296119689941, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5390233173966408, "epoch": 0.00609, "frac_reward_zero_std": 0.0, "grad_norm": 0.053075503557920456, "kl": 0.3959229998290539, "learning_rate": 7.999878440378129e-06, "loss": -0.0616, "num_tokens": 14719347.0, "reward": 0.8235484957695007, "reward_std": 0.9167829155921936, "rewards/rollout_reward_func/mean": 0.8235484957695007, "rewards/rollout_reward_func/std": 0.9167829751968384, "sampling/importance_sampling_ratio/max": 1.3112601041793823, "sampling/importance_sampling_ratio/mean": 0.7252441644668579, "sampling/importance_sampling_ratio/min": 1.1050573220927618e-06, "sampling/sampling_logp_difference/max": 2.2527589797973633, "sampling/sampling_logp_difference/mean": 0.3151913285255432, "step": 609, "step_time": 26.62830817597569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5472238194197416, "epoch": 0.0061, "grad_norm": 0.052229683846235275, "kl": 0.38357938453555107, "learning_rate": 7.999878015718894e-06, "loss": -0.0617, "step": 610, "step_time": 13.6119786890049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2171510495245457, "epoch": 0.00611, "frac_reward_zero_std": 0.25, "grad_norm": 0.08324611186981201, "kl": 0.28027218021452427, "learning_rate": 7.999877590319212e-06, "loss": -0.0669, "num_tokens": 14761866.0, "reward": 1.105849266052246, "reward_std": 0.7177499532699585, "rewards/rollout_reward_func/mean": 1.105849266052246, "rewards/rollout_reward_func/std": 0.7177499532699585, "sampling/importance_sampling_ratio/max": 1.2753020524978638, "sampling/importance_sampling_ratio/mean": 0.85759037733078, "sampling/importance_sampling_ratio/min": 3.525414626892598e-08, "sampling/sampling_logp_difference/max": 2.3263635635375977, "sampling/sampling_logp_difference/mean": 0.21862952411174774, "step": 611, "step_time": 23.536337598983664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.224491786211729, "epoch": 0.00612, "grad_norm": 0.08638058602809906, "kl": 0.27287718933075666, "learning_rate": 7.99987716417908e-06, "loss": -0.0667, "step": 612, "step_time": 11.902999392012134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 5.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9538464844226837, "epoch": 0.00613, "frac_reward_zero_std": 0.0, "grad_norm": 0.07357621192932129, "kl": 0.5007764454931021, "learning_rate": 7.999876737298501e-06, "loss": -0.0903, "num_tokens": 14823048.0, "reward": 0.16035953164100647, "reward_std": 0.7038300037384033, "rewards/rollout_reward_func/mean": 0.16035953164100647, "rewards/rollout_reward_func/std": 0.7038300037384033, "sampling/importance_sampling_ratio/max": 1.344364047050476, "sampling/importance_sampling_ratio/mean": 0.6864049434661865, "sampling/importance_sampling_ratio/min": 2.4681019567651674e-05, "sampling/sampling_logp_difference/max": 1.5623836517333984, "sampling/sampling_logp_difference/mean": 0.3822208046913147, "step": 613, "step_time": 26.720134407049045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9446582645177841, "epoch": 0.00614, "grad_norm": 0.07576391100883484, "kl": 0.5055045671761036, "learning_rate": 7.99987630967747e-06, "loss": -0.0905, "step": 614, "step_time": 12.932758608018048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 5.758620738983154, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.011517085134983, "epoch": 0.00615, "frac_reward_zero_std": 0.0, "grad_norm": 0.08721795678138733, "kl": 0.7963624466210604, "learning_rate": 7.99987588131599e-06, "loss": -0.0935, "num_tokens": 14873842.0, "reward": 0.7191935777664185, "reward_std": 0.8914937973022461, "rewards/rollout_reward_func/mean": 0.7191935777664185, "rewards/rollout_reward_func/std": 0.8914937973022461, "sampling/importance_sampling_ratio/max": 1.3112738132476807, "sampling/importance_sampling_ratio/mean": 0.6989266872406006, "sampling/importance_sampling_ratio/min": 5.321550133885466e-07, "sampling/sampling_logp_difference/max": 3.1543986797332764, "sampling/sampling_logp_difference/mean": 0.4129783511161804, "step": 615, "step_time": 27.41540984704625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0042583346366882, "epoch": 0.00616, "grad_norm": 0.08565042167901993, "kl": 0.7927328888326883, "learning_rate": 7.999875452214063e-06, "loss": -0.0938, "step": 616, "step_time": 14.287738486978924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 5.2727274894714355, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.608550578355789, "epoch": 0.00617, "frac_reward_zero_std": 0.0, "grad_norm": 0.12686435878276825, "kl": 0.22925933357328176, "learning_rate": 7.999875022371686e-06, "loss": -0.0861, "num_tokens": 14948687.0, "reward": 0.2985207736492157, "reward_std": 0.7409701347351074, "rewards/rollout_reward_func/mean": 0.2985207736492157, "rewards/rollout_reward_func/std": 0.7409700751304626, "sampling/importance_sampling_ratio/max": 1.399825096130371, "sampling/importance_sampling_ratio/mean": 0.5102779269218445, "sampling/importance_sampling_ratio/min": 3.884350174843121e-09, "sampling/sampling_logp_difference/max": 2.2345948219299316, "sampling/sampling_logp_difference/mean": 0.4346444308757782, "step": 617, "step_time": 37.33400545900804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.606506012380123, "epoch": 0.00618, "grad_norm": 0.11683876812458038, "kl": 0.24182082153856754, "learning_rate": 7.99987459178886e-06, "loss": -0.0867, "step": 618, "step_time": 18.288682383979904 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.46875, "completions/mean_terminated_length": 5.045454502105713, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.251504100859165, "epoch": 0.00619, "frac_reward_zero_std": 0.0, "grad_norm": 0.06699223071336746, "kl": 0.4481767974793911, "learning_rate": 7.999874160465587e-06, "loss": -0.0916, "num_tokens": 15006242.0, "reward": 0.27944689989089966, "reward_std": 0.7864671945571899, "rewards/rollout_reward_func/mean": 0.27944689989089966, "rewards/rollout_reward_func/std": 0.7864671945571899, "sampling/importance_sampling_ratio/max": 1.4620355367660522, "sampling/importance_sampling_ratio/mean": 0.5384005308151245, "sampling/importance_sampling_ratio/min": 4.5836038964353065e-08, "sampling/sampling_logp_difference/max": 2.2138800621032715, "sampling/sampling_logp_difference/mean": 0.4322977066040039, "step": 619, "step_time": 28.24725354203838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.243686467409134, "epoch": 0.0062, "grad_norm": 0.0707361176609993, "kl": 0.5001559546217322, "learning_rate": 7.999873728401865e-06, "loss": -0.0914, "step": 620, "step_time": 14.601761551981326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.319999694824219, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9168898984789848, "epoch": 0.00621, "frac_reward_zero_std": 0.0, "grad_norm": 0.10004229843616486, "kl": 0.2839524429291487, "learning_rate": 7.999873295597694e-06, "loss": -0.0599, "num_tokens": 15056513.0, "reward": 0.5081830024719238, "reward_std": 0.8975529670715332, "rewards/rollout_reward_func/mean": 0.5081830024719238, "rewards/rollout_reward_func/std": 0.8975529670715332, "sampling/importance_sampling_ratio/max": 1.4471052885055542, "sampling/importance_sampling_ratio/mean": 0.7654093503952026, "sampling/importance_sampling_ratio/min": 1.1255545384614152e-09, "sampling/sampling_logp_difference/max": 2.3493154048919678, "sampling/sampling_logp_difference/mean": 0.33366650342941284, "step": 621, "step_time": 29.47965036702226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9072678238153458, "epoch": 0.00622, "grad_norm": 0.09753204137086868, "kl": 0.29956040903925896, "learning_rate": 7.999872862053077e-06, "loss": -0.0602, "step": 622, "step_time": 14.060049270017771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 4.217391490936279, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5859116343781352, "epoch": 0.00623, "frac_reward_zero_std": 0.25, "grad_norm": 0.05191798135638237, "kl": 0.25172504503279924, "learning_rate": 7.99987242776801e-06, "loss": -0.0935, "num_tokens": 15122586.0, "reward": 0.5740118026733398, "reward_std": 0.9090780019760132, "rewards/rollout_reward_func/mean": 0.5740118026733398, "rewards/rollout_reward_func/std": 0.9090780019760132, "sampling/importance_sampling_ratio/max": 1.7354180812835693, "sampling/importance_sampling_ratio/mean": 0.7940953373908997, "sampling/importance_sampling_ratio/min": 1.318250286708178e-09, "sampling/sampling_logp_difference/max": 2.0552496910095215, "sampling/sampling_logp_difference/mean": 0.36548686027526855, "step": 623, "step_time": 32.73870649095625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5755827678367496, "epoch": 0.00624, "grad_norm": 0.04656323045492172, "kl": 0.251214271876961, "learning_rate": 7.999871992742494e-06, "loss": -0.0938, "step": 624, "step_time": 16.358977720083203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.551723957061768, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9816789515316486, "epoch": 0.00625, "frac_reward_zero_std": 0.25, "grad_norm": 0.04811901971697807, "kl": 0.5964869149029255, "learning_rate": 7.999871556976533e-06, "loss": -0.0759, "num_tokens": 15168983.0, "reward": 1.2314578294754028, "reward_std": 0.5451591610908508, "rewards/rollout_reward_func/mean": 1.2314578294754028, "rewards/rollout_reward_func/std": 0.545159101486206, "sampling/importance_sampling_ratio/max": 1.5591706037521362, "sampling/importance_sampling_ratio/mean": 0.8947290778160095, "sampling/importance_sampling_ratio/min": 2.34899246720488e-08, "sampling/sampling_logp_difference/max": 2.3636281490325928, "sampling/sampling_logp_difference/mean": 0.29658135771751404, "step": 625, "step_time": 29.888806607981678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 0.9733163062483072, "epoch": 0.00626, "grad_norm": 0.05700458213686943, "kl": 0.670900471508503, "learning_rate": 7.999871120470122e-06, "loss": -0.0758, "step": 626, "step_time": 16.22053495197906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 4.5625, "completions/mean_terminated_length": 4.5625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5434870929457247, "epoch": 0.00627, "frac_reward_zero_std": 0.5, "grad_norm": 0.017695220187306404, "kl": 0.4499796908348799, "learning_rate": 7.999870683223264e-06, "loss": -0.0372, "num_tokens": 15221822.0, "reward": 1.186911940574646, "reward_std": 0.6125363111495972, "rewards/rollout_reward_func/mean": 1.186911940574646, "rewards/rollout_reward_func/std": 0.6125363111495972, "sampling/importance_sampling_ratio/max": 1.380379319190979, "sampling/importance_sampling_ratio/mean": 0.9930553436279297, "sampling/importance_sampling_ratio/min": 0.00042872579069808125, "sampling/sampling_logp_difference/max": 1.3939902782440186, "sampling/sampling_logp_difference/mean": 0.1439063400030136, "step": 627, "step_time": 22.9614029659715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5367982708849013, "epoch": 0.00628, "grad_norm": 0.016454173251986504, "kl": 0.43999665044248104, "learning_rate": 7.99987024523596e-06, "loss": -0.0372, "step": 628, "step_time": 12.792720487981569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 5.034482955932617, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.119451087899506, "epoch": 0.00629, "frac_reward_zero_std": 0.0, "grad_norm": 0.08694598078727722, "kl": 0.3838769532740116, "learning_rate": 7.999869806508206e-06, "loss": -0.0818, "num_tokens": 15271233.0, "reward": 0.4998767375946045, "reward_std": 0.81727534532547, "rewards/rollout_reward_func/mean": 0.4998767375946045, "rewards/rollout_reward_func/std": 0.8172754049301147, "sampling/importance_sampling_ratio/max": 1.1393731832504272, "sampling/importance_sampling_ratio/mean": 0.8247106671333313, "sampling/importance_sampling_ratio/min": 1.589041858096607e-05, "sampling/sampling_logp_difference/max": 2.4512476921081543, "sampling/sampling_logp_difference/mean": 0.2558549642562866, "step": 629, "step_time": 22.837038886063965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.113947325386107, "epoch": 0.0063, "grad_norm": 0.07801114767789841, "kl": 0.39510487765073776, "learning_rate": 7.999869367040006e-06, "loss": -0.0819, "step": 630, "step_time": 11.68024664602126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.724137783050537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1652370463125408, "epoch": 0.00631, "frac_reward_zero_std": 0.0, "grad_norm": 0.1192060336470604, "kl": 1.3451243937015533, "learning_rate": 7.999868926831357e-06, "loss": -0.0818, "num_tokens": 15323065.0, "reward": 0.9321688413619995, "reward_std": 0.7848160862922668, "rewards/rollout_reward_func/mean": 0.9321688413619995, "rewards/rollout_reward_func/std": 0.7848160862922668, "sampling/importance_sampling_ratio/max": 1.1654236316680908, "sampling/importance_sampling_ratio/mean": 0.7870608568191528, "sampling/importance_sampling_ratio/min": 2.7914362021874695e-07, "sampling/sampling_logp_difference/max": 2.7348692417144775, "sampling/sampling_logp_difference/mean": 0.26531344652175903, "step": 631, "step_time": 27.554323893011315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.160417846404016, "epoch": 0.00632, "grad_norm": 0.10997090488672256, "kl": 1.182968644425273, "learning_rate": 7.999868485882263e-06, "loss": -0.0822, "step": 632, "step_time": 15.306508609995944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2626909986138344, "epoch": 0.00633, "frac_reward_zero_std": 0.25, "grad_norm": 0.06487908959388733, "kl": 0.4665615763515234, "learning_rate": 7.999868044192721e-06, "loss": -0.0602, "num_tokens": 15381634.0, "reward": 0.5388630628585815, "reward_std": 0.7927513718605042, "rewards/rollout_reward_func/mean": 0.5388630628585815, "rewards/rollout_reward_func/std": 0.7927514314651489, "sampling/importance_sampling_ratio/max": 1.2457621097564697, "sampling/importance_sampling_ratio/mean": 0.8425557017326355, "sampling/importance_sampling_ratio/min": 1.928312940435717e-06, "sampling/sampling_logp_difference/max": 1.877756118774414, "sampling/sampling_logp_difference/mean": 0.24455131590366364, "step": 633, "step_time": 29.976955214020563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2618493735790253, "epoch": 0.00634, "grad_norm": 0.06539298593997955, "kl": 0.5060365907847881, "learning_rate": 7.999867601762732e-06, "loss": -0.0602, "step": 634, "step_time": 15.807123237929773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.239999771118164, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5845728619024158, "epoch": 0.00635, "frac_reward_zero_std": 0.0, "grad_norm": 0.11966641247272491, "kl": 0.7196854632347822, "learning_rate": 7.999867158592297e-06, "loss": -0.0516, "num_tokens": 15439412.0, "reward": 0.06594428420066833, "reward_std": 0.651947557926178, "rewards/rollout_reward_func/mean": 0.06594428420066833, "rewards/rollout_reward_func/std": 0.6519474983215332, "sampling/importance_sampling_ratio/max": 1.3145928382873535, "sampling/importance_sampling_ratio/mean": 0.7023531794548035, "sampling/importance_sampling_ratio/min": 7.255997452187968e-12, "sampling/sampling_logp_difference/max": 2.3315305709838867, "sampling/sampling_logp_difference/mean": 0.3494740426540375, "step": 635, "step_time": 31.30153951406828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5791743099689484, "epoch": 0.00636, "grad_norm": 0.12062124907970428, "kl": 0.6903574364259839, "learning_rate": 7.999866714681415e-06, "loss": -0.052, "step": 636, "step_time": 16.04656014899956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.427160419523716, "epoch": 0.00637, "frac_reward_zero_std": 0.0, "grad_norm": 0.07833511382341385, "kl": 0.7116500176489353, "learning_rate": 7.999866270030085e-06, "loss": -0.0769, "num_tokens": 15500784.0, "reward": 0.5633800029754639, "reward_std": 0.8134323358535767, "rewards/rollout_reward_func/mean": 0.5633800029754639, "rewards/rollout_reward_func/std": 0.8134322762489319, "sampling/importance_sampling_ratio/max": 1.375873327255249, "sampling/importance_sampling_ratio/mean": 0.7447874546051025, "sampling/importance_sampling_ratio/min": 0.0008644886547699571, "sampling/sampling_logp_difference/max": 1.8595154285430908, "sampling/sampling_logp_difference/mean": 0.2856992185115814, "step": 637, "step_time": 32.85159942001337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4188774917274714, "epoch": 0.00638, "grad_norm": 0.12329093366861343, "kl": 0.715512877330184, "learning_rate": 7.999865824638311e-06, "loss": -0.0768, "step": 638, "step_time": 16.93038847402204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8822582853026688, "epoch": 0.00639, "frac_reward_zero_std": 0.0, "grad_norm": 0.28999656438827515, "kl": 2.1874721329659224, "learning_rate": 7.999865378506088e-06, "loss": -0.0818, "num_tokens": 15557085.0, "reward": 0.6316165328025818, "reward_std": 0.789236307144165, "rewards/rollout_reward_func/mean": 0.6316165328025818, "rewards/rollout_reward_func/std": 0.789236307144165, "sampling/importance_sampling_ratio/max": 1.629631519317627, "sampling/importance_sampling_ratio/mean": 0.7995069026947021, "sampling/importance_sampling_ratio/min": 0.0006060729501768947, "sampling/sampling_logp_difference/max": 2.222135066986084, "sampling/sampling_logp_difference/mean": 0.23993650078773499, "step": 639, "step_time": 30.080124666012125 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.8829745412804186, "epoch": 0.0064, "grad_norm": 0.1394345611333847, "kl": 1.697444923222065, "learning_rate": 7.999864931633422e-06, "loss": -0.0836, "step": 640, "step_time": 16.12372956602485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.46875, "completions/mean_terminated_length": 4.096774101257324, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5560626611113548, "epoch": 0.00641, "frac_reward_zero_std": 0.5, "grad_norm": 0.08186359703540802, "kl": 0.6131113078445196, "learning_rate": 7.999864484020307e-06, "loss": -0.0217, "num_tokens": 15608815.0, "reward": 0.9479614496231079, "reward_std": 0.5715668797492981, "rewards/rollout_reward_func/mean": 0.9479614496231079, "rewards/rollout_reward_func/std": 0.5715668797492981, "sampling/importance_sampling_ratio/max": 1.714598536491394, "sampling/importance_sampling_ratio/mean": 0.9553072452545166, "sampling/importance_sampling_ratio/min": 4.9498930820846e-06, "sampling/sampling_logp_difference/max": 1.7571953535079956, "sampling/sampling_logp_difference/mean": 0.149858757853508, "step": 641, "step_time": 28.410721115011256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5594187211245298, "epoch": 0.00642, "grad_norm": 0.07159034162759781, "kl": 0.5824175141751766, "learning_rate": 7.999864035666747e-06, "loss": -0.0217, "step": 642, "step_time": 15.86891047997051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6775290863588452, "epoch": 0.00643, "frac_reward_zero_std": 0.25, "grad_norm": 0.06740476936101913, "kl": 0.33430699072778225, "learning_rate": 7.999863586572742e-06, "loss": -0.0481, "num_tokens": 15651111.0, "reward": 1.005631446838379, "reward_std": 0.8372712731361389, "rewards/rollout_reward_func/mean": 1.005631446838379, "rewards/rollout_reward_func/std": 0.8372713327407837, "sampling/importance_sampling_ratio/max": 1.2425886392593384, "sampling/importance_sampling_ratio/mean": 0.96675705909729, "sampling/importance_sampling_ratio/min": 4.6083172833277786e-07, "sampling/sampling_logp_difference/max": 2.5858099460601807, "sampling/sampling_logp_difference/mean": 0.1791539490222931, "step": 643, "step_time": 20.48568344800151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6868952456861734, "epoch": 0.00644, "grad_norm": 0.06991592794656754, "kl": 0.31356436014175415, "learning_rate": 7.99986313673829e-06, "loss": -0.0483, "step": 644, "step_time": 12.094543879036792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 5.193548202514648, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3567134961485863, "epoch": 0.00645, "frac_reward_zero_std": 0.0, "grad_norm": 0.1577313393354416, "kl": 0.40785571187734604, "learning_rate": 7.999862686163393e-06, "loss": -0.058, "num_tokens": 15710007.0, "reward": 0.5255494117736816, "reward_std": 0.8310731649398804, "rewards/rollout_reward_func/mean": 0.5255494117736816, "rewards/rollout_reward_func/std": 0.8310731053352356, "sampling/importance_sampling_ratio/max": 1.414715051651001, "sampling/importance_sampling_ratio/mean": 0.881436824798584, "sampling/importance_sampling_ratio/min": 1.98527686734451e-06, "sampling/sampling_logp_difference/max": 2.410433292388916, "sampling/sampling_logp_difference/mean": 0.31577110290527344, "step": 645, "step_time": 26.598038219992304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.3752035070210695, "epoch": 0.00646, "grad_norm": 0.1716381311416626, "kl": 0.3833606243133545, "learning_rate": 7.999862234848049e-06, "loss": -0.0585, "step": 646, "step_time": 14.188705161010148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2311323452740908, "epoch": 0.00647, "frac_reward_zero_std": 0.0, "grad_norm": 0.12015901505947113, "kl": 1.2178875245153904, "learning_rate": 7.99986178279226e-06, "loss": -0.07, "num_tokens": 15757134.0, "reward": 0.7907242774963379, "reward_std": 0.7815701365470886, "rewards/rollout_reward_func/mean": 0.7907242774963379, "rewards/rollout_reward_func/std": 0.7815701365470886, "sampling/importance_sampling_ratio/max": 1.2261581420898438, "sampling/importance_sampling_ratio/mean": 0.7870984077453613, "sampling/importance_sampling_ratio/min": 2.9603723305626772e-05, "sampling/sampling_logp_difference/max": 1.9619171619415283, "sampling/sampling_logp_difference/mean": 0.31184011697769165, "step": 647, "step_time": 20.473197444021935 }, { "clip_ratio/high_max": 0.017769608180969954, "clip_ratio/high_mean": 0.008884804090484977, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008884804090484977, "entropy": 1.2461556419730186, "epoch": 0.00648, "grad_norm": 0.08978547900915146, "kl": 0.9733599293977022, "learning_rate": 7.999861329996028e-06, "loss": -0.0707, "step": 648, "step_time": 11.032882592000533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.71999979019165, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7540506571531296, "epoch": 0.00649, "frac_reward_zero_std": 0.0, "grad_norm": 0.0489591546356678, "kl": 0.5619189459830523, "learning_rate": 7.999860876459348e-06, "loss": -0.0879, "num_tokens": 15816352.0, "reward": 0.5961824655532837, "reward_std": 0.9324072599411011, "rewards/rollout_reward_func/mean": 0.5961824655532837, "rewards/rollout_reward_func/std": 0.9324072599411011, "sampling/importance_sampling_ratio/max": 1.4407435655593872, "sampling/importance_sampling_ratio/mean": 0.7378042936325073, "sampling/importance_sampling_ratio/min": 6.544941300035134e-08, "sampling/sampling_logp_difference/max": 2.0909671783447266, "sampling/sampling_logp_difference/mean": 0.35894981026649475, "step": 649, "step_time": 29.173159391008085 }, { "clip_ratio/high_max": 0.004999999888241291, "clip_ratio/high_mean": 0.0024999999441206455, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "entropy": 1.7606255412101746, "epoch": 0.0065, "grad_norm": 0.03588792309165001, "kl": 0.5105183459818363, "learning_rate": 7.999860422182224e-06, "loss": -0.0881, "step": 650, "step_time": 14.370311212987872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.34375, "completions/mean_terminated_length": 4.863636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0260403603315353, "epoch": 0.00651, "frac_reward_zero_std": 0.0, "grad_norm": 0.18679381906986237, "kl": 0.3149622529745102, "learning_rate": 7.999859967164654e-06, "loss": -0.0547, "num_tokens": 15869107.0, "reward": 0.3213867247104645, "reward_std": 0.9631365537643433, "rewards/rollout_reward_func/mean": 0.3213867247104645, "rewards/rollout_reward_func/std": 0.9631365537643433, "sampling/importance_sampling_ratio/max": 1.4474247694015503, "sampling/importance_sampling_ratio/mean": 0.6429899334907532, "sampling/importance_sampling_ratio/min": 2.771118943201145e-07, "sampling/sampling_logp_difference/max": 2.576960802078247, "sampling/sampling_logp_difference/mean": 0.41726696491241455, "step": 651, "step_time": 27.17923462303588 }, { "clip_ratio/high_max": 0.012834821827709675, "clip_ratio/high_mean": 0.00954241119325161, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00954241119325161, "entropy": 2.0430982932448387, "epoch": 0.00652, "grad_norm": 0.1889868825674057, "kl": 0.2734419805929065, "learning_rate": 7.99985951140664e-06, "loss": -0.0557, "step": 652, "step_time": 12.262358080013655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2195270750671625, "epoch": 0.00653, "frac_reward_zero_std": 0.0, "grad_norm": 0.10074726492166519, "kl": 0.4005127106793225, "learning_rate": 7.999859054908181e-06, "loss": -0.0996, "num_tokens": 15930942.0, "reward": 0.2299545407295227, "reward_std": 0.8095769286155701, "rewards/rollout_reward_func/mean": 0.2299545407295227, "rewards/rollout_reward_func/std": 0.8095768690109253, "sampling/importance_sampling_ratio/max": 1.392885684967041, "sampling/importance_sampling_ratio/mean": 0.6583304405212402, "sampling/importance_sampling_ratio/min": 5.030583452025894e-07, "sampling/sampling_logp_difference/max": 2.0753440856933594, "sampling/sampling_logp_difference/mean": 0.4064157009124756, "step": 653, "step_time": 28.76132393296575 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 2.2348747923970222, "epoch": 0.00654, "grad_norm": 0.09965783357620239, "kl": 0.3647707784548402, "learning_rate": 7.999858597669278e-06, "loss": -0.0994, "step": 654, "step_time": 14.032447421981487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.3913044929504395, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6177211329340935, "epoch": 0.00655, "frac_reward_zero_std": 0.0, "grad_norm": 0.1225324496626854, "kl": 0.19061079621315002, "learning_rate": 7.99985813968993e-06, "loss": -0.0924, "num_tokens": 15988030.0, "reward": 0.17110836505889893, "reward_std": 0.8962276577949524, "rewards/rollout_reward_func/mean": 0.17110836505889893, "rewards/rollout_reward_func/std": 0.8962276577949524, "sampling/importance_sampling_ratio/max": 1.5202888250350952, "sampling/importance_sampling_ratio/mean": 0.5026887655258179, "sampling/importance_sampling_ratio/min": 7.456653960957738e-09, "sampling/sampling_logp_difference/max": 2.458770990371704, "sampling/sampling_logp_difference/mean": 0.48206785321235657, "step": 655, "step_time": 33.077106042939704 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.010937500046566129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013913690578192472, "entropy": 2.627207189798355, "epoch": 0.00656, "grad_norm": 0.10209153592586517, "kl": 0.18376676551997662, "learning_rate": 7.999857680970137e-06, "loss": -0.0924, "step": 656, "step_time": 16.43830979600898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.035714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3284261636435986, "epoch": 0.00657, "frac_reward_zero_std": 0.25, "grad_norm": 0.04447246342897415, "kl": 0.41876995004713535, "learning_rate": 7.9998572215099e-06, "loss": -0.062, "num_tokens": 16033203.0, "reward": 0.9679161310195923, "reward_std": 0.7052366733551025, "rewards/rollout_reward_func/mean": 0.9679161310195923, "rewards/rollout_reward_func/std": 0.7052366733551025, "sampling/importance_sampling_ratio/max": 1.411274790763855, "sampling/importance_sampling_ratio/mean": 0.9025020599365234, "sampling/importance_sampling_ratio/min": 4.3619586165277724e-08, "sampling/sampling_logp_difference/max": 2.244035005569458, "sampling/sampling_logp_difference/mean": 0.3030428886413574, "step": 657, "step_time": 22.82299532397883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3378444537520409, "epoch": 0.00658, "grad_norm": 0.04818740114569664, "kl": 0.40608376637101173, "learning_rate": 7.99985676130922e-06, "loss": -0.0622, "step": 658, "step_time": 12.643697878957028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.807692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6711012572050095, "epoch": 0.00659, "frac_reward_zero_std": 0.0, "grad_norm": 0.10917739570140839, "kl": 0.3459614934399724, "learning_rate": 7.999856300368094e-06, "loss": -0.1056, "num_tokens": 16080780.0, "reward": 0.27761247754096985, "reward_std": 0.86508709192276, "rewards/rollout_reward_func/mean": 0.27761247754096985, "rewards/rollout_reward_func/std": 0.86508709192276, "sampling/importance_sampling_ratio/max": 1.6136149168014526, "sampling/importance_sampling_ratio/mean": 0.7114683389663696, "sampling/importance_sampling_ratio/min": 2.4608909370726906e-05, "sampling/sampling_logp_difference/max": 1.6698262691497803, "sampling/sampling_logp_difference/mean": 0.2963225841522217, "step": 659, "step_time": 24.354788087046472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6769345700740814, "epoch": 0.0066, "grad_norm": 0.11083339154720306, "kl": 0.34669195767492056, "learning_rate": 7.999855838686525e-06, "loss": -0.106, "step": 660, "step_time": 12.673503018973861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.15625, "completions/mean_terminated_length": 5.050000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.605195552110672, "epoch": 0.00661, "frac_reward_zero_std": 0.0, "grad_norm": 0.11618717014789581, "kl": 0.26347159780561924, "learning_rate": 7.999855376264513e-06, "loss": -0.0976, "num_tokens": 16148949.0, "reward": 0.23403650522232056, "reward_std": 0.8216521739959717, "rewards/rollout_reward_func/mean": 0.23403650522232056, "rewards/rollout_reward_func/std": 0.8216521739959717, "sampling/importance_sampling_ratio/max": 2.1007871627807617, "sampling/importance_sampling_ratio/mean": 0.6060469746589661, "sampling/importance_sampling_ratio/min": 2.5974888906432625e-09, "sampling/sampling_logp_difference/max": 2.360060691833496, "sampling/sampling_logp_difference/mean": 0.4469453990459442, "step": 661, "step_time": 31.075030540960142 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 2.594911776483059, "epoch": 0.00662, "grad_norm": 0.08661012351512909, "kl": 0.2662023175507784, "learning_rate": 7.999854913102054e-06, "loss": -0.0984, "step": 662, "step_time": 15.347410472924821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.4375, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.711062505841255, "epoch": 0.00663, "frac_reward_zero_std": 0.0, "grad_norm": 0.2893345355987549, "kl": 0.16938892984762788, "learning_rate": 7.999854449199155e-06, "loss": -0.1312, "num_tokens": 16215826.0, "reward": 0.1603858768939972, "reward_std": 0.8055580258369446, "rewards/rollout_reward_func/mean": 0.1603858768939972, "rewards/rollout_reward_func/std": 0.8055580258369446, "sampling/importance_sampling_ratio/max": 1.6147619485855103, "sampling/importance_sampling_ratio/mean": 0.596784234046936, "sampling/importance_sampling_ratio/min": 1.858524889541968e-08, "sampling/sampling_logp_difference/max": 2.1560518741607666, "sampling/sampling_logp_difference/mean": 0.4653646647930145, "step": 663, "step_time": 32.6154831139429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 2.670554056763649, "epoch": 0.00664, "grad_norm": 0.2522004544734955, "kl": 0.1808503190986812, "learning_rate": 7.99985398455581e-06, "loss": -0.1335, "step": 664, "step_time": 14.745935959974304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9282611161470413, "epoch": 0.00665, "frac_reward_zero_std": 0.25, "grad_norm": 0.08822242170572281, "kl": 0.5137783922255039, "learning_rate": 7.999853519172022e-06, "loss": -0.0846, "num_tokens": 16266631.0, "reward": 0.5491922497749329, "reward_std": 0.8688389658927917, "rewards/rollout_reward_func/mean": 0.5491922497749329, "rewards/rollout_reward_func/std": 0.8688389658927917, "sampling/importance_sampling_ratio/max": 1.7197469472885132, "sampling/importance_sampling_ratio/mean": 0.7742269039154053, "sampling/importance_sampling_ratio/min": 3.227098102631665e-11, "sampling/sampling_logp_difference/max": 2.6386525630950928, "sampling/sampling_logp_difference/mean": 0.4766632914543152, "step": 665, "step_time": 26.13170593001996 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.009375000139698386, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017187499906867743, "entropy": 1.8814720697700977, "epoch": 0.00666, "grad_norm": 0.07852089405059814, "kl": 0.5926050869747996, "learning_rate": 7.999853053047792e-06, "loss": -0.0852, "step": 666, "step_time": 12.729076618968975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8957498390227556, "epoch": 0.00667, "frac_reward_zero_std": 0.0, "grad_norm": 0.0750134289264679, "kl": 0.39135834388434887, "learning_rate": 7.999852586183117e-06, "loss": -0.0708, "num_tokens": 16321313.0, "reward": 0.6949410438537598, "reward_std": 0.8000962734222412, "rewards/rollout_reward_func/mean": 0.6949410438537598, "rewards/rollout_reward_func/std": 0.800096333026886, "sampling/importance_sampling_ratio/max": 1.379580020904541, "sampling/importance_sampling_ratio/mean": 0.9353280663490295, "sampling/importance_sampling_ratio/min": 9.197055078402627e-06, "sampling/sampling_logp_difference/max": 1.9829065799713135, "sampling/sampling_logp_difference/mean": 0.21051552891731262, "step": 667, "step_time": 24.0845068419585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8844382353127003, "epoch": 0.00668, "grad_norm": 0.08472084254026413, "kl": 0.450159152969718, "learning_rate": 7.999852118577999e-06, "loss": -0.0706, "step": 668, "step_time": 12.76782532694051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.974433172494173, "epoch": 0.00669, "frac_reward_zero_std": 0.25, "grad_norm": 0.10093717277050018, "kl": 0.6527474261820316, "learning_rate": 7.99985165023244e-06, "loss": -0.0593, "num_tokens": 16371049.0, "reward": 0.9597532749176025, "reward_std": 0.6644648313522339, "rewards/rollout_reward_func/mean": 0.9597532749176025, "rewards/rollout_reward_func/std": 0.6644648313522339, "sampling/importance_sampling_ratio/max": 1.4488009214401245, "sampling/importance_sampling_ratio/mean": 0.8604036569595337, "sampling/importance_sampling_ratio/min": 9.124873656674026e-08, "sampling/sampling_logp_difference/max": 2.194730520248413, "sampling/sampling_logp_difference/mean": 0.25206658244132996, "step": 669, "step_time": 22.412816584052052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014574195956811309, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014574195956811309, "entropy": 0.9569903407245874, "epoch": 0.0067, "grad_norm": 0.1349961757659912, "kl": 0.8026696518063545, "learning_rate": 7.999851181146436e-06, "loss": -0.0588, "step": 670, "step_time": 12.956464807037264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9126049103215337, "epoch": 0.00671, "frac_reward_zero_std": 0.25, "grad_norm": 0.08250818401575089, "kl": 0.7802911587059498, "learning_rate": 7.999850711319991e-06, "loss": -0.0548, "num_tokens": 16419511.0, "reward": 0.34482741355895996, "reward_std": 0.7559270858764648, "rewards/rollout_reward_func/mean": 0.34482741355895996, "rewards/rollout_reward_func/std": 0.7559270858764648, "sampling/importance_sampling_ratio/max": 1.3208973407745361, "sampling/importance_sampling_ratio/mean": 0.9216852188110352, "sampling/importance_sampling_ratio/min": 1.337667185907776e-07, "sampling/sampling_logp_difference/max": 2.2013227939605713, "sampling/sampling_logp_difference/mean": 0.304974764585495, "step": 671, "step_time": 24.456468417978613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9159133648499846, "epoch": 0.00672, "grad_norm": 0.09054658561944962, "kl": 0.8355386294424534, "learning_rate": 7.999850240753102e-06, "loss": -0.0547, "step": 672, "step_time": 13.210877958947094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.116858066059649, "epoch": 0.00673, "frac_reward_zero_std": 0.0, "grad_norm": 0.05130657181143761, "kl": 0.3184902146458626, "learning_rate": 7.999849769445771e-06, "loss": -0.0721, "num_tokens": 16476949.0, "reward": 0.47308242321014404, "reward_std": 0.8511734008789062, "rewards/rollout_reward_func/mean": 0.47308242321014404, "rewards/rollout_reward_func/std": 0.8511733412742615, "sampling/importance_sampling_ratio/max": 1.42823326587677, "sampling/importance_sampling_ratio/mean": 0.8239655494689941, "sampling/importance_sampling_ratio/min": 0.0008079196559265256, "sampling/sampling_logp_difference/max": 1.3834691047668457, "sampling/sampling_logp_difference/mean": 0.22052696347236633, "step": 673, "step_time": 27.401938986004097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1164339715614915, "epoch": 0.00674, "grad_norm": 0.05179581418633461, "kl": 0.3117409199476242, "learning_rate": 7.999849297398e-06, "loss": -0.0721, "step": 674, "step_time": 14.084905639028875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.392857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1118240375071764, "epoch": 0.00675, "frac_reward_zero_std": 0.0, "grad_norm": 0.14051836729049683, "kl": 0.7225972283631563, "learning_rate": 7.999848824609783e-06, "loss": -0.0762, "num_tokens": 16532103.0, "reward": 0.40064480900764465, "reward_std": 0.8494378328323364, "rewards/rollout_reward_func/mean": 0.40064480900764465, "rewards/rollout_reward_func/std": 0.8494377732276917, "sampling/importance_sampling_ratio/max": 1.2784162759780884, "sampling/importance_sampling_ratio/mean": 0.8305343985557556, "sampling/importance_sampling_ratio/min": 0.00010892933642026037, "sampling/sampling_logp_difference/max": 4.4852705001831055, "sampling/sampling_logp_difference/mean": 0.22652672231197357, "step": 675, "step_time": 29.37624298396986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1001190152019262, "epoch": 0.00676, "grad_norm": 0.11919993162155151, "kl": 0.581069327890873, "learning_rate": 7.999848351081125e-06, "loss": -0.0766, "step": 676, "step_time": 14.671327414020197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.42082350538112223, "epoch": 0.00677, "frac_reward_zero_std": 0.0, "grad_norm": 0.01919855736196041, "kl": 0.43685397505760193, "learning_rate": 7.999847876812025e-06, "loss": -0.0459, "num_tokens": 16581984.0, "reward": 1.0762274265289307, "reward_std": 0.6827955842018127, "rewards/rollout_reward_func/mean": 1.0762274265289307, "rewards/rollout_reward_func/std": 0.6827956438064575, "sampling/importance_sampling_ratio/max": 1.2395801544189453, "sampling/importance_sampling_ratio/mean": 0.9535166025161743, "sampling/importance_sampling_ratio/min": 0.0006866353796795011, "sampling/sampling_logp_difference/max": 1.8901762962341309, "sampling/sampling_logp_difference/mean": 0.0992451161146164, "step": 677, "step_time": 21.706083015043987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42140294215641916, "epoch": 0.00678, "grad_norm": 0.018996937200427055, "kl": 0.4366785064339638, "learning_rate": 7.999847401802484e-06, "loss": -0.0459, "step": 678, "step_time": 12.752512292994652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.317972426302731, "epoch": 0.00679, "frac_reward_zero_std": 0.0, "grad_norm": 0.1194215714931488, "kl": 0.4965007845312357, "learning_rate": 7.9998469260525e-06, "loss": -0.0542, "num_tokens": 16639504.0, "reward": 0.5129151940345764, "reward_std": 0.8456333875656128, "rewards/rollout_reward_func/mean": 0.5129151940345764, "rewards/rollout_reward_func/std": 0.845633327960968, "sampling/importance_sampling_ratio/max": 1.3647677898406982, "sampling/importance_sampling_ratio/mean": 0.7689745426177979, "sampling/importance_sampling_ratio/min": 3.167561226291582e-05, "sampling/sampling_logp_difference/max": 2.216078758239746, "sampling/sampling_logp_difference/mean": 0.2950056791305542, "step": 679, "step_time": 26.393982039007824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3160137394443154, "epoch": 0.0068, "grad_norm": 0.11336199939250946, "kl": 0.49804189056158066, "learning_rate": 7.999846449562074e-06, "loss": -0.0544, "step": 680, "step_time": 12.926566325011663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0606374982744455, "epoch": 0.00681, "frac_reward_zero_std": 0.25, "grad_norm": 0.08208000659942627, "kl": 0.6466356813907623, "learning_rate": 7.999845972331208e-06, "loss": -0.0706, "num_tokens": 16691699.0, "reward": 0.9269440770149231, "reward_std": 0.6838842630386353, "rewards/rollout_reward_func/mean": 0.9269440770149231, "rewards/rollout_reward_func/std": 0.6838842630386353, "sampling/importance_sampling_ratio/max": 1.175836205482483, "sampling/importance_sampling_ratio/mean": 0.8221468925476074, "sampling/importance_sampling_ratio/min": 9.81895641183339e-12, "sampling/sampling_logp_difference/max": 2.4897139072418213, "sampling/sampling_logp_difference/mean": 0.3215855062007904, "step": 681, "step_time": 25.901565206964733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0448696482926607, "epoch": 0.00682, "grad_norm": 0.07612679153680801, "kl": 0.6178215648978949, "learning_rate": 7.9998454943599e-06, "loss": -0.0708, "step": 682, "step_time": 13.45928030606592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.03125, "completions/mean_terminated_length": 4.03125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.40574912168085575, "epoch": 0.00683, "frac_reward_zero_std": 0.25, "grad_norm": 0.03913900628685951, "kl": 1.0200732499361038, "learning_rate": 7.99984501564815e-06, "loss": -0.0315, "num_tokens": 16743769.0, "reward": 0.9729364514350891, "reward_std": 0.6350643038749695, "rewards/rollout_reward_func/mean": 0.9729364514350891, "rewards/rollout_reward_func/std": 0.6350643038749695, "sampling/importance_sampling_ratio/max": 1.1042734384536743, "sampling/importance_sampling_ratio/mean": 0.9256908893585205, "sampling/importance_sampling_ratio/min": 0.03040948323905468, "sampling/sampling_logp_difference/max": 1.5896036624908447, "sampling/sampling_logp_difference/mean": 0.09372476488351822, "step": 683, "step_time": 23.134280884987675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40550021501258016, "epoch": 0.00684, "grad_norm": 0.038189131766557693, "kl": 0.9889646098017693, "learning_rate": 7.999844536195958e-06, "loss": -0.0315, "step": 684, "step_time": 13.281481306970818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.034482955932617, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.683674976695329, "epoch": 0.00685, "frac_reward_zero_std": 0.25, "grad_norm": 0.12065216153860092, "kl": 0.36187837831676006, "learning_rate": 7.999844056003326e-06, "loss": -0.0382, "num_tokens": 16788384.0, "reward": 1.1235156059265137, "reward_std": 0.7021518349647522, "rewards/rollout_reward_func/mean": 1.1235156059265137, "rewards/rollout_reward_func/std": 0.7021518349647522, "sampling/importance_sampling_ratio/max": 1.12598717212677, "sampling/importance_sampling_ratio/mean": 0.9022097587585449, "sampling/importance_sampling_ratio/min": 0.0009522580076009035, "sampling/sampling_logp_difference/max": 1.4128135442733765, "sampling/sampling_logp_difference/mean": 0.13317662477493286, "step": 685, "step_time": 20.42450988406199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6698616659268737, "epoch": 0.00686, "grad_norm": 0.09469857066869736, "kl": 0.3800843209028244, "learning_rate": 7.999843575070253e-06, "loss": -0.0385, "step": 686, "step_time": 10.265718329988886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.524331970140338, "epoch": 0.00687, "frac_reward_zero_std": 0.0, "grad_norm": 0.0886879488825798, "kl": 0.5062249656766653, "learning_rate": 7.99984309339674e-06, "loss": -0.0839, "num_tokens": 16844152.0, "reward": 0.49711093306541443, "reward_std": 0.7971622347831726, "rewards/rollout_reward_func/mean": 0.49711093306541443, "rewards/rollout_reward_func/std": 0.7971622347831726, "sampling/importance_sampling_ratio/max": 1.1738146543502808, "sampling/importance_sampling_ratio/mean": 0.7813136577606201, "sampling/importance_sampling_ratio/min": 2.865326109713351e-07, "sampling/sampling_logp_difference/max": 2.6295132637023926, "sampling/sampling_logp_difference/mean": 0.326600044965744, "step": 687, "step_time": 25.860483698983444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5234034238383174, "epoch": 0.00688, "grad_norm": 0.0737459659576416, "kl": 0.49462976306676865, "learning_rate": 7.999842610982785e-06, "loss": -0.0843, "step": 688, "step_time": 12.86420573995565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.433333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9479334875941277, "epoch": 0.00689, "frac_reward_zero_std": 0.25, "grad_norm": 0.4992721378803253, "kl": 0.2820012476295233, "learning_rate": 7.99984212782839e-06, "loss": -0.0442, "num_tokens": 16897997.0, "reward": 0.5705148577690125, "reward_std": 0.8918679356575012, "rewards/rollout_reward_func/mean": 0.5705148577690125, "rewards/rollout_reward_func/std": 0.8918678760528564, "sampling/importance_sampling_ratio/max": 1.415427803993225, "sampling/importance_sampling_ratio/mean": 0.850163459777832, "sampling/importance_sampling_ratio/min": 0.00014601010479964316, "sampling/sampling_logp_difference/max": 2.0161256790161133, "sampling/sampling_logp_difference/mean": 0.1767454296350479, "step": 689, "step_time": 25.62768579501426 }, { "clip_ratio/high_max": 0.025252525694668293, "clip_ratio/high_mean": 0.012626262847334146, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012626262847334146, "entropy": 0.9492634106427431, "epoch": 0.0069, "grad_norm": 0.2824453115463257, "kl": 0.2825144473463297, "learning_rate": 7.999841643933555e-06, "loss": -0.0462, "step": 690, "step_time": 13.581969351042062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.5806450843811035, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7634833343327045, "epoch": 0.00691, "frac_reward_zero_std": 0.5, "grad_norm": 0.3003966510295868, "kl": 0.43199916929006577, "learning_rate": 7.99984115929828e-06, "loss": -0.0371, "num_tokens": 16944914.0, "reward": 1.0322060585021973, "reward_std": 0.7196837067604065, "rewards/rollout_reward_func/mean": 1.0322060585021973, "rewards/rollout_reward_func/std": 0.7196837067604065, "sampling/importance_sampling_ratio/max": 1.49906587600708, "sampling/importance_sampling_ratio/mean": 0.9244840145111084, "sampling/importance_sampling_ratio/min": 2.112158836098388e-05, "sampling/sampling_logp_difference/max": 2.299588918685913, "sampling/sampling_logp_difference/mean": 0.17863425612449646, "step": 691, "step_time": 23.128855120012304 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "entropy": 0.779647720977664, "epoch": 0.00692, "grad_norm": 0.14842689037322998, "kl": 0.4425934609025717, "learning_rate": 7.999840673922561e-06, "loss": -0.0377, "step": 692, "step_time": 12.588138774997788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1120873484760523, "epoch": 0.00693, "frac_reward_zero_std": 0.0, "grad_norm": 0.18097777664661407, "kl": 0.2688388470560312, "learning_rate": 7.999840187806405e-06, "loss": -0.0579, "num_tokens": 17006912.0, "reward": 0.7100441455841064, "reward_std": 0.8514037132263184, "rewards/rollout_reward_func/mean": 0.7100441455841064, "rewards/rollout_reward_func/std": 0.8514037132263184, "sampling/importance_sampling_ratio/max": 1.2678505182266235, "sampling/importance_sampling_ratio/mean": 0.8120790719985962, "sampling/importance_sampling_ratio/min": 2.3177423713605094e-07, "sampling/sampling_logp_difference/max": 2.463278293609619, "sampling/sampling_logp_difference/mean": 0.2715306878089905, "step": 693, "step_time": 31.71503152398509 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.013494318351149559, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02130681835114956, "entropy": 1.1094029061496258, "epoch": 0.00694, "grad_norm": 0.09700451791286469, "kl": 0.2635008189827204, "learning_rate": 7.999839700949809e-06, "loss": -0.0582, "step": 694, "step_time": 16.590854698006297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.600000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1354513075202703, "epoch": 0.00695, "frac_reward_zero_std": 0.0, "grad_norm": 0.6149717569351196, "kl": 0.30713952518999577, "learning_rate": 7.999839213352772e-06, "loss": -0.0097, "num_tokens": 17067760.0, "reward": 0.665107250213623, "reward_std": 0.7889302372932434, "rewards/rollout_reward_func/mean": 0.665107250213623, "rewards/rollout_reward_func/std": 0.7889301776885986, "sampling/importance_sampling_ratio/max": 1.4247158765792847, "sampling/importance_sampling_ratio/mean": 0.9466405510902405, "sampling/importance_sampling_ratio/min": 2.6224191707058253e-09, "sampling/sampling_logp_difference/max": 2.461578130722046, "sampling/sampling_logp_difference/mean": 0.31098473072052, "step": 695, "step_time": 28.556677525950363 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026988636702299118, "entropy": 1.1280987244099379, "epoch": 0.00696, "grad_norm": 0.13369891047477722, "kl": 0.38918427377939224, "learning_rate": 7.999838725015296e-06, "loss": -0.0127, "step": 696, "step_time": 15.956498503044713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.806451320648193, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1917379815131426, "epoch": 0.00697, "frac_reward_zero_std": 0.0, "grad_norm": 0.25460317730903625, "kl": 1.0161671750247478, "learning_rate": 7.99983823593738e-06, "loss": -0.062, "num_tokens": 17128389.0, "reward": 0.46748411655426025, "reward_std": 0.6911622881889343, "rewards/rollout_reward_func/mean": 0.46748411655426025, "rewards/rollout_reward_func/std": 0.6911622881889343, "sampling/importance_sampling_ratio/max": 2.2730002403259277, "sampling/importance_sampling_ratio/mean": 0.8317513465881348, "sampling/importance_sampling_ratio/min": 3.516951255733147e-05, "sampling/sampling_logp_difference/max": 2.0571138858795166, "sampling/sampling_logp_difference/mean": 0.27670031785964966, "step": 697, "step_time": 23.590383720060345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.195772785693407, "epoch": 0.00698, "grad_norm": 0.23633578419685364, "kl": 0.968022994697094, "learning_rate": 7.999837746119025e-06, "loss": -0.0615, "step": 698, "step_time": 13.431158639024943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.03125, "completions/mean_terminated_length": 4.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8631732929497957, "epoch": 0.00699, "frac_reward_zero_std": 0.5, "grad_norm": 0.06837975978851318, "kl": 0.34864419884979725, "learning_rate": 7.99983725556023e-06, "loss": -0.0399, "num_tokens": 17170445.0, "reward": 0.782562792301178, "reward_std": 0.7858588099479675, "rewards/rollout_reward_func/mean": 0.782562792301178, "rewards/rollout_reward_func/std": 0.7858587503433228, "sampling/importance_sampling_ratio/max": 1.2505557537078857, "sampling/importance_sampling_ratio/mean": 0.8949657678604126, "sampling/importance_sampling_ratio/min": 9.83338468358852e-05, "sampling/sampling_logp_difference/max": 2.2297935485839844, "sampling/sampling_logp_difference/mean": 0.19482752680778503, "step": 699, "step_time": 19.73808146303054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8610571241006255, "epoch": 0.007, "grad_norm": 0.06584636121988297, "kl": 0.3486955240368843, "learning_rate": 7.999836764260995e-06, "loss": -0.0401, "step": 700, "step_time": 10.411478704074398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5778624545782804, "epoch": 0.00701, "frac_reward_zero_std": 0.25, "grad_norm": 0.2565043866634369, "kl": 0.2633555615320802, "learning_rate": 7.999836272221323e-06, "loss": -0.0428, "num_tokens": 17230423.0, "reward": 0.2883751094341278, "reward_std": 0.6026931405067444, "rewards/rollout_reward_func/mean": 0.2883751094341278, "rewards/rollout_reward_func/std": 0.6026931405067444, "sampling/importance_sampling_ratio/max": 1.5404691696166992, "sampling/importance_sampling_ratio/mean": 0.8207747340202332, "sampling/importance_sampling_ratio/min": 1.0929560367856084e-08, "sampling/sampling_logp_difference/max": 2.1380105018615723, "sampling/sampling_logp_difference/mean": 0.3262360990047455, "step": 701, "step_time": 30.461225101025775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.6040121782571077, "epoch": 0.00702, "grad_norm": 0.11243487894535065, "kl": 0.2593758553266525, "learning_rate": 7.99983577944121e-06, "loss": -0.0433, "step": 702, "step_time": 16.02423612496932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3542472142726183, "epoch": 0.00703, "frac_reward_zero_std": 0.0, "grad_norm": 0.20372511446475983, "kl": 0.8007305394858122, "learning_rate": 7.999835285920659e-06, "loss": -0.067, "num_tokens": 17276445.0, "reward": 0.5702798962593079, "reward_std": 0.710990309715271, "rewards/rollout_reward_func/mean": 0.5702798962593079, "rewards/rollout_reward_func/std": 0.7109903693199158, "sampling/importance_sampling_ratio/max": 1.1525517702102661, "sampling/importance_sampling_ratio/mean": 0.7527578473091125, "sampling/importance_sampling_ratio/min": 3.458480080098525e-08, "sampling/sampling_logp_difference/max": 2.030240535736084, "sampling/sampling_logp_difference/mean": 0.34159010648727417, "step": 703, "step_time": 27.171608956035925 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.361516758799553, "epoch": 0.00704, "grad_norm": 0.15645882487297058, "kl": 0.802579153329134, "learning_rate": 7.999834791659668e-06, "loss": -0.0677, "step": 704, "step_time": 13.53613890692941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2560195997357368, "epoch": 0.00705, "frac_reward_zero_std": 0.25, "grad_norm": 0.26930853724479675, "kl": 0.23864234890788794, "learning_rate": 7.99983429665824e-06, "loss": -0.0464, "num_tokens": 17343940.0, "reward": 0.29956501722335815, "reward_std": 0.7249868512153625, "rewards/rollout_reward_func/mean": 0.29956501722335815, "rewards/rollout_reward_func/std": 0.7249868512153625, "sampling/importance_sampling_ratio/max": 1.4676932096481323, "sampling/importance_sampling_ratio/mean": 0.8402281403541565, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7442140579223633, "sampling/sampling_logp_difference/mean": 0.3087247610092163, "step": 705, "step_time": 36.05991669395007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2790224310010672, "epoch": 0.00706, "grad_norm": 0.17266947031021118, "kl": 0.24159460235387087, "learning_rate": 7.999833800916372e-06, "loss": -0.0475, "step": 706, "step_time": 19.875347466993844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.592592716217041, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4228655993938446, "epoch": 0.00707, "frac_reward_zero_std": 0.25, "grad_norm": 0.04623429849743843, "kl": 0.4700599769130349, "learning_rate": 7.999833304434066e-06, "loss": -0.0781, "num_tokens": 17405572.0, "reward": 0.6610938310623169, "reward_std": 0.8611040711402893, "rewards/rollout_reward_func/mean": 0.6610938310623169, "rewards/rollout_reward_func/std": 0.8611040115356445, "sampling/importance_sampling_ratio/max": 1.3540034294128418, "sampling/importance_sampling_ratio/mean": 0.8057668209075928, "sampling/importance_sampling_ratio/min": 0.00015475899272132665, "sampling/sampling_logp_difference/max": 1.9704347848892212, "sampling/sampling_logp_difference/mean": 0.24563172459602356, "step": 707, "step_time": 32.2447313000157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4227168122306466, "epoch": 0.00708, "grad_norm": 0.04437056556344032, "kl": 0.44792356342077255, "learning_rate": 7.999832807211321e-06, "loss": -0.0782, "step": 708, "step_time": 16.362355564022437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.46875, "completions/mean_terminated_length": 4.766666889190674, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0694370474666357, "epoch": 0.00709, "frac_reward_zero_std": 0.0, "grad_norm": 0.0890800729393959, "kl": 0.462872879114002, "learning_rate": 7.99983230924814e-06, "loss": -0.0562, "num_tokens": 17454741.0, "reward": 0.16454654932022095, "reward_std": 0.8119615912437439, "rewards/rollout_reward_func/mean": 0.16454654932022095, "rewards/rollout_reward_func/std": 0.8119615912437439, "sampling/importance_sampling_ratio/max": 1.2512425184249878, "sampling/importance_sampling_ratio/mean": 0.8640231490135193, "sampling/importance_sampling_ratio/min": 8.79619037732482e-05, "sampling/sampling_logp_difference/max": 1.969273328781128, "sampling/sampling_logp_difference/mean": 0.24501708149909973, "step": 709, "step_time": 27.618635427003028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.0803330754861236, "epoch": 0.0071, "grad_norm": 0.06100693717598915, "kl": 0.4548584697768092, "learning_rate": 7.999831810544519e-06, "loss": -0.0566, "step": 710, "step_time": 15.060658349015284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 4.59375, "completions/mean_terminated_length": 4.22580623626709, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5642439955845475, "epoch": 0.00711, "frac_reward_zero_std": 0.5, "grad_norm": 0.06180070713162422, "kl": 0.5820625796914101, "learning_rate": 7.99983131110046e-06, "loss": -0.037, "num_tokens": 17500934.0, "reward": 1.0147864818572998, "reward_std": 0.6980014443397522, "rewards/rollout_reward_func/mean": 1.0147864818572998, "rewards/rollout_reward_func/std": 0.6980013847351074, "sampling/importance_sampling_ratio/max": 1.1534252166748047, "sampling/importance_sampling_ratio/mean": 0.933521032333374, "sampling/importance_sampling_ratio/min": 0.0005644479533657432, "sampling/sampling_logp_difference/max": 1.8551669120788574, "sampling/sampling_logp_difference/mean": 0.1274881809949875, "step": 711, "step_time": 21.77484316099435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5638255886733532, "epoch": 0.00712, "grad_norm": 0.06490880995988846, "kl": 0.6024096999317408, "learning_rate": 7.999830810915965e-06, "loss": -0.037, "step": 712, "step_time": 11.371012851013802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4623555596917868, "epoch": 0.00713, "frac_reward_zero_std": 0.25, "grad_norm": 0.18216165900230408, "kl": 0.34575608372688293, "learning_rate": 7.999830309991031e-06, "loss": -0.0668, "num_tokens": 17548873.0, "reward": 0.1119101345539093, "reward_std": 0.793044924736023, "rewards/rollout_reward_func/mean": 0.1119101345539093, "rewards/rollout_reward_func/std": 0.7930449843406677, "sampling/importance_sampling_ratio/max": 1.1470342874526978, "sampling/importance_sampling_ratio/mean": 0.775425910949707, "sampling/importance_sampling_ratio/min": 1.6278694374705083e-06, "sampling/sampling_logp_difference/max": 2.372849702835083, "sampling/sampling_logp_difference/mean": 0.2828274071216583, "step": 713, "step_time": 28.07984543495695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4651022460311651, "epoch": 0.00714, "grad_norm": 0.17666715383529663, "kl": 0.33100604079663754, "learning_rate": 7.99982980832566e-06, "loss": -0.067, "step": 714, "step_time": 12.860468361963285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.448276042938232, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2444632984697819, "epoch": 0.00715, "frac_reward_zero_std": 0.0, "grad_norm": 0.04078972712159157, "kl": 0.6056196205317974, "learning_rate": 7.999829305919849e-06, "loss": -0.0785, "num_tokens": 17590689.0, "reward": 0.9643234014511108, "reward_std": 0.8089791536331177, "rewards/rollout_reward_func/mean": 0.9643234014511108, "rewards/rollout_reward_func/std": 0.8089791536331177, "sampling/importance_sampling_ratio/max": 1.2302099466323853, "sampling/importance_sampling_ratio/mean": 0.8337514400482178, "sampling/importance_sampling_ratio/min": 3.489851110316522e-07, "sampling/sampling_logp_difference/max": 2.049616813659668, "sampling/sampling_logp_difference/mean": 0.3004131317138672, "step": 715, "step_time": 19.71484000099008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2428562119603157, "epoch": 0.00716, "grad_norm": 0.0444311797618866, "kl": 0.6203059107065201, "learning_rate": 7.999828802773603e-06, "loss": -0.0785, "step": 716, "step_time": 10.504060762992594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3041961807757616, "epoch": 0.00717, "frac_reward_zero_std": 0.0, "grad_norm": 0.08594449609518051, "kl": 0.3542328253388405, "learning_rate": 7.99982829888692e-06, "loss": -0.059, "num_tokens": 17644810.0, "reward": 0.3391335904598236, "reward_std": 0.9497538805007935, "rewards/rollout_reward_func/mean": 0.3391335904598236, "rewards/rollout_reward_func/std": 0.9497537612915039, "sampling/importance_sampling_ratio/max": 1.2910338640213013, "sampling/importance_sampling_ratio/mean": 0.8244378566741943, "sampling/importance_sampling_ratio/min": 3.190262134467048e-07, "sampling/sampling_logp_difference/max": 2.3477225303649902, "sampling/sampling_logp_difference/mean": 0.2808528244495392, "step": 717, "step_time": 26.599812739994377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.29082684032619, "epoch": 0.00718, "grad_norm": 0.08452162891626358, "kl": 0.3608378265053034, "learning_rate": 7.9998277942598e-06, "loss": -0.0592, "step": 718, "step_time": 14.021552421007073 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1599948145449162, "epoch": 0.00719, "frac_reward_zero_std": 0.25, "grad_norm": 0.05908830463886261, "kl": 0.6183691117912531, "learning_rate": 7.99982728889224e-06, "loss": -0.0512, "num_tokens": 17693747.0, "reward": 0.7702354192733765, "reward_std": 0.7025883197784424, "rewards/rollout_reward_func/mean": 0.7702354192733765, "rewards/rollout_reward_func/std": 0.7025882601737976, "sampling/importance_sampling_ratio/max": 1.2786952257156372, "sampling/importance_sampling_ratio/mean": 0.8498373031616211, "sampling/importance_sampling_ratio/min": 1.0111436665738438e-07, "sampling/sampling_logp_difference/max": 2.452516555786133, "sampling/sampling_logp_difference/mean": 0.2965424060821533, "step": 719, "step_time": 23.626230523048434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1588464807718992, "epoch": 0.0072, "grad_norm": 0.0474645271897316, "kl": 0.5841968469321728, "learning_rate": 7.999826782784247e-06, "loss": -0.0515, "step": 720, "step_time": 12.62234705593437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.09375, "completions/mean_terminated_length": 4.09375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.29739096853882074, "epoch": 0.00721, "frac_reward_zero_std": 0.25, "grad_norm": 0.06707755476236343, "kl": 0.3249608129262924, "learning_rate": 7.999826275935814e-06, "loss": -0.0203, "num_tokens": 17744680.0, "reward": 1.0807271003723145, "reward_std": 0.5983666181564331, "rewards/rollout_reward_func/mean": 1.0807271003723145, "rewards/rollout_reward_func/std": 0.5983666181564331, "sampling/importance_sampling_ratio/max": 1.1351146697998047, "sampling/importance_sampling_ratio/mean": 0.9961501955986023, "sampling/importance_sampling_ratio/min": 0.00869581289589405, "sampling/sampling_logp_difference/max": 2.0975329875946045, "sampling/sampling_logp_difference/mean": 0.06832318753004074, "step": 721, "step_time": 22.12404689603136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29926441609859467, "epoch": 0.00722, "grad_norm": 0.04737434908747673, "kl": 0.33318541571497917, "learning_rate": 7.999825768346947e-06, "loss": -0.0204, "step": 722, "step_time": 12.386833591124741 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.46875, "completions/mean_terminated_length": 4.379310131072998, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8678631205111742, "epoch": 0.00723, "frac_reward_zero_std": 0.0, "grad_norm": 0.04523564875125885, "kl": 0.41706864163279533, "learning_rate": 7.999825260017642e-06, "loss": -0.087, "num_tokens": 17791971.0, "reward": 0.8858289122581482, "reward_std": 0.8405731320381165, "rewards/rollout_reward_func/mean": 0.8858289122581482, "rewards/rollout_reward_func/std": 0.8405730724334717, "sampling/importance_sampling_ratio/max": 1.4384080171585083, "sampling/importance_sampling_ratio/mean": 0.8713846802711487, "sampling/importance_sampling_ratio/min": 0.00018204205844085664, "sampling/sampling_logp_difference/max": 1.9342669248580933, "sampling/sampling_logp_difference/mean": 0.18647123873233795, "step": 723, "step_time": 30.327932026033523 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.8670826675370336, "epoch": 0.00724, "grad_norm": 0.04225810244679451, "kl": 0.4196678940206766, "learning_rate": 7.999824750947901e-06, "loss": -0.087, "step": 724, "step_time": 15.294030257995473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.84375, "completions/mean_terminated_length": 4.100000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5139230089262128, "epoch": 0.00725, "frac_reward_zero_std": 0.5, "grad_norm": 0.012525522150099277, "kl": 0.2966040503233671, "learning_rate": 7.999824241137725e-06, "loss": -0.0395, "num_tokens": 17826624.0, "reward": 1.2363183498382568, "reward_std": 0.5590490102767944, "rewards/rollout_reward_func/mean": 1.2363183498382568, "rewards/rollout_reward_func/std": 0.5590490102767944, "sampling/importance_sampling_ratio/max": 1.0920672416687012, "sampling/importance_sampling_ratio/mean": 0.9458029866218567, "sampling/importance_sampling_ratio/min": 0.0004893128643743694, "sampling/sampling_logp_difference/max": 1.7588928937911987, "sampling/sampling_logp_difference/mean": 0.11681325733661652, "step": 725, "step_time": 18.505206677946262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5119871208444238, "epoch": 0.00726, "grad_norm": 0.012310128659009933, "kl": 0.2950301058590412, "learning_rate": 7.999823730587112e-06, "loss": -0.0395, "step": 726, "step_time": 10.285392744990531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.241379261016846, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1065957928076386, "epoch": 0.00727, "frac_reward_zero_std": 0.25, "grad_norm": 0.1298595517873764, "kl": 0.2887146268039942, "learning_rate": 7.999823219296063e-06, "loss": -0.0547, "num_tokens": 17879590.0, "reward": 0.6442562341690063, "reward_std": 0.891342282295227, "rewards/rollout_reward_func/mean": 0.6442562341690063, "rewards/rollout_reward_func/std": 0.891342282295227, "sampling/importance_sampling_ratio/max": 1.5243030786514282, "sampling/importance_sampling_ratio/mean": 0.9451785087585449, "sampling/importance_sampling_ratio/min": 3.0641427173350166e-08, "sampling/sampling_logp_difference/max": 2.3832452297210693, "sampling/sampling_logp_difference/mean": 0.24905595183372498, "step": 727, "step_time": 27.98385021201102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1044507883489132, "epoch": 0.00728, "grad_norm": 0.13375112414360046, "kl": 0.28983247838914394, "learning_rate": 7.999822707264577e-06, "loss": -0.0549, "step": 728, "step_time": 16.090038165973965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 4.730769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4644489390775561, "epoch": 0.00729, "frac_reward_zero_std": 0.0, "grad_norm": 0.10724466294050217, "kl": 0.8081010766327381, "learning_rate": 7.999822194492657e-06, "loss": -0.0632, "num_tokens": 17932107.0, "reward": 0.5390152335166931, "reward_std": 0.9842143058776855, "rewards/rollout_reward_func/mean": 0.5390152335166931, "rewards/rollout_reward_func/std": 0.9842143654823303, "sampling/importance_sampling_ratio/max": 1.1770304441452026, "sampling/importance_sampling_ratio/mean": 0.7734562754631042, "sampling/importance_sampling_ratio/min": 1.3155227861716412e-05, "sampling/sampling_logp_difference/max": 1.9904183149337769, "sampling/sampling_logp_difference/mean": 0.25947093963623047, "step": 729, "step_time": 25.83548142199288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4660266861319542, "epoch": 0.0073, "grad_norm": 0.10257220268249512, "kl": 0.783719839528203, "learning_rate": 7.999821680980302e-06, "loss": -0.0636, "step": 730, "step_time": 12.508811827021418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5079292040318251, "epoch": 0.00731, "frac_reward_zero_std": 0.25, "grad_norm": 0.07085993140935898, "kl": 0.2734314538538456, "learning_rate": 7.999821166727509e-06, "loss": -0.007, "num_tokens": 17977759.0, "reward": 0.5480271577835083, "reward_std": 0.9504506587982178, "rewards/rollout_reward_func/mean": 0.5480271577835083, "rewards/rollout_reward_func/std": 0.950450599193573, "sampling/importance_sampling_ratio/max": 1.2490371465682983, "sampling/importance_sampling_ratio/mean": 1.0079498291015625, "sampling/importance_sampling_ratio/min": 8.997810255095828e-06, "sampling/sampling_logp_difference/max": 1.7696573734283447, "sampling/sampling_logp_difference/mean": 0.13156047463417053, "step": 731, "step_time": 23.619290816044668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5263338182121515, "epoch": 0.00732, "grad_norm": 0.08934225142002106, "kl": 0.26956162601709366, "learning_rate": 7.999820651734282e-06, "loss": -0.0075, "step": 732, "step_time": 12.380320206022589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.28125, "completions/mean_terminated_length": 5.28125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.982131602242589, "epoch": 0.00733, "frac_reward_zero_std": 0.25, "grad_norm": 0.02696502022445202, "kl": 0.4147274177521467, "learning_rate": 7.99982013600062e-06, "loss": -0.0646, "num_tokens": 18022417.0, "reward": 1.1484606266021729, "reward_std": 0.6327991485595703, "rewards/rollout_reward_func/mean": 1.1484606266021729, "rewards/rollout_reward_func/std": 0.6327991485595703, "sampling/importance_sampling_ratio/max": 1.2461607456207275, "sampling/importance_sampling_ratio/mean": 0.8784758448600769, "sampling/importance_sampling_ratio/min": 7.2281718530575745e-06, "sampling/sampling_logp_difference/max": 2.133103132247925, "sampling/sampling_logp_difference/mean": 0.25129908323287964, "step": 733, "step_time": 22.278698402980808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9847354656085372, "epoch": 0.00734, "grad_norm": 0.027021117508411407, "kl": 0.3928626589477062, "learning_rate": 7.999819619526523e-06, "loss": -0.0646, "step": 734, "step_time": 12.534638600016478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 4.7916669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9641717709600925, "epoch": 0.00735, "frac_reward_zero_std": 0.0, "grad_norm": 0.06378407776355743, "kl": 0.41801473405212164, "learning_rate": 7.999819102311991e-06, "loss": -0.0834, "num_tokens": 18079147.0, "reward": 0.5790389776229858, "reward_std": 0.8607074618339539, "rewards/rollout_reward_func/mean": 0.5790389776229858, "rewards/rollout_reward_func/std": 0.8607074022293091, "sampling/importance_sampling_ratio/max": 1.367262840270996, "sampling/importance_sampling_ratio/mean": 0.724753737449646, "sampling/importance_sampling_ratio/min": 2.87711554847192e-06, "sampling/sampling_logp_difference/max": 2.327683448791504, "sampling/sampling_logp_difference/mean": 0.3454221487045288, "step": 735, "step_time": 28.117436101048952 }, { "clip_ratio/high_max": 0.011787280905991793, "clip_ratio/high_mean": 0.005893640452995896, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005893640452995896, "entropy": 1.968412607908249, "epoch": 0.00736, "grad_norm": 0.0611908957362175, "kl": 0.38665641006082296, "learning_rate": 7.999818584357024e-06, "loss": -0.0834, "step": 736, "step_time": 12.855866994999815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.46875, "completions/mean_terminated_length": 5.045454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.492906291037798, "epoch": 0.00737, "frac_reward_zero_std": 0.0, "grad_norm": 0.10177747160196304, "kl": 0.26865014154464006, "learning_rate": 7.999818065661623e-06, "loss": -0.1007, "num_tokens": 18141870.0, "reward": 0.2650216817855835, "reward_std": 0.8148611187934875, "rewards/rollout_reward_func/mean": 0.2650216817855835, "rewards/rollout_reward_func/std": 0.8148611187934875, "sampling/importance_sampling_ratio/max": 1.4794870615005493, "sampling/importance_sampling_ratio/mean": 0.6608743071556091, "sampling/importance_sampling_ratio/min": 8.030171727568813e-08, "sampling/sampling_logp_difference/max": 2.062234878540039, "sampling/sampling_logp_difference/mean": 0.4609125256538391, "step": 737, "step_time": 34.295609277061885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4956124648451805, "epoch": 0.00738, "grad_norm": 0.10686129331588745, "kl": 0.2614068640395999, "learning_rate": 7.999817546225787e-06, "loss": -0.1009, "step": 738, "step_time": 16.70832489800523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1307880552485585, "epoch": 0.00739, "frac_reward_zero_std": 0.25, "grad_norm": 0.05201808363199234, "kl": 0.4704245254397392, "learning_rate": 7.999817026049515e-06, "loss": -0.0602, "num_tokens": 18188687.0, "reward": 0.6365206241607666, "reward_std": 0.8482286930084229, "rewards/rollout_reward_func/mean": 0.6365206241607666, "rewards/rollout_reward_func/std": 0.8482286930084229, "sampling/importance_sampling_ratio/max": 1.4658178091049194, "sampling/importance_sampling_ratio/mean": 0.8973077535629272, "sampling/importance_sampling_ratio/min": 3.053648356399208e-07, "sampling/sampling_logp_difference/max": 2.5629677772521973, "sampling/sampling_logp_difference/mean": 0.2056863009929657, "step": 739, "step_time": 25.30662524100626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1295181894674897, "epoch": 0.0074, "grad_norm": 0.05195137485861778, "kl": 0.4711457211524248, "learning_rate": 7.99981650513281e-06, "loss": -0.0603, "step": 740, "step_time": 12.518233347975183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.46875, "completions/mean_terminated_length": 6.047619342803955, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.01861509680748, "epoch": 0.00741, "frac_reward_zero_std": 0.0, "grad_norm": 0.04365845024585724, "kl": 0.18806472793221474, "learning_rate": 7.999815983475671e-06, "loss": -0.0871, "num_tokens": 18242159.0, "reward": -0.24523049592971802, "reward_std": 0.6323619484901428, "rewards/rollout_reward_func/mean": -0.24523049592971802, "rewards/rollout_reward_func/std": 0.6323619484901428, "sampling/importance_sampling_ratio/max": 1.3354463577270508, "sampling/importance_sampling_ratio/mean": 0.44833651185035706, "sampling/importance_sampling_ratio/min": 1.731916299263503e-08, "sampling/sampling_logp_difference/max": 2.545588970184326, "sampling/sampling_logp_difference/mean": 0.5274958610534668, "step": 741, "step_time": 26.27112153902999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0141890347003937, "epoch": 0.00742, "grad_norm": 0.043968819081783295, "kl": 0.19121347507461905, "learning_rate": 7.999815461078097e-06, "loss": -0.0871, "step": 742, "step_time": 12.927790303016081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.96875, "completions/mean_terminated_length": 5.842105388641357, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.367995873093605, "epoch": 0.00743, "frac_reward_zero_std": 0.0, "grad_norm": 0.09661158919334412, "kl": 0.26339411549270153, "learning_rate": 7.999814937940092e-06, "loss": -0.0913, "num_tokens": 18301463.0, "reward": 0.19125694036483765, "reward_std": 0.9864866733551025, "rewards/rollout_reward_func/mean": 0.19125694036483765, "rewards/rollout_reward_func/std": 0.9864866733551025, "sampling/importance_sampling_ratio/max": 1.3046045303344727, "sampling/importance_sampling_ratio/mean": 0.48721617460250854, "sampling/importance_sampling_ratio/min": 6.234696456886013e-07, "sampling/sampling_logp_difference/max": 2.021134376525879, "sampling/sampling_logp_difference/mean": 0.3897634744644165, "step": 743, "step_time": 38.11798226000974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005759804043918848, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005759804043918848, "entropy": 2.3611369729042053, "epoch": 0.00744, "grad_norm": 0.07051097601652145, "kl": 0.2761218389496207, "learning_rate": 7.99981441406165e-06, "loss": -0.0917, "step": 744, "step_time": 14.546154230978573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 5.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5540391206741333, "epoch": 0.00745, "frac_reward_zero_std": 0.0, "grad_norm": 0.09643994271755219, "kl": 0.6745645757764578, "learning_rate": 7.999813889442775e-06, "loss": -0.0928, "num_tokens": 18350717.0, "reward": 0.3997075855731964, "reward_std": 0.7971024513244629, "rewards/rollout_reward_func/mean": 0.3997075855731964, "rewards/rollout_reward_func/std": 0.7971024513244629, "sampling/importance_sampling_ratio/max": 1.2317464351654053, "sampling/importance_sampling_ratio/mean": 0.7419531345367432, "sampling/importance_sampling_ratio/min": 1.2281137742320425e-06, "sampling/sampling_logp_difference/max": 2.330432176589966, "sampling/sampling_logp_difference/mean": 0.324762225151062, "step": 745, "step_time": 24.289156600047136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5448681861162186, "epoch": 0.00746, "grad_norm": 0.08202829211950302, "kl": 0.7007176410406828, "learning_rate": 7.999813364083468e-06, "loss": -0.093, "step": 746, "step_time": 12.644929358997615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 4.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.112667568027973, "epoch": 0.00747, "frac_reward_zero_std": 0.25, "grad_norm": 0.3637135624885559, "kl": 1.7711117174476385, "learning_rate": 7.999812837983725e-06, "loss": -0.0516, "num_tokens": 18395609.0, "reward": 0.5577309131622314, "reward_std": 1.0252100229263306, "rewards/rollout_reward_func/mean": 0.5577309131622314, "rewards/rollout_reward_func/std": 1.0252100229263306, "sampling/importance_sampling_ratio/max": 1.2182179689407349, "sampling/importance_sampling_ratio/mean": 0.6694297790527344, "sampling/importance_sampling_ratio/min": 5.273622960544344e-08, "sampling/sampling_logp_difference/max": 2.467552423477173, "sampling/sampling_logp_difference/mean": 0.38761231303215027, "step": 747, "step_time": 27.86668620700948 }, { "clip_ratio/high_max": 0.021780303679406643, "clip_ratio/high_mean": 0.010890151839703321, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016098485328257084, "entropy": 2.11260399967432, "epoch": 0.00748, "grad_norm": 0.20100978016853333, "kl": 1.0634198393672705, "learning_rate": 7.99981231114355e-06, "loss": -0.0543, "step": 748, "step_time": 12.890532138029812 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.440000057220459, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8700735494494438, "epoch": 0.00749, "frac_reward_zero_std": 0.0, "grad_norm": 0.17992034554481506, "kl": 0.6309456322342157, "learning_rate": 7.999811783562942e-06, "loss": -0.0722, "num_tokens": 18454234.0, "reward": 0.20038819313049316, "reward_std": 0.83513343334198, "rewards/rollout_reward_func/mean": 0.20038819313049316, "rewards/rollout_reward_func/std": 0.83513343334198, "sampling/importance_sampling_ratio/max": 1.3428977727890015, "sampling/importance_sampling_ratio/mean": 0.6743720173835754, "sampling/importance_sampling_ratio/min": 2.046640368291719e-08, "sampling/sampling_logp_difference/max": 2.207151412963867, "sampling/sampling_logp_difference/mean": 0.3925376832485199, "step": 749, "step_time": 32.76431448201765 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "entropy": 1.8849971424788237, "epoch": 0.0075, "grad_norm": 0.1634245663881302, "kl": 0.5959444493055344, "learning_rate": 7.9998112552419e-06, "loss": -0.0728, "step": 750, "step_time": 15.832083203014918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.141834320500493, "epoch": 0.00751, "frac_reward_zero_std": 0.0, "grad_norm": 0.11049794405698776, "kl": 0.4835992716252804, "learning_rate": 7.999810726180428e-06, "loss": -0.0756, "num_tokens": 18517340.0, "reward": 0.6468050479888916, "reward_std": 0.7618268728256226, "rewards/rollout_reward_func/mean": 0.6468050479888916, "rewards/rollout_reward_func/std": 0.7618269324302673, "sampling/importance_sampling_ratio/max": 1.4670437574386597, "sampling/importance_sampling_ratio/mean": 0.8083014488220215, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.1984541416168213, "sampling/sampling_logp_difference/mean": 0.25518572330474854, "step": 751, "step_time": 30.258278766967123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "entropy": 1.1588164698332548, "epoch": 0.00752, "grad_norm": 0.10874161124229431, "kl": 0.4725192282348871, "learning_rate": 7.99981019637852e-06, "loss": -0.0758, "step": 752, "step_time": 16.08475899003679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.645161151885986, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8566004931926727, "epoch": 0.00753, "frac_reward_zero_std": 0.25, "grad_norm": 0.13938555121421814, "kl": 0.6127666383981705, "learning_rate": 7.999809665836181e-06, "loss": -0.0349, "num_tokens": 18568778.0, "reward": 1.0628929138183594, "reward_std": 0.6378694176673889, "rewards/rollout_reward_func/mean": 1.0628929138183594, "rewards/rollout_reward_func/std": 0.6378693580627441, "sampling/importance_sampling_ratio/max": 1.2090072631835938, "sampling/importance_sampling_ratio/mean": 0.8974594473838806, "sampling/importance_sampling_ratio/min": 8.563735498512415e-09, "sampling/sampling_logp_difference/max": 2.044762134552002, "sampling/sampling_logp_difference/mean": 0.21715383231639862, "step": 753, "step_time": 23.261652728018817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8720739930868149, "epoch": 0.00754, "grad_norm": 0.12821228802204132, "kl": 0.5704322382807732, "learning_rate": 7.999809134553408e-06, "loss": -0.0353, "step": 754, "step_time": 12.400835019070655 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.654865838587284, "epoch": 0.00755, "frac_reward_zero_std": 0.0, "grad_norm": 0.16551174223423004, "kl": 0.3337444067001343, "learning_rate": 7.999808602530205e-06, "loss": -0.0699, "num_tokens": 18629664.0, "reward": 0.49803876876831055, "reward_std": 0.7474576234817505, "rewards/rollout_reward_func/mean": 0.49803876876831055, "rewards/rollout_reward_func/std": 0.7474575638771057, "sampling/importance_sampling_ratio/max": 1.3170517683029175, "sampling/importance_sampling_ratio/mean": 0.7777732610702515, "sampling/importance_sampling_ratio/min": 4.490704252901878e-09, "sampling/sampling_logp_difference/max": 2.327793598175049, "sampling/sampling_logp_difference/mean": 0.29743218421936035, "step": 755, "step_time": 32.49867289297981 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.666775070130825, "epoch": 0.00756, "grad_norm": 0.13273932039737701, "kl": 0.3360400591045618, "learning_rate": 7.99980806976657e-06, "loss": -0.0701, "step": 756, "step_time": 16.735924703010824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 4.42307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9182336069643497, "epoch": 0.00757, "frac_reward_zero_std": 0.0, "grad_norm": 0.14174705743789673, "kl": 0.5468979720026255, "learning_rate": 7.9998075362625e-06, "loss": -0.0647, "num_tokens": 18691320.0, "reward": 0.051959432661533356, "reward_std": 0.5229894518852234, "rewards/rollout_reward_func/mean": 0.051959432661533356, "rewards/rollout_reward_func/std": 0.5229894518852234, "sampling/importance_sampling_ratio/max": 1.1867636442184448, "sampling/importance_sampling_ratio/mean": 0.6778308153152466, "sampling/importance_sampling_ratio/min": 4.260397872712929e-06, "sampling/sampling_logp_difference/max": 1.8279330730438232, "sampling/sampling_logp_difference/mean": 0.3704669773578644, "step": 757, "step_time": 31.724204500031192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.9294525049626827, "epoch": 0.00758, "grad_norm": 0.1386881023645401, "kl": 0.548120766878128, "learning_rate": 7.999807002018e-06, "loss": -0.065, "step": 758, "step_time": 16.954966767982114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5599078591912985, "epoch": 0.00759, "frac_reward_zero_std": 0.0, "grad_norm": 0.06412308663129807, "kl": 0.2841495471075177, "learning_rate": 7.999806467033068e-06, "loss": -0.0453, "num_tokens": 18737701.0, "reward": 0.5115989446640015, "reward_std": 0.9598435759544373, "rewards/rollout_reward_func/mean": 0.5115989446640015, "rewards/rollout_reward_func/std": 0.9598435759544373, "sampling/importance_sampling_ratio/max": 1.2277616262435913, "sampling/importance_sampling_ratio/mean": 0.7398127913475037, "sampling/importance_sampling_ratio/min": 3.959328751079738e-06, "sampling/sampling_logp_difference/max": 2.0568578243255615, "sampling/sampling_logp_difference/mean": 0.321111798286438, "step": 759, "step_time": 26.857718658982776 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.5580061431974173, "epoch": 0.0076, "grad_norm": 0.03860662877559662, "kl": 0.2807589005678892, "learning_rate": 7.999805931307704e-06, "loss": -0.0455, "step": 760, "step_time": 13.878093495994108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.866666793823242, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3102885782718658, "epoch": 0.00761, "frac_reward_zero_std": 0.0, "grad_norm": 0.05254001542925835, "kl": 0.5110319256782532, "learning_rate": 7.99980539484191e-06, "loss": -0.0541, "num_tokens": 18796863.0, "reward": 0.5927886962890625, "reward_std": 0.7806029319763184, "rewards/rollout_reward_func/mean": 0.5927886962890625, "rewards/rollout_reward_func/std": 0.7806029319763184, "sampling/importance_sampling_ratio/max": 1.2885520458221436, "sampling/importance_sampling_ratio/mean": 0.8046717643737793, "sampling/importance_sampling_ratio/min": 1.5314325310100685e-06, "sampling/sampling_logp_difference/max": 1.989157795906067, "sampling/sampling_logp_difference/mean": 0.24722929298877716, "step": 761, "step_time": 24.047884252941003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3040667176246643, "epoch": 0.00762, "grad_norm": 0.05217813700437546, "kl": 0.5073418524116278, "learning_rate": 7.99980485763568e-06, "loss": -0.0542, "step": 762, "step_time": 13.815783856029157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.3214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2856389712542295, "epoch": 0.00763, "frac_reward_zero_std": 0.0, "grad_norm": 0.10748055577278137, "kl": 0.46479746885597706, "learning_rate": 7.999804319689023e-06, "loss": -0.0657, "num_tokens": 18846276.0, "reward": 0.9359410405158997, "reward_std": 0.7816240191459656, "rewards/rollout_reward_func/mean": 0.9359410405158997, "rewards/rollout_reward_func/std": 0.7816240787506104, "sampling/importance_sampling_ratio/max": 1.1621112823486328, "sampling/importance_sampling_ratio/mean": 0.7559571862220764, "sampling/importance_sampling_ratio/min": 4.369957423477899e-06, "sampling/sampling_logp_difference/max": 2.1280858516693115, "sampling/sampling_logp_difference/mean": 0.24167503416538239, "step": 763, "step_time": 30.960009049071232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2814990002661943, "epoch": 0.00764, "grad_norm": 0.10816283524036407, "kl": 0.48186408914625645, "learning_rate": 7.999803781001934e-06, "loss": -0.0658, "step": 764, "step_time": 15.728835906949826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.464285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3227116577327251, "epoch": 0.00765, "frac_reward_zero_std": 0.25, "grad_norm": 0.0947321355342865, "kl": 0.39118782617151737, "learning_rate": 7.999803241574412e-06, "loss": -0.0528, "num_tokens": 18903025.0, "reward": 0.6944352984428406, "reward_std": 0.8396210670471191, "rewards/rollout_reward_func/mean": 0.6944352984428406, "rewards/rollout_reward_func/std": 0.8396210074424744, "sampling/importance_sampling_ratio/max": 1.1695270538330078, "sampling/importance_sampling_ratio/mean": 0.7462464570999146, "sampling/importance_sampling_ratio/min": 0.00011106729652965441, "sampling/sampling_logp_difference/max": 1.6917535066604614, "sampling/sampling_logp_difference/mean": 0.2365209311246872, "step": 765, "step_time": 30.03834889800055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3119833245873451, "epoch": 0.00766, "grad_norm": 0.09809916466474533, "kl": 0.3887613480910659, "learning_rate": 7.999802701406462e-06, "loss": -0.053, "step": 766, "step_time": 15.54665626399219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 4.7916669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9048826470971107, "epoch": 0.00767, "frac_reward_zero_std": 0.0, "grad_norm": 0.09585080295801163, "kl": 0.47126105055212975, "learning_rate": 7.99980216049808e-06, "loss": -0.062, "num_tokens": 18962915.0, "reward": 0.44161731004714966, "reward_std": 0.7541860938072205, "rewards/rollout_reward_func/mean": 0.44161731004714966, "rewards/rollout_reward_func/std": 0.7541860938072205, "sampling/importance_sampling_ratio/max": 1.4444206953048706, "sampling/importance_sampling_ratio/mean": 0.5961796045303345, "sampling/importance_sampling_ratio/min": 6.887671588629019e-08, "sampling/sampling_logp_difference/max": 2.2925076484680176, "sampling/sampling_logp_difference/mean": 0.42635107040405273, "step": 767, "step_time": 30.249587105005048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8936503492295742, "epoch": 0.00768, "grad_norm": 0.08566351234912872, "kl": 0.49398363567888737, "learning_rate": 7.999801618849266e-06, "loss": -0.0624, "step": 768, "step_time": 15.56464284399408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 5.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.162934336811304, "epoch": 0.00769, "frac_reward_zero_std": 0.0, "grad_norm": 0.05426257476210594, "kl": 0.5744538865983486, "learning_rate": 7.999801076460023e-06, "loss": -0.0617, "num_tokens": 19019032.0, "reward": 0.7686749696731567, "reward_std": 0.6689531803131104, "rewards/rollout_reward_func/mean": 0.7686749696731567, "rewards/rollout_reward_func/std": 0.6689531207084656, "sampling/importance_sampling_ratio/max": 1.247862458229065, "sampling/importance_sampling_ratio/mean": 0.8402260541915894, "sampling/importance_sampling_ratio/min": 5.131532532232086e-08, "sampling/sampling_logp_difference/max": 2.7693967819213867, "sampling/sampling_logp_difference/mean": 0.26703983545303345, "step": 769, "step_time": 26.493134001997532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1552436798810959, "epoch": 0.0077, "grad_norm": 0.053317535668611526, "kl": 0.6145418733358383, "learning_rate": 7.999800533330349e-06, "loss": -0.0617, "step": 770, "step_time": 15.043223147979006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.48699164390563965, "epoch": 0.00771, "frac_reward_zero_std": 0.0, "grad_norm": 0.11520857363939285, "kl": 0.35380179807543755, "learning_rate": 7.999799989460245e-06, "loss": -0.0297, "num_tokens": 19069484.0, "reward": 0.9647468328475952, "reward_std": 0.5123458504676819, "rewards/rollout_reward_func/mean": 0.9647468328475952, "rewards/rollout_reward_func/std": 0.5123457908630371, "sampling/importance_sampling_ratio/max": 1.2216397523880005, "sampling/importance_sampling_ratio/mean": 1.0087411403656006, "sampling/importance_sampling_ratio/min": 1.581237711434369e-06, "sampling/sampling_logp_difference/max": 2.444542169570923, "sampling/sampling_logp_difference/mean": 0.1310999095439911, "step": 771, "step_time": 21.631465101003414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4811486713588238, "epoch": 0.00772, "grad_norm": 0.1067766547203064, "kl": 0.35616330802440643, "learning_rate": 7.999799444849711e-06, "loss": -0.0298, "step": 772, "step_time": 12.710749140038388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 5.033333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0814394522458315, "epoch": 0.00773, "frac_reward_zero_std": 0.0, "grad_norm": 0.028657104820013046, "kl": 0.5175513308495283, "learning_rate": 7.999798899498748e-06, "loss": -0.0806, "num_tokens": 19116601.0, "reward": 0.7484860420227051, "reward_std": 0.8610365390777588, "rewards/rollout_reward_func/mean": 0.7484860420227051, "rewards/rollout_reward_func/std": 0.8610365390777588, "sampling/importance_sampling_ratio/max": 1.2794049978256226, "sampling/importance_sampling_ratio/mean": 0.8308343887329102, "sampling/importance_sampling_ratio/min": 0.00029834103770554066, "sampling/sampling_logp_difference/max": 1.724616289138794, "sampling/sampling_logp_difference/mean": 0.2151821255683899, "step": 773, "step_time": 25.16269751195796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.081381849013269, "epoch": 0.00774, "grad_norm": 0.02632993645966053, "kl": 0.5110960267484188, "learning_rate": 7.999798353407354e-06, "loss": -0.0808, "step": 774, "step_time": 13.395228531968314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.53125, "completions/mean_terminated_length": 4.161290168762207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.49117164593189955, "epoch": 0.00775, "frac_reward_zero_std": 0.25, "grad_norm": 0.15630276501178741, "kl": 0.6012132503092289, "learning_rate": 7.99979780657553e-06, "loss": -0.0312, "num_tokens": 19164585.0, "reward": 0.940799355506897, "reward_std": 0.6916813850402832, "rewards/rollout_reward_func/mean": 0.940799355506897, "rewards/rollout_reward_func/std": 0.6916813850402832, "sampling/importance_sampling_ratio/max": 1.1268768310546875, "sampling/importance_sampling_ratio/mean": 0.9490727186203003, "sampling/importance_sampling_ratio/min": 0.0003158381150569767, "sampling/sampling_logp_difference/max": 1.7874921560287476, "sampling/sampling_logp_difference/mean": 0.11217233538627625, "step": 775, "step_time": 25.13634041498881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49059152603149414, "epoch": 0.00776, "grad_norm": 0.14372770488262177, "kl": 0.5675309002399445, "learning_rate": 7.999797259003276e-06, "loss": -0.0317, "step": 776, "step_time": 14.043167493975488 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.399061294272542, "epoch": 0.00777, "frac_reward_zero_std": 0.25, "grad_norm": 0.08178606629371643, "kl": 0.5154217723757029, "learning_rate": 7.999796710690594e-06, "loss": -0.0457, "num_tokens": 19222110.0, "reward": 0.8429915308952332, "reward_std": 0.7847921252250671, "rewards/rollout_reward_func/mean": 0.8429915308952332, "rewards/rollout_reward_func/std": 0.7847921848297119, "sampling/importance_sampling_ratio/max": 1.308279275894165, "sampling/importance_sampling_ratio/mean": 0.8118526935577393, "sampling/importance_sampling_ratio/min": 3.5946040810586055e-08, "sampling/sampling_logp_difference/max": 2.30810284614563, "sampling/sampling_logp_difference/mean": 0.3046468496322632, "step": 777, "step_time": 32.28904676804086 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 1.4028402445837855, "epoch": 0.00778, "grad_norm": 0.08006086945533752, "kl": 0.5089165531098843, "learning_rate": 7.999796161637481e-06, "loss": -0.0459, "step": 778, "step_time": 17.61455906004994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.6875, "completions/mean_terminated_length": 4.322580337524414, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6158241797238588, "epoch": 0.00779, "frac_reward_zero_std": 0.25, "grad_norm": 0.025958633050322533, "kl": 0.43810470774769783, "learning_rate": 7.99979561184394e-06, "loss": -0.0574, "num_tokens": 19262411.0, "reward": 1.2614226341247559, "reward_std": 0.4365414083003998, "rewards/rollout_reward_func/mean": 1.2614226341247559, "rewards/rollout_reward_func/std": 0.4365413784980774, "sampling/importance_sampling_ratio/max": 1.165929913520813, "sampling/importance_sampling_ratio/mean": 0.9199101328849792, "sampling/importance_sampling_ratio/min": 9.715661144582555e-05, "sampling/sampling_logp_difference/max": 1.8325834274291992, "sampling/sampling_logp_difference/mean": 0.1432715505361557, "step": 779, "step_time": 21.545287412009202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6171036679297686, "epoch": 0.0078, "grad_norm": 0.026677221059799194, "kl": 0.43601078912615776, "learning_rate": 7.99979506130997e-06, "loss": -0.0573, "step": 780, "step_time": 11.977506495051784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.344827651977539, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9665607586503029, "epoch": 0.00781, "frac_reward_zero_std": 0.0, "grad_norm": 0.05513794347643852, "kl": 0.3427940374240279, "learning_rate": 7.999794510035571e-06, "loss": -0.0773, "num_tokens": 19306338.0, "reward": 0.7142595052719116, "reward_std": 0.8124018311500549, "rewards/rollout_reward_func/mean": 0.7142595052719116, "rewards/rollout_reward_func/std": 0.8124018311500549, "sampling/importance_sampling_ratio/max": 1.193463921546936, "sampling/importance_sampling_ratio/mean": 0.847126841545105, "sampling/importance_sampling_ratio/min": 0.0002461998083163053, "sampling/sampling_logp_difference/max": 1.8361936807632446, "sampling/sampling_logp_difference/mean": 0.18737468123435974, "step": 781, "step_time": 24.04674896004144 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.9599265530705452, "epoch": 0.00782, "grad_norm": 0.04851452261209488, "kl": 0.3151493910700083, "learning_rate": 7.999793958020743e-06, "loss": -0.0775, "step": 782, "step_time": 13.040987993968884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.068965435028076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0744046233594418, "epoch": 0.00783, "frac_reward_zero_std": 0.0, "grad_norm": 0.0705304816365242, "kl": 0.4417843744158745, "learning_rate": 7.999793405265487e-06, "loss": -0.0626, "num_tokens": 19366192.0, "reward": 0.5266295671463013, "reward_std": 0.7512233257293701, "rewards/rollout_reward_func/mean": 0.5266295671463013, "rewards/rollout_reward_func/std": 0.7512233853340149, "sampling/importance_sampling_ratio/max": 1.2387558221817017, "sampling/importance_sampling_ratio/mean": 0.8975397944450378, "sampling/importance_sampling_ratio/min": 5.071983082416409e-07, "sampling/sampling_logp_difference/max": 2.2152483463287354, "sampling/sampling_logp_difference/mean": 0.25385722517967224, "step": 783, "step_time": 31.430880406987853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.062518559396267, "epoch": 0.00784, "grad_norm": 0.06500022858381271, "kl": 0.42065027728676796, "learning_rate": 7.999792851769802e-06, "loss": -0.0628, "step": 784, "step_time": 17.39739081999869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3003316707909107, "epoch": 0.00785, "frac_reward_zero_std": 0.25, "grad_norm": 0.15374311804771423, "kl": 0.44475926272571087, "learning_rate": 7.99979229753369e-06, "loss": -0.0357, "num_tokens": 19422591.0, "reward": 0.6166452765464783, "reward_std": 0.8532947897911072, "rewards/rollout_reward_func/mean": 0.6166452765464783, "rewards/rollout_reward_func/std": 0.8532947897911072, "sampling/importance_sampling_ratio/max": 1.1874486207962036, "sampling/importance_sampling_ratio/mean": 0.807339608669281, "sampling/importance_sampling_ratio/min": 4.489923412620556e-06, "sampling/sampling_logp_difference/max": 1.597330093383789, "sampling/sampling_logp_difference/mean": 0.2885779142379761, "step": 785, "step_time": 27.92123898898717 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.2933068368583918, "epoch": 0.00786, "grad_norm": 0.15515202283859253, "kl": 0.40499419160187244, "learning_rate": 7.999791742557149e-06, "loss": -0.0362, "step": 786, "step_time": 14.885210773034487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.516129016876221, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7642710488289595, "epoch": 0.00787, "frac_reward_zero_std": 0.0, "grad_norm": 0.022518331184983253, "kl": 0.8706953041255474, "learning_rate": 7.999791186840177e-06, "loss": -0.0802, "num_tokens": 19463084.0, "reward": 1.3086881637573242, "reward_std": 0.18857301771640778, "rewards/rollout_reward_func/mean": 1.3086881637573242, "rewards/rollout_reward_func/std": 0.18857301771640778, "sampling/importance_sampling_ratio/max": 1.1027491092681885, "sampling/importance_sampling_ratio/mean": 0.8684036731719971, "sampling/importance_sampling_ratio/min": 1.0053331607196014e-05, "sampling/sampling_logp_difference/max": 2.3977863788604736, "sampling/sampling_logp_difference/mean": 0.19365759193897247, "step": 787, "step_time": 17.593792765052058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.766818217933178, "epoch": 0.00788, "grad_norm": 0.021543188020586967, "kl": 0.8583049103617668, "learning_rate": 7.99979063038278e-06, "loss": -0.0802, "step": 788, "step_time": 9.798037583997939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 5.966667175292969, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4889963557943702, "epoch": 0.00789, "frac_reward_zero_std": 0.0, "grad_norm": 0.06895170360803604, "kl": 0.34633792005479336, "learning_rate": 7.999790073184955e-06, "loss": -0.0938, "num_tokens": 19513809.0, "reward": 0.7083592414855957, "reward_std": 0.9541964530944824, "rewards/rollout_reward_func/mean": 0.7083592414855957, "rewards/rollout_reward_func/std": 0.9541963934898376, "sampling/importance_sampling_ratio/max": 1.13312828540802, "sampling/importance_sampling_ratio/mean": 0.7375473380088806, "sampling/importance_sampling_ratio/min": 2.8940934271304286e-07, "sampling/sampling_logp_difference/max": 2.5494277477264404, "sampling/sampling_logp_difference/mean": 0.31008216738700867, "step": 789, "step_time": 24.64256579100038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.495214680209756, "epoch": 0.0079, "grad_norm": 0.09156224876642227, "kl": 0.3467783201485872, "learning_rate": 7.999789515246702e-06, "loss": -0.094, "step": 790, "step_time": 12.462032242008718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 4.639999866485596, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1860644072294235, "epoch": 0.00791, "frac_reward_zero_std": 0.0, "grad_norm": 0.08169103413820267, "kl": 0.44206725619733334, "learning_rate": 7.999788956568022e-06, "loss": -0.0792, "num_tokens": 19575123.0, "reward": -0.015541065484285355, "reward_std": 0.5931979417800903, "rewards/rollout_reward_func/mean": -0.015541065484285355, "rewards/rollout_reward_func/std": 0.5931980013847351, "sampling/importance_sampling_ratio/max": 1.2404370307922363, "sampling/importance_sampling_ratio/mean": 0.5988976359367371, "sampling/importance_sampling_ratio/min": 3.5007983001378307e-07, "sampling/sampling_logp_difference/max": 1.9414880275726318, "sampling/sampling_logp_difference/mean": 0.4342498779296875, "step": 791, "step_time": 30.405632844980573 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.184439405798912, "epoch": 0.00792, "grad_norm": 0.08172281831502914, "kl": 0.384521366097033, "learning_rate": 7.999788397148915e-06, "loss": -0.0792, "step": 792, "step_time": 15.51031235701521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.5217390060424805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6496911495923996, "epoch": 0.00793, "frac_reward_zero_std": 0.0, "grad_norm": 0.20398947596549988, "kl": 0.2606521751731634, "learning_rate": 7.99978783698938e-06, "loss": -0.0854, "num_tokens": 19639009.0, "reward": 0.27964603900909424, "reward_std": 0.810299813747406, "rewards/rollout_reward_func/mean": 0.27964603900909424, "rewards/rollout_reward_func/std": 0.810299813747406, "sampling/importance_sampling_ratio/max": 1.7006736993789673, "sampling/importance_sampling_ratio/mean": 0.6887063980102539, "sampling/importance_sampling_ratio/min": 1.4518523130391259e-05, "sampling/sampling_logp_difference/max": 1.9104313850402832, "sampling/sampling_logp_difference/mean": 0.28954967856407166, "step": 793, "step_time": 31.987409762019524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.655464731156826, "epoch": 0.00794, "grad_norm": 0.16258125007152557, "kl": 0.26038676872849464, "learning_rate": 7.999787276089417e-06, "loss": -0.0859, "step": 794, "step_time": 15.58775579201756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.3548383712768555, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.126509441062808, "epoch": 0.00795, "frac_reward_zero_std": 0.25, "grad_norm": 0.28675833344459534, "kl": 0.6411315705627203, "learning_rate": 7.999786714449027e-06, "loss": -0.0476, "num_tokens": 19689947.0, "reward": 1.0085211992263794, "reward_std": 0.7638204097747803, "rewards/rollout_reward_func/mean": 1.0085211992263794, "rewards/rollout_reward_func/std": 0.7638204097747803, "sampling/importance_sampling_ratio/max": 1.4187361001968384, "sampling/importance_sampling_ratio/mean": 0.8667871952056885, "sampling/importance_sampling_ratio/min": 1.9647803384259532e-08, "sampling/sampling_logp_difference/max": 2.19511079788208, "sampling/sampling_logp_difference/mean": 0.2735874056816101, "step": 795, "step_time": 32.97291523599415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009615384973585606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "entropy": 1.1421459270641208, "epoch": 0.00796, "grad_norm": 0.21756869554519653, "kl": 0.6069771610200405, "learning_rate": 7.999786152068212e-06, "loss": -0.0488, "step": 796, "step_time": 18.312980762013467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1240034326910973, "epoch": 0.00797, "frac_reward_zero_std": 0.0, "grad_norm": 0.1364317536354065, "kl": 0.6129811946302652, "learning_rate": 7.999785588946968e-06, "loss": -0.0825, "num_tokens": 19745819.0, "reward": 0.483986496925354, "reward_std": 0.8616738319396973, "rewards/rollout_reward_func/mean": 0.483986496925354, "rewards/rollout_reward_func/std": 0.8616738319396973, "sampling/importance_sampling_ratio/max": 1.1685030460357666, "sampling/importance_sampling_ratio/mean": 0.5679417848587036, "sampling/importance_sampling_ratio/min": 2.4030760714666144e-10, "sampling/sampling_logp_difference/max": 2.3767738342285156, "sampling/sampling_logp_difference/mean": 0.505230188369751, "step": 797, "step_time": 33.03265250995173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014756944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "entropy": 2.13811057806015, "epoch": 0.00798, "grad_norm": 0.1149941086769104, "kl": 0.6177927562966943, "learning_rate": 7.999785025085299e-06, "loss": -0.0829, "step": 798, "step_time": 17.885431222064653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2574783619493246, "epoch": 0.00799, "frac_reward_zero_std": 0.0, "grad_norm": 0.0999835655093193, "kl": 0.616340234875679, "learning_rate": 7.999784460483204e-06, "loss": -0.0739, "num_tokens": 19786045.0, "reward": 1.185175895690918, "reward_std": 0.4506227970123291, "rewards/rollout_reward_func/mean": 1.185175895690918, "rewards/rollout_reward_func/std": 0.4506227970123291, "sampling/importance_sampling_ratio/max": 1.182287335395813, "sampling/importance_sampling_ratio/mean": 0.8477651476860046, "sampling/importance_sampling_ratio/min": 1.7700419618904562e-07, "sampling/sampling_logp_difference/max": 2.540466785430908, "sampling/sampling_logp_difference/mean": 0.32118910551071167, "step": 799, "step_time": 20.871303057007026 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.2474936125800014, "epoch": 0.008, "grad_norm": 0.0826960876584053, "kl": 0.577850803732872, "learning_rate": 7.999783895140683e-06, "loss": -0.0742, "step": 800, "step_time": 11.132930735999253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.150421479716897, "epoch": 0.00801, "frac_reward_zero_std": 0.0, "grad_norm": 0.14294257760047913, "kl": 0.5918467398732901, "learning_rate": 7.999783329057734e-06, "loss": -0.0697, "num_tokens": 19846854.0, "reward": 1.0023257732391357, "reward_std": 0.7275256514549255, "rewards/rollout_reward_func/mean": 1.0023257732391357, "rewards/rollout_reward_func/std": 0.7275256514549255, "sampling/importance_sampling_ratio/max": 1.3997187614440918, "sampling/importance_sampling_ratio/mean": 0.780531108379364, "sampling/importance_sampling_ratio/min": 3.8150956243043765e-05, "sampling/sampling_logp_difference/max": 1.663339376449585, "sampling/sampling_logp_difference/mean": 0.2774255871772766, "step": 801, "step_time": 34.87267726301798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1617954755201936, "epoch": 0.00802, "grad_norm": 0.16540715098381042, "kl": 0.5990455448627472, "learning_rate": 7.99978276223436e-06, "loss": -0.0693, "step": 802, "step_time": 18.38577676092973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.40625, "completions/mean_terminated_length": 4.40625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5736876763403416, "epoch": 0.00803, "frac_reward_zero_std": 0.25, "grad_norm": 0.06076338142156601, "kl": 0.520407497882843, "learning_rate": 7.99978219467056e-06, "loss": -0.0565, "num_tokens": 19895360.0, "reward": 1.1407054662704468, "reward_std": 0.6090307235717773, "rewards/rollout_reward_func/mean": 1.1407054662704468, "rewards/rollout_reward_func/std": 0.6090307235717773, "sampling/importance_sampling_ratio/max": 1.5413535833358765, "sampling/importance_sampling_ratio/mean": 0.9199957251548767, "sampling/importance_sampling_ratio/min": 0.018702562898397446, "sampling/sampling_logp_difference/max": 1.7475943565368652, "sampling/sampling_logp_difference/mean": 0.12792785465717316, "step": 803, "step_time": 19.676443189004203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.5815265588462353, "epoch": 0.00804, "grad_norm": 0.06031208112835884, "kl": 0.5307645685970783, "learning_rate": 7.999781626366335e-06, "loss": -0.0566, "step": 804, "step_time": 10.90362955495948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.995345801115036, "epoch": 0.00805, "frac_reward_zero_std": 0.0, "grad_norm": 0.3879287838935852, "kl": 0.42586332000792027, "learning_rate": 7.999781057321683e-06, "loss": -0.0396, "num_tokens": 19951110.0, "reward": 0.4534800350666046, "reward_std": 0.8574269413948059, "rewards/rollout_reward_func/mean": 0.4534800350666046, "rewards/rollout_reward_func/std": 0.8574268221855164, "sampling/importance_sampling_ratio/max": 2.04734206199646, "sampling/importance_sampling_ratio/mean": 0.7525206804275513, "sampling/importance_sampling_ratio/min": 3.8124656498439435e-07, "sampling/sampling_logp_difference/max": 2.1497018337249756, "sampling/sampling_logp_difference/mean": 0.4105643630027771, "step": 805, "step_time": 25.378989979973994 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.027113970601931214, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03279578895308077, "entropy": 2.0297203958034515, "epoch": 0.00806, "grad_norm": 0.0657939463853836, "kl": 0.4047283660620451, "learning_rate": 7.999780487536607e-06, "loss": -0.0407, "step": 806, "step_time": 14.258046461065533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7952554784715176, "epoch": 0.00807, "frac_reward_zero_std": 0.0, "grad_norm": 0.050455302000045776, "kl": 0.45204232074320316, "learning_rate": 7.999779917011105e-06, "loss": -0.0808, "num_tokens": 20007192.0, "reward": 0.25239092111587524, "reward_std": 0.95548015832901, "rewards/rollout_reward_func/mean": 0.25239092111587524, "rewards/rollout_reward_func/std": 0.95548015832901, "sampling/importance_sampling_ratio/max": 1.130635380744934, "sampling/importance_sampling_ratio/mean": 0.6415069699287415, "sampling/importance_sampling_ratio/min": 4.90517049911432e-06, "sampling/sampling_logp_difference/max": 1.6771996021270752, "sampling/sampling_logp_difference/mean": 0.3670101761817932, "step": 807, "step_time": 28.687560764956288 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 1.801638849079609, "epoch": 0.00808, "grad_norm": 0.05242055654525757, "kl": 0.4191767517477274, "learning_rate": 7.999779345745178e-06, "loss": -0.0808, "step": 808, "step_time": 13.976780255004996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 5.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.08958837389946, "epoch": 0.00809, "frac_reward_zero_std": 0.25, "grad_norm": 0.18229731917381287, "kl": 0.19766751304268837, "learning_rate": 7.999778773738826e-06, "loss": -0.0392, "num_tokens": 20058552.0, "reward": 0.4185529351234436, "reward_std": 0.9427331686019897, "rewards/rollout_reward_func/mean": 0.4185529351234436, "rewards/rollout_reward_func/std": 0.9427331686019897, "sampling/importance_sampling_ratio/max": 1.1687653064727783, "sampling/importance_sampling_ratio/mean": 0.5670018196105957, "sampling/importance_sampling_ratio/min": 4.440425527718617e-06, "sampling/sampling_logp_difference/max": 2.034059524536133, "sampling/sampling_logp_difference/mean": 0.3767085075378418, "step": 809, "step_time": 25.966137534938753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0847892463207245, "epoch": 0.0081, "grad_norm": 0.18503211438655853, "kl": 0.1946799186989665, "learning_rate": 7.999778200992049e-06, "loss": -0.0397, "step": 810, "step_time": 12.753004388971021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.029764339327812, "epoch": 0.00811, "frac_reward_zero_std": 0.0, "grad_norm": 0.13091985881328583, "kl": 0.7727513127028942, "learning_rate": 7.999777627504848e-06, "loss": -0.0815, "num_tokens": 20107828.0, "reward": 0.41697317361831665, "reward_std": 0.9030691385269165, "rewards/rollout_reward_func/mean": 0.41697317361831665, "rewards/rollout_reward_func/std": 0.9030691385269165, "sampling/importance_sampling_ratio/max": 1.6953638792037964, "sampling/importance_sampling_ratio/mean": 0.7096346616744995, "sampling/importance_sampling_ratio/min": 6.283676157181617e-08, "sampling/sampling_logp_difference/max": 1.8993719816207886, "sampling/sampling_logp_difference/mean": 0.3795175552368164, "step": 811, "step_time": 26.284204788971692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0218519270420074, "epoch": 0.00812, "grad_norm": 0.12289177626371384, "kl": 0.7292952192947268, "learning_rate": 7.999777053277222e-06, "loss": -0.082, "step": 812, "step_time": 13.976849956990918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.09375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.24140402674675, "epoch": 0.00813, "frac_reward_zero_std": 0.0, "grad_norm": 0.48286062479019165, "kl": 0.21758331079035997, "learning_rate": 7.99977647830917e-06, "loss": -0.0749, "num_tokens": 20170155.0, "reward": 0.36089539527893066, "reward_std": 0.8233065009117126, "rewards/rollout_reward_func/mean": 0.36089539527893066, "rewards/rollout_reward_func/std": 0.8233065009117126, "sampling/importance_sampling_ratio/max": 1.4276394844055176, "sampling/importance_sampling_ratio/mean": 0.6177198886871338, "sampling/importance_sampling_ratio/min": 5.437645995698404e-07, "sampling/sampling_logp_difference/max": 2.487834930419922, "sampling/sampling_logp_difference/mean": 0.3840459883213043, "step": 813, "step_time": 33.01422606006963 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.2136217653751373, "epoch": 0.00814, "grad_norm": 0.1309489607810974, "kl": 0.21605321625247598, "learning_rate": 7.999775902600696e-06, "loss": -0.0785, "step": 814, "step_time": 16.370697889011353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 5.392857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.301625069230795, "epoch": 0.00815, "frac_reward_zero_std": 0.25, "grad_norm": 0.16096192598342896, "kl": 0.35030491556972265, "learning_rate": 7.999775326151796e-06, "loss": -0.0807, "num_tokens": 20222488.0, "reward": 0.44029417634010315, "reward_std": 0.835048496723175, "rewards/rollout_reward_func/mean": 0.44029417634010315, "rewards/rollout_reward_func/std": 0.835048496723175, "sampling/importance_sampling_ratio/max": 1.0967676639556885, "sampling/importance_sampling_ratio/mean": 0.7393757700920105, "sampling/importance_sampling_ratio/min": 0.0006169849075376987, "sampling/sampling_logp_difference/max": 1.6229429244995117, "sampling/sampling_logp_difference/mean": 0.22317492961883545, "step": 815, "step_time": 29.007460484019248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2874896191060543, "epoch": 0.00816, "grad_norm": 0.06315891444683075, "kl": 0.3742764741182327, "learning_rate": 7.999774748962473e-06, "loss": -0.0809, "step": 816, "step_time": 14.346027892956045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.46875, "completions/mean_terminated_length": 4.096774101257324, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3855323111638427, "epoch": 0.00817, "frac_reward_zero_std": 0.5, "grad_norm": 0.051183924078941345, "kl": 0.2723583094775677, "learning_rate": 7.999774171032726e-06, "loss": -0.0247, "num_tokens": 20264383.0, "reward": 1.1591627597808838, "reward_std": 0.6740293502807617, "rewards/rollout_reward_func/mean": 1.1591627597808838, "rewards/rollout_reward_func/std": 0.6740293502807617, "sampling/importance_sampling_ratio/max": 1.1498042345046997, "sampling/importance_sampling_ratio/mean": 0.9966602325439453, "sampling/importance_sampling_ratio/min": 0.002653293777257204, "sampling/sampling_logp_difference/max": 1.145995855331421, "sampling/sampling_logp_difference/mean": 0.06943199038505554, "step": 817, "step_time": 22.623656206036685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37392353266477585, "epoch": 0.00818, "grad_norm": 0.051286302506923676, "kl": 0.27318621799349785, "learning_rate": 7.999773592362555e-06, "loss": -0.0248, "step": 818, "step_time": 12.628120390989352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.137930870056152, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7694396954029799, "epoch": 0.00819, "frac_reward_zero_std": 0.25, "grad_norm": 0.138382226228714, "kl": 0.4716990627348423, "learning_rate": 7.99977301295196e-06, "loss": -0.0411, "num_tokens": 20318146.0, "reward": 0.7657714486122131, "reward_std": 0.8747114539146423, "rewards/rollout_reward_func/mean": 0.7657714486122131, "rewards/rollout_reward_func/std": 0.8747114539146423, "sampling/importance_sampling_ratio/max": 1.1577906608581543, "sampling/importance_sampling_ratio/mean": 0.8394868969917297, "sampling/importance_sampling_ratio/min": 3.356922388775274e-05, "sampling/sampling_logp_difference/max": 2.6095876693725586, "sampling/sampling_logp_difference/mean": 0.19397030770778656, "step": 819, "step_time": 30.849290270009078 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 0.7644951157271862, "epoch": 0.0082, "grad_norm": 0.10063759237527847, "kl": 0.4579758420586586, "learning_rate": 7.999772432800943e-06, "loss": -0.0413, "step": 820, "step_time": 16.260302346025128 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.239999771118164, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5951785910874605, "epoch": 0.00821, "frac_reward_zero_std": 0.25, "grad_norm": 0.12883815169334412, "kl": 0.22519602440297604, "learning_rate": 7.999771851909502e-06, "loss": -0.0327, "num_tokens": 20368710.0, "reward": 0.4567839503288269, "reward_std": 0.9343938827514648, "rewards/rollout_reward_func/mean": 0.4567839503288269, "rewards/rollout_reward_func/std": 0.9343939423561096, "sampling/importance_sampling_ratio/max": 1.1832448244094849, "sampling/importance_sampling_ratio/mean": 0.7472530007362366, "sampling/importance_sampling_ratio/min": 4.956069119543827e-07, "sampling/sampling_logp_difference/max": 2.1453146934509277, "sampling/sampling_logp_difference/mean": 0.3179479241371155, "step": 821, "step_time": 30.063400891056517 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.5775617808103561, "epoch": 0.00822, "grad_norm": 0.10623496025800705, "kl": 0.22840804047882557, "learning_rate": 7.999771270277637e-06, "loss": -0.0333, "step": 822, "step_time": 15.314717214001575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2442150563001633, "epoch": 0.00823, "frac_reward_zero_std": 0.0, "grad_norm": 0.03273864462971687, "kl": 0.38018242083489895, "learning_rate": 7.99977068790535e-06, "loss": -0.0633, "num_tokens": 20416826.0, "reward": 0.5145161747932434, "reward_std": 0.8897562623023987, "rewards/rollout_reward_func/mean": 0.5145161747932434, "rewards/rollout_reward_func/std": 0.8897562026977539, "sampling/importance_sampling_ratio/max": 1.688960075378418, "sampling/importance_sampling_ratio/mean": 0.8787685036659241, "sampling/importance_sampling_ratio/min": 3.128251080397604e-08, "sampling/sampling_logp_difference/max": 1.7530713081359863, "sampling/sampling_logp_difference/mean": 0.27073389291763306, "step": 823, "step_time": 24.66706818001694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2358866184949875, "epoch": 0.00824, "grad_norm": 0.031098222360014915, "kl": 0.3881107736378908, "learning_rate": 7.99977010479264e-06, "loss": -0.0632, "step": 824, "step_time": 11.598611727065872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 5.482758522033691, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4970167726278305, "epoch": 0.00825, "frac_reward_zero_std": 0.0, "grad_norm": 0.08031006902456284, "kl": 0.598029974848032, "learning_rate": 7.999769520939507e-06, "loss": -0.0702, "num_tokens": 20467509.0, "reward": 0.7298738360404968, "reward_std": 0.8976889848709106, "rewards/rollout_reward_func/mean": 0.7298738360404968, "rewards/rollout_reward_func/std": 0.8976889848709106, "sampling/importance_sampling_ratio/max": 1.1519875526428223, "sampling/importance_sampling_ratio/mean": 0.6992131471633911, "sampling/importance_sampling_ratio/min": 3.104610004811548e-05, "sampling/sampling_logp_difference/max": 2.4675400257110596, "sampling/sampling_logp_difference/mean": 0.28876012563705444, "step": 825, "step_time": 30.28137503998005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4876534957438707, "epoch": 0.00826, "grad_norm": 0.0693548396229744, "kl": 0.6368104238063097, "learning_rate": 7.999768936345951e-06, "loss": -0.0704, "step": 826, "step_time": 15.826254818937741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.65625, "completions/mean_terminated_length": 4.586206912994385, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0427764616906643, "epoch": 0.00827, "frac_reward_zero_std": 0.25, "grad_norm": 0.07521101087331772, "kl": 0.31870298460125923, "learning_rate": 7.999768351011975e-06, "loss": -0.0687, "num_tokens": 20512380.0, "reward": 1.0783286094665527, "reward_std": 0.7701178193092346, "rewards/rollout_reward_func/mean": 1.0783286094665527, "rewards/rollout_reward_func/std": 0.7701178193092346, "sampling/importance_sampling_ratio/max": 1.186928391456604, "sampling/importance_sampling_ratio/mean": 0.8638258576393127, "sampling/importance_sampling_ratio/min": 0.0004043517983518541, "sampling/sampling_logp_difference/max": 1.7336958646774292, "sampling/sampling_logp_difference/mean": 0.21491655707359314, "step": 827, "step_time": 23.484323228971334 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.0430927388370037, "epoch": 0.00828, "grad_norm": 0.05476827919483185, "kl": 0.33857614174485207, "learning_rate": 7.999767764937574e-06, "loss": -0.0688, "step": 828, "step_time": 12.4862951139512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.592592716217041, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2100645620375872, "epoch": 0.00829, "frac_reward_zero_std": 0.0, "grad_norm": 0.11750947684049606, "kl": 0.506237555295229, "learning_rate": 7.999767178122752e-06, "loss": -0.0643, "num_tokens": 20571610.0, "reward": 0.5579118728637695, "reward_std": 0.8320042490959167, "rewards/rollout_reward_func/mean": 0.5579118728637695, "rewards/rollout_reward_func/std": 0.8320042490959167, "sampling/importance_sampling_ratio/max": 1.2483234405517578, "sampling/importance_sampling_ratio/mean": 0.7938355207443237, "sampling/importance_sampling_ratio/min": 6.104822205088567e-06, "sampling/sampling_logp_difference/max": 2.061645984649658, "sampling/sampling_logp_difference/mean": 0.3083243668079376, "step": 829, "step_time": 26.339422983059194 }, { "clip_ratio/high_max": 0.026041666977107525, "clip_ratio/high_mean": 0.013020833488553762, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 1.2136817825958133, "epoch": 0.0083, "grad_norm": 0.09436893463134766, "kl": 0.5046701226383448, "learning_rate": 7.999766590567506e-06, "loss": -0.0648, "step": 830, "step_time": 14.237557591986842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0853565577417612, "epoch": 0.00831, "frac_reward_zero_std": 0.25, "grad_norm": 0.205308198928833, "kl": 0.25015003979206085, "learning_rate": 7.99976600227184e-06, "loss": -0.0595, "num_tokens": 20624810.0, "reward": 0.9570725560188293, "reward_std": 0.8097566962242126, "rewards/rollout_reward_func/mean": 0.9570725560188293, "rewards/rollout_reward_func/std": 0.8097566962242126, "sampling/importance_sampling_ratio/max": 1.5858774185180664, "sampling/importance_sampling_ratio/mean": 0.9112136960029602, "sampling/importance_sampling_ratio/min": 1.9786800748988753e-06, "sampling/sampling_logp_difference/max": 2.13226056098938, "sampling/sampling_logp_difference/mean": 0.24287457764148712, "step": 831, "step_time": 24.628414777951548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.0948987193405628, "epoch": 0.00832, "grad_norm": 0.1304110735654831, "kl": 0.25053925812244415, "learning_rate": 7.999765413235754e-06, "loss": -0.0598, "step": 832, "step_time": 13.144531578058377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.53125, "completions/mean_terminated_length": 4.161290168762207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.615722892805934, "epoch": 0.00833, "frac_reward_zero_std": 0.5, "grad_norm": 0.0521894209086895, "kl": 0.7946826070547104, "learning_rate": 7.999764823459244e-06, "loss": -0.0455, "num_tokens": 20662622.0, "reward": 1.2337806224822998, "reward_std": 0.6958711743354797, "rewards/rollout_reward_func/mean": 1.2337806224822998, "rewards/rollout_reward_func/std": 0.6958711743354797, "sampling/importance_sampling_ratio/max": 1.0774441957473755, "sampling/importance_sampling_ratio/mean": 0.9203377366065979, "sampling/importance_sampling_ratio/min": 0.00016479774785693735, "sampling/sampling_logp_difference/max": 1.9064692258834839, "sampling/sampling_logp_difference/mean": 0.13870461285114288, "step": 833, "step_time": 17.893420096021146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.622146038338542, "epoch": 0.00834, "grad_norm": 0.04392947256565094, "kl": 0.8157474994659424, "learning_rate": 7.999764232942311e-06, "loss": -0.0455, "step": 834, "step_time": 9.608146287995623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.8275861740112305, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4149269172921777, "epoch": 0.00835, "frac_reward_zero_std": 0.0, "grad_norm": 0.07117465883493423, "kl": 0.7088750638067722, "learning_rate": 7.999763641684959e-06, "loss": -0.0879, "num_tokens": 20718886.0, "reward": 0.6057935357093811, "reward_std": 0.7076815962791443, "rewards/rollout_reward_func/mean": 0.6057935357093811, "rewards/rollout_reward_func/std": 0.7076815366744995, "sampling/importance_sampling_ratio/max": 1.3271321058273315, "sampling/importance_sampling_ratio/mean": 0.8330211043357849, "sampling/importance_sampling_ratio/min": 9.85707626632859e-10, "sampling/sampling_logp_difference/max": 2.1388392448425293, "sampling/sampling_logp_difference/mean": 0.33100855350494385, "step": 835, "step_time": 26.50475855602417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4153191018849611, "epoch": 0.00836, "grad_norm": 0.06396344304084778, "kl": 0.6401225179433823, "learning_rate": 7.999763049687186e-06, "loss": -0.0881, "step": 836, "step_time": 14.006863778020488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5139697939157486, "epoch": 0.00837, "frac_reward_zero_std": 0.0, "grad_norm": 0.06297363340854645, "kl": 0.7503237184137106, "learning_rate": 7.999762456948991e-06, "loss": -0.0745, "num_tokens": 20774664.0, "reward": 0.48653239011764526, "reward_std": 0.9128743410110474, "rewards/rollout_reward_func/mean": 0.48653239011764526, "rewards/rollout_reward_func/std": 0.9128743410110474, "sampling/importance_sampling_ratio/max": 1.2939238548278809, "sampling/importance_sampling_ratio/mean": 0.7692440748214722, "sampling/importance_sampling_ratio/min": 2.0091595160920406e-06, "sampling/sampling_logp_difference/max": 2.4184985160827637, "sampling/sampling_logp_difference/mean": 0.29768359661102295, "step": 837, "step_time": 36.20877547899727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5216247886419296, "epoch": 0.00838, "grad_norm": 0.05986717343330383, "kl": 0.7153555098921061, "learning_rate": 7.999761863470376e-06, "loss": -0.0747, "step": 838, "step_time": 18.587502786016557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 4.559999942779541, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6418002247810364, "epoch": 0.00839, "frac_reward_zero_std": 0.0, "grad_norm": 0.19178500771522522, "kl": 0.30546669103205204, "learning_rate": 7.999761269251341e-06, "loss": -0.08, "num_tokens": 20837764.0, "reward": 0.5097397565841675, "reward_std": 0.8268043994903564, "rewards/rollout_reward_func/mean": 0.5097397565841675, "rewards/rollout_reward_func/std": 0.8268043994903564, "sampling/importance_sampling_ratio/max": 1.6005994081497192, "sampling/importance_sampling_ratio/mean": 0.7866666316986084, "sampling/importance_sampling_ratio/min": 6.8550962168956175e-06, "sampling/sampling_logp_difference/max": 1.8873310089111328, "sampling/sampling_logp_difference/mean": 0.28888553380966187, "step": 839, "step_time": 29.38404084200738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.640177644789219, "epoch": 0.0084, "grad_norm": 0.1777246594429016, "kl": 0.2991548730060458, "learning_rate": 7.999760674291884e-06, "loss": -0.0813, "step": 840, "step_time": 14.225803009001538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 5.035714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4194196639582515, "epoch": 0.00841, "frac_reward_zero_std": 0.0, "grad_norm": 0.07551915943622589, "kl": 0.26557149924337864, "learning_rate": 7.999760078592008e-06, "loss": -0.1048, "num_tokens": 20887247.0, "reward": 0.5611945986747742, "reward_std": 0.9948896169662476, "rewards/rollout_reward_func/mean": 0.5611945986747742, "rewards/rollout_reward_func/std": 0.9948896169662476, "sampling/importance_sampling_ratio/max": 1.1686192750930786, "sampling/importance_sampling_ratio/mean": 0.7813953161239624, "sampling/importance_sampling_ratio/min": 8.91057716216892e-05, "sampling/sampling_logp_difference/max": 1.7364808320999146, "sampling/sampling_logp_difference/mean": 0.2525852918624878, "step": 841, "step_time": 27.259851189970504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4211697159335017, "epoch": 0.00842, "grad_norm": 0.07442601025104523, "kl": 0.27308090776205063, "learning_rate": 7.99975948215171e-06, "loss": -0.1049, "step": 842, "step_time": 13.785173473035684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.733333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1300574820488691, "epoch": 0.00843, "frac_reward_zero_std": 0.25, "grad_norm": 0.03171219676733017, "kl": 0.5334328711032867, "learning_rate": 7.999758884970993e-06, "loss": -0.07, "num_tokens": 20939262.0, "reward": 0.8784995675086975, "reward_std": 0.7805498242378235, "rewards/rollout_reward_func/mean": 0.8784995675086975, "rewards/rollout_reward_func/std": 0.7805498242378235, "sampling/importance_sampling_ratio/max": 1.299763560295105, "sampling/importance_sampling_ratio/mean": 0.8741137385368347, "sampling/importance_sampling_ratio/min": 0.0001278651034226641, "sampling/sampling_logp_difference/max": 1.716679573059082, "sampling/sampling_logp_difference/mean": 0.23872658610343933, "step": 843, "step_time": 28.94678925300832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1240624282509089, "epoch": 0.00844, "grad_norm": 0.02873319946229458, "kl": 0.5178501270711422, "learning_rate": 7.999758287049855e-06, "loss": -0.0702, "step": 844, "step_time": 16.08760044904193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 5.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7932228781282902, "epoch": 0.00845, "frac_reward_zero_std": 0.0, "grad_norm": 0.16125936806201935, "kl": 0.342668604105711, "learning_rate": 7.9997576883883e-06, "loss": -0.0725, "num_tokens": 20992511.0, "reward": 0.3338008224964142, "reward_std": 0.7440353631973267, "rewards/rollout_reward_func/mean": 0.3338008224964142, "rewards/rollout_reward_func/std": 0.7440354228019714, "sampling/importance_sampling_ratio/max": 1.1152280569076538, "sampling/importance_sampling_ratio/mean": 0.7118090391159058, "sampling/importance_sampling_ratio/min": 4.959669297477376e-08, "sampling/sampling_logp_difference/max": 1.9551517963409424, "sampling/sampling_logp_difference/mean": 0.3658042550086975, "step": 845, "step_time": 23.54832447503577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.7932862974703312, "epoch": 0.00846, "grad_norm": 0.05980284512042999, "kl": 0.33937768265604973, "learning_rate": 7.999757088986322e-06, "loss": -0.0732, "step": 846, "step_time": 11.596535079006571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0698033915832639, "epoch": 0.00847, "frac_reward_zero_std": 0.0, "grad_norm": 0.01950639858841896, "kl": 0.5226735342293978, "learning_rate": 7.999756488843926e-06, "loss": -0.083, "num_tokens": 21043747.0, "reward": 1.1199183464050293, "reward_std": 0.7503204345703125, "rewards/rollout_reward_func/mean": 1.1199183464050293, "rewards/rollout_reward_func/std": 0.7503204345703125, "sampling/importance_sampling_ratio/max": 1.1118266582489014, "sampling/importance_sampling_ratio/mean": 0.8482118844985962, "sampling/importance_sampling_ratio/min": 0.00017766791279427707, "sampling/sampling_logp_difference/max": 2.0270442962646484, "sampling/sampling_logp_difference/mean": 0.2226937711238861, "step": 847, "step_time": 23.79996309793205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.066784628201276, "epoch": 0.00848, "grad_norm": 0.019643861800432205, "kl": 0.5339778270572424, "learning_rate": 7.99975588796111e-06, "loss": -0.083, "step": 848, "step_time": 12.374360406975029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.2068963050842285, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7156760152429342, "epoch": 0.00849, "frac_reward_zero_std": 0.25, "grad_norm": 0.17452310025691986, "kl": 0.23084136843681335, "learning_rate": 7.999755286337875e-06, "loss": -0.0614, "num_tokens": 21089231.0, "reward": 0.5944763422012329, "reward_std": 0.7735413312911987, "rewards/rollout_reward_func/mean": 0.5944763422012329, "rewards/rollout_reward_func/std": 0.7735413312911987, "sampling/importance_sampling_ratio/max": 1.126417636871338, "sampling/importance_sampling_ratio/mean": 0.8783679008483887, "sampling/importance_sampling_ratio/min": 0.0028739157132804394, "sampling/sampling_logp_difference/max": 1.4196975231170654, "sampling/sampling_logp_difference/mean": 0.13622096180915833, "step": 849, "step_time": 23.443477847002214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.720147430896759, "epoch": 0.0085, "grad_norm": 0.04321613907814026, "kl": 0.23488591611385345, "learning_rate": 7.999754683974221e-06, "loss": -0.062, "step": 850, "step_time": 12.567270723026013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 5.043478488922119, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1879647504538298, "epoch": 0.00851, "frac_reward_zero_std": 0.0, "grad_norm": 0.05991362780332565, "kl": 0.4301621448248625, "learning_rate": 7.999754080870149e-06, "loss": -0.069, "num_tokens": 21143874.0, "reward": -0.010779650881886482, "reward_std": 0.6423336863517761, "rewards/rollout_reward_func/mean": -0.010779650881886482, "rewards/rollout_reward_func/std": 0.6423336863517761, "sampling/importance_sampling_ratio/max": 1.0931940078735352, "sampling/importance_sampling_ratio/mean": 0.662189245223999, "sampling/importance_sampling_ratio/min": 7.037773031015604e-08, "sampling/sampling_logp_difference/max": 2.523221969604492, "sampling/sampling_logp_difference/mean": 0.433884859085083, "step": 851, "step_time": 31.244606277003186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.185896795243025, "epoch": 0.00852, "grad_norm": 0.06051322817802429, "kl": 0.43479403853416443, "learning_rate": 7.999753477025658e-06, "loss": -0.069, "step": 852, "step_time": 13.747780212957878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.620689868927002, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0521988216787577, "epoch": 0.00853, "frac_reward_zero_std": 0.0, "grad_norm": 0.13431985676288605, "kl": 0.6329758297652006, "learning_rate": 7.999752872440747e-06, "loss": -0.0905, "num_tokens": 21201679.0, "reward": 0.8868067264556885, "reward_std": 0.7471116185188293, "rewards/rollout_reward_func/mean": 0.8868067264556885, "rewards/rollout_reward_func/std": 0.7471116781234741, "sampling/importance_sampling_ratio/max": 1.1515934467315674, "sampling/importance_sampling_ratio/mean": 0.7800489664077759, "sampling/importance_sampling_ratio/min": 3.458845987402981e-10, "sampling/sampling_logp_difference/max": 2.2741336822509766, "sampling/sampling_logp_difference/mean": 0.30040353536605835, "step": 853, "step_time": 26.283543179073604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.05447443947196, "epoch": 0.00854, "grad_norm": 0.044421665370464325, "kl": 0.6416714787483215, "learning_rate": 7.999752267115419e-06, "loss": -0.0909, "step": 854, "step_time": 14.14793380300398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 4.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7989680729806423, "epoch": 0.00855, "frac_reward_zero_std": 0.0, "grad_norm": 0.07228460907936096, "kl": 0.4888775232248008, "learning_rate": 7.99975166104967e-06, "loss": -0.0683, "num_tokens": 21263540.0, "reward": 0.2415897250175476, "reward_std": 0.8940035104751587, "rewards/rollout_reward_func/mean": 0.2415897250175476, "rewards/rollout_reward_func/std": 0.8940035700798035, "sampling/importance_sampling_ratio/max": 1.2520071268081665, "sampling/importance_sampling_ratio/mean": 0.6216174960136414, "sampling/importance_sampling_ratio/min": 3.8447244151029736e-05, "sampling/sampling_logp_difference/max": 1.6548786163330078, "sampling/sampling_logp_difference/mean": 0.31676894426345825, "step": 855, "step_time": 34.59327066902188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8016827255487442, "epoch": 0.00856, "grad_norm": 0.07790413498878479, "kl": 0.47616976499557495, "learning_rate": 7.999751054243507e-06, "loss": -0.0685, "step": 856, "step_time": 16.597173492977163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 5.310344696044922, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.365335363894701, "epoch": 0.00857, "frac_reward_zero_std": 0.25, "grad_norm": 0.09895344078540802, "kl": 0.4235486835241318, "learning_rate": 7.999750446696924e-06, "loss": -0.0627, "num_tokens": 21309144.0, "reward": 0.613542914390564, "reward_std": 0.8059912919998169, "rewards/rollout_reward_func/mean": 0.613542914390564, "rewards/rollout_reward_func/std": 0.8059913516044617, "sampling/importance_sampling_ratio/max": 1.0589599609375, "sampling/importance_sampling_ratio/mean": 0.7579732537269592, "sampling/importance_sampling_ratio/min": 2.237887201772537e-06, "sampling/sampling_logp_difference/max": 2.4931745529174805, "sampling/sampling_logp_difference/mean": 0.27100566029548645, "step": 857, "step_time": 28.58607232698705 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.3663691887632012, "epoch": 0.00858, "grad_norm": 0.06694819033145905, "kl": 0.3936092481017113, "learning_rate": 7.999749838409923e-06, "loss": -0.063, "step": 858, "step_time": 14.774939807015471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5320466300472617, "epoch": 0.00859, "frac_reward_zero_std": 0.25, "grad_norm": 0.08274704217910767, "kl": 0.7656672243028879, "learning_rate": 7.999749229382504e-06, "loss": -0.0549, "num_tokens": 21343646.0, "reward": 1.2770084142684937, "reward_std": 0.6141352653503418, "rewards/rollout_reward_func/mean": 1.2770084142684937, "rewards/rollout_reward_func/std": 0.6141352653503418, "sampling/importance_sampling_ratio/max": 1.0671268701553345, "sampling/importance_sampling_ratio/mean": 0.9361504912376404, "sampling/importance_sampling_ratio/min": 4.934859134664293e-07, "sampling/sampling_logp_difference/max": 1.8671551942825317, "sampling/sampling_logp_difference/mean": 0.16056272387504578, "step": 859, "step_time": 19.260082680993946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5337666170671582, "epoch": 0.0086, "grad_norm": 0.07075296342372894, "kl": 0.7095090858638287, "learning_rate": 7.999748619614667e-06, "loss": -0.0551, "step": 860, "step_time": 10.835045606974745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.2068963050842285, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7694349102675915, "epoch": 0.00861, "frac_reward_zero_std": 0.5, "grad_norm": 0.027999427169561386, "kl": 0.23552073817700148, "learning_rate": 7.999748009106413e-06, "loss": -0.0436, "num_tokens": 21382405.0, "reward": 0.6826968789100647, "reward_std": 0.9106844067573547, "rewards/rollout_reward_func/mean": 0.6826968789100647, "rewards/rollout_reward_func/std": 0.9106844067573547, "sampling/importance_sampling_ratio/max": 1.360127568244934, "sampling/importance_sampling_ratio/mean": 0.8695664405822754, "sampling/importance_sampling_ratio/min": 0.0002182511962018907, "sampling/sampling_logp_difference/max": 1.6130821704864502, "sampling/sampling_logp_difference/mean": 0.15305599570274353, "step": 861, "step_time": 23.288704812002834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7690400080755353, "epoch": 0.00862, "grad_norm": 0.03409843146800995, "kl": 0.23388342931866646, "learning_rate": 7.999747397857743e-06, "loss": -0.0435, "step": 862, "step_time": 11.64939851796953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2021759487688541, "epoch": 0.00863, "frac_reward_zero_std": 0.0, "grad_norm": 0.1423521339893341, "kl": 0.3499190751463175, "learning_rate": 7.999746785868654e-06, "loss": -0.0795, "num_tokens": 21437450.0, "reward": 0.9002432823181152, "reward_std": 0.8082186579704285, "rewards/rollout_reward_func/mean": 0.9002432823181152, "rewards/rollout_reward_func/std": 0.8082185983657837, "sampling/importance_sampling_ratio/max": 1.165474772453308, "sampling/importance_sampling_ratio/mean": 0.8181763291358948, "sampling/importance_sampling_ratio/min": 2.3202688381740444e-10, "sampling/sampling_logp_difference/max": 2.6803042888641357, "sampling/sampling_logp_difference/mean": 0.2946839928627014, "step": 863, "step_time": 29.19947068800684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2148121623322368, "epoch": 0.00864, "grad_norm": 0.1378137469291687, "kl": 0.32622237130999565, "learning_rate": 7.999746173139148e-06, "loss": -0.0801, "step": 864, "step_time": 15.544131616014056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1575123770162463, "epoch": 0.00865, "frac_reward_zero_std": 0.5, "grad_norm": 0.03213086724281311, "kl": 0.5505830645561218, "learning_rate": 7.999745559669226e-06, "loss": -0.0529, "num_tokens": 21485015.0, "reward": 1.0082041025161743, "reward_std": 0.778749942779541, "rewards/rollout_reward_func/mean": 1.0082041025161743, "rewards/rollout_reward_func/std": 0.778749942779541, "sampling/importance_sampling_ratio/max": 1.1014479398727417, "sampling/importance_sampling_ratio/mean": 0.7882281541824341, "sampling/importance_sampling_ratio/min": 0.00015526721836067736, "sampling/sampling_logp_difference/max": 1.7039744853973389, "sampling/sampling_logp_difference/mean": 0.2275274693965912, "step": 865, "step_time": 25.867678701964905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1578052127733827, "epoch": 0.00866, "grad_norm": 0.032751552760601044, "kl": 0.5582005437463522, "learning_rate": 7.999744945458888e-06, "loss": -0.0529, "step": 866, "step_time": 13.90858610192663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.689655303955078, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0398760670796037, "epoch": 0.00867, "frac_reward_zero_std": 0.0, "grad_norm": 0.1034688726067543, "kl": 0.20137440226972103, "learning_rate": 7.999744330508132e-06, "loss": -0.0884, "num_tokens": 21536606.0, "reward": 0.8120173215866089, "reward_std": 0.7730097770690918, "rewards/rollout_reward_func/mean": 0.8120173215866089, "rewards/rollout_reward_func/std": 0.7730097770690918, "sampling/importance_sampling_ratio/max": 1.6152207851409912, "sampling/importance_sampling_ratio/mean": 0.881888210773468, "sampling/importance_sampling_ratio/min": 0.00010525863035582006, "sampling/sampling_logp_difference/max": 1.5967793464660645, "sampling/sampling_logp_difference/mean": 0.19911181926727295, "step": 867, "step_time": 25.508614316000603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.043171581812203, "epoch": 0.00868, "grad_norm": 0.09053878486156464, "kl": 0.20227015390992165, "learning_rate": 7.999743714816959e-06, "loss": -0.0885, "step": 868, "step_time": 12.7849612699938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8873477289453149, "epoch": 0.00869, "frac_reward_zero_std": 0.0, "grad_norm": 0.2162921130657196, "kl": 0.3141673281788826, "learning_rate": 7.999743098385372e-06, "loss": -0.0955, "num_tokens": 21591610.0, "reward": 0.4936842918395996, "reward_std": 0.945751428604126, "rewards/rollout_reward_func/mean": 0.4936842918395996, "rewards/rollout_reward_func/std": 0.945751428604126, "sampling/importance_sampling_ratio/max": 2.5984320640563965, "sampling/importance_sampling_ratio/mean": 0.7724018096923828, "sampling/importance_sampling_ratio/min": 8.851370836282513e-08, "sampling/sampling_logp_difference/max": 2.197659492492676, "sampling/sampling_logp_difference/mean": 0.3916291892528534, "step": 869, "step_time": 26.554382558038924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8856102842837572, "epoch": 0.0087, "grad_norm": 0.21225136518478394, "kl": 0.3125412091612816, "learning_rate": 7.999742481213369e-06, "loss": -0.0962, "step": 870, "step_time": 12.724029914970743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7179352529346943, "epoch": 0.00871, "frac_reward_zero_std": 0.25, "grad_norm": 0.11991642415523529, "kl": 0.3478613793849945, "learning_rate": 7.999741863300948e-06, "loss": -0.0455, "num_tokens": 21645591.0, "reward": 0.5438740253448486, "reward_std": 0.7674954533576965, "rewards/rollout_reward_func/mean": 0.5438740253448486, "rewards/rollout_reward_func/std": 0.7674954533576965, "sampling/importance_sampling_ratio/max": 1.1621769666671753, "sampling/importance_sampling_ratio/mean": 0.882532000541687, "sampling/importance_sampling_ratio/min": 0.0024866245221346617, "sampling/sampling_logp_difference/max": 1.6569883823394775, "sampling/sampling_logp_difference/mean": 0.11627795547246933, "step": 871, "step_time": 27.637820418021875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.7327524982392788, "epoch": 0.00872, "grad_norm": 0.10480687022209167, "kl": 0.3485962487757206, "learning_rate": 7.999741244648114e-06, "loss": -0.0459, "step": 872, "step_time": 15.36501501800376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 4.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5653795395046473, "epoch": 0.00873, "frac_reward_zero_std": 0.0, "grad_norm": 0.09587699174880981, "kl": 0.47795652505010366, "learning_rate": 7.999740625254861e-06, "loss": -0.071, "num_tokens": 21699475.0, "reward": 0.5063357949256897, "reward_std": 0.9617334008216858, "rewards/rollout_reward_func/mean": 0.5063357949256897, "rewards/rollout_reward_func/std": 0.9617334008216858, "sampling/importance_sampling_ratio/max": 1.2747962474822998, "sampling/importance_sampling_ratio/mean": 0.7276739478111267, "sampling/importance_sampling_ratio/min": 1.0849239515664522e-05, "sampling/sampling_logp_difference/max": 1.821791410446167, "sampling/sampling_logp_difference/mean": 0.30574241280555725, "step": 873, "step_time": 26.445727787999203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.556608809158206, "epoch": 0.00874, "grad_norm": 0.10202331095933914, "kl": 0.5645038513466716, "learning_rate": 7.999740005121195e-06, "loss": -0.0708, "step": 874, "step_time": 12.742379792995052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5688754841685295, "epoch": 0.00875, "frac_reward_zero_std": 0.0, "grad_norm": 0.059775445610284805, "kl": 0.3840069631114602, "learning_rate": 7.999739384247114e-06, "loss": -0.0568, "num_tokens": 21748869.0, "reward": 0.36436861753463745, "reward_std": 0.9574419856071472, "rewards/rollout_reward_func/mean": 0.36436861753463745, "rewards/rollout_reward_func/std": 0.957442045211792, "sampling/importance_sampling_ratio/max": 1.1780452728271484, "sampling/importance_sampling_ratio/mean": 0.763798177242279, "sampling/importance_sampling_ratio/min": 1.0685428719625634e-07, "sampling/sampling_logp_difference/max": 1.9663769006729126, "sampling/sampling_logp_difference/mean": 0.2970349192619324, "step": 875, "step_time": 27.545596207957715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5706271678209305, "epoch": 0.00876, "grad_norm": 0.05795709788799286, "kl": 0.3829101035371423, "learning_rate": 7.999738762632617e-06, "loss": -0.0567, "step": 876, "step_time": 14.001627074962016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7479313649237156, "epoch": 0.00877, "frac_reward_zero_std": 0.5, "grad_norm": 0.030915338546037674, "kl": 0.46774757467210293, "learning_rate": 7.999738140277704e-06, "loss": -0.0413, "num_tokens": 21797061.0, "reward": 0.8207874298095703, "reward_std": 0.7145770788192749, "rewards/rollout_reward_func/mean": 0.8207874298095703, "rewards/rollout_reward_func/std": 0.7145770192146301, "sampling/importance_sampling_ratio/max": 1.2579838037490845, "sampling/importance_sampling_ratio/mean": 0.8884598016738892, "sampling/importance_sampling_ratio/min": 0.0010582072427496314, "sampling/sampling_logp_difference/max": 1.559689998626709, "sampling/sampling_logp_difference/mean": 0.14936037361621857, "step": 877, "step_time": 26.04279553203378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7504805969074368, "epoch": 0.00878, "grad_norm": 0.028350720182061195, "kl": 0.454400566406548, "learning_rate": 7.999737517182377e-06, "loss": -0.0413, "step": 878, "step_time": 14.006136981013697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.024369872175157, "epoch": 0.00879, "frac_reward_zero_std": 0.0, "grad_norm": 0.028130551800131798, "kl": 0.5161177460104227, "learning_rate": 7.999736893346636e-06, "loss": -0.0796, "num_tokens": 21845775.0, "reward": 0.4943339228630066, "reward_std": 0.7917827367782593, "rewards/rollout_reward_func/mean": 0.4943339228630066, "rewards/rollout_reward_func/std": 0.7917826771736145, "sampling/importance_sampling_ratio/max": 1.2439924478530884, "sampling/importance_sampling_ratio/mean": 0.8203704953193665, "sampling/importance_sampling_ratio/min": 0.00010361745808040723, "sampling/sampling_logp_difference/max": 2.1535544395446777, "sampling/sampling_logp_difference/mean": 0.24732449650764465, "step": 879, "step_time": 27.24711311297142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0303564239293337, "epoch": 0.0088, "grad_norm": 0.02673645131289959, "kl": 0.492619302123785, "learning_rate": 7.99973626877048e-06, "loss": -0.0796, "step": 880, "step_time": 14.278210786986165 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 4.5625, "completions/mean_terminated_length": 4.5625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6692850571125746, "epoch": 0.00881, "frac_reward_zero_std": 0.0, "grad_norm": 0.24301359057426453, "kl": 0.38325521163642406, "learning_rate": 7.99973564345391e-06, "loss": -0.0567, "num_tokens": 21884804.0, "reward": 0.5444132089614868, "reward_std": 0.6244533658027649, "rewards/rollout_reward_func/mean": 0.5444132089614868, "rewards/rollout_reward_func/std": 0.6244533061981201, "sampling/importance_sampling_ratio/max": 1.1149932146072388, "sampling/importance_sampling_ratio/mean": 0.8998132944107056, "sampling/importance_sampling_ratio/min": 4.503391028265469e-05, "sampling/sampling_logp_difference/max": 1.7243150472640991, "sampling/sampling_logp_difference/mean": 0.14674919843673706, "step": 881, "step_time": 17.404976910998812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.712042823433876, "epoch": 0.00882, "grad_norm": 0.1379757523536682, "kl": 0.37718312069773674, "learning_rate": 7.999735017396925e-06, "loss": -0.0577, "step": 882, "step_time": 10.50653374902322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.137930870056152, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9208228942006826, "epoch": 0.00883, "frac_reward_zero_std": 0.25, "grad_norm": 0.18364940583705902, "kl": 0.49642111640423536, "learning_rate": 7.999734390599525e-06, "loss": -0.0549, "num_tokens": 21932790.0, "reward": 0.7005401849746704, "reward_std": 0.740898847579956, "rewards/rollout_reward_func/mean": 0.7005401849746704, "rewards/rollout_reward_func/std": 0.740898847579956, "sampling/importance_sampling_ratio/max": 1.2046256065368652, "sampling/importance_sampling_ratio/mean": 0.8828502297401428, "sampling/importance_sampling_ratio/min": 0.00043416148400865495, "sampling/sampling_logp_difference/max": 2.482414722442627, "sampling/sampling_logp_difference/mean": 0.20081445574760437, "step": 883, "step_time": 25.81407062493963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.927555400878191, "epoch": 0.00884, "grad_norm": 0.16883400082588196, "kl": 0.4919731058180332, "learning_rate": 7.999733763061714e-06, "loss": -0.0549, "step": 884, "step_time": 14.133964733046014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.53125, "completions/mean_terminated_length": 4.53125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6353661576285958, "epoch": 0.00885, "frac_reward_zero_std": 0.0, "grad_norm": 0.07982888072729111, "kl": 0.7221965231001377, "learning_rate": 7.999733134783487e-06, "loss": -0.0513, "num_tokens": 21988278.0, "reward": 1.2090575695037842, "reward_std": 0.4604542553424835, "rewards/rollout_reward_func/mean": 1.2090575695037842, "rewards/rollout_reward_func/std": 0.4604542851448059, "sampling/importance_sampling_ratio/max": 1.2475954294204712, "sampling/importance_sampling_ratio/mean": 0.9613117575645447, "sampling/importance_sampling_ratio/min": 6.491100634775648e-07, "sampling/sampling_logp_difference/max": 1.5925971269607544, "sampling/sampling_logp_difference/mean": 0.1766582727432251, "step": 885, "step_time": 22.52665353898192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6330633889883757, "epoch": 0.00886, "grad_norm": 0.07107792794704437, "kl": 0.7230416312813759, "learning_rate": 7.999732505764847e-06, "loss": -0.0515, "step": 886, "step_time": 13.015256116952514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 4.679999828338623, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6770851351320744, "epoch": 0.00887, "frac_reward_zero_std": 0.0, "grad_norm": 0.06897556781768799, "kl": 0.30833141691982746, "learning_rate": 7.999731876005793e-06, "loss": -0.0859, "num_tokens": 22040442.0, "reward": 0.6303984522819519, "reward_std": 0.8368256092071533, "rewards/rollout_reward_func/mean": 0.6303984522819519, "rewards/rollout_reward_func/std": 0.8368255496025085, "sampling/importance_sampling_ratio/max": 1.3028346300125122, "sampling/importance_sampling_ratio/mean": 0.7687972187995911, "sampling/importance_sampling_ratio/min": 6.172090394329643e-08, "sampling/sampling_logp_difference/max": 2.5416793823242188, "sampling/sampling_logp_difference/mean": 0.30277344584465027, "step": 887, "step_time": 30.514611074060667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6730965580791235, "epoch": 0.00888, "grad_norm": 0.06796608865261078, "kl": 0.30140422470867634, "learning_rate": 7.999731245506327e-06, "loss": -0.0858, "step": 888, "step_time": 13.74659206403885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3743847412988544, "epoch": 0.00889, "frac_reward_zero_std": 0.0, "grad_norm": 0.13927750289440155, "kl": 0.6379409395158291, "learning_rate": 7.999730614266446e-06, "loss": -0.0805, "num_tokens": 22090861.0, "reward": 1.0579978227615356, "reward_std": 0.74067223072052, "rewards/rollout_reward_func/mean": 1.0579978227615356, "rewards/rollout_reward_func/std": 0.7406721711158752, "sampling/importance_sampling_ratio/max": 1.1238032579421997, "sampling/importance_sampling_ratio/mean": 0.7597352266311646, "sampling/importance_sampling_ratio/min": 2.4543576060409578e-08, "sampling/sampling_logp_difference/max": 2.3272995948791504, "sampling/sampling_logp_difference/mean": 0.29631301760673523, "step": 889, "step_time": 31.907323277002433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.374893432483077, "epoch": 0.0089, "grad_norm": 0.1126166582107544, "kl": 0.5629996750503778, "learning_rate": 7.999729982286154e-06, "loss": -0.0812, "step": 890, "step_time": 15.849667503091041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4340143222361803, "epoch": 0.00891, "frac_reward_zero_std": 0.0, "grad_norm": 0.18530450761318207, "kl": 0.5038095018826425, "learning_rate": 7.999729349565447e-06, "loss": -0.0505, "num_tokens": 22140105.0, "reward": 0.9662690162658691, "reward_std": 0.8606336712837219, "rewards/rollout_reward_func/mean": 0.9662690162658691, "rewards/rollout_reward_func/std": 0.8606336712837219, "sampling/importance_sampling_ratio/max": 1.1576720476150513, "sampling/importance_sampling_ratio/mean": 0.7800198197364807, "sampling/importance_sampling_ratio/min": 5.007624892527929e-08, "sampling/sampling_logp_difference/max": 2.622469425201416, "sampling/sampling_logp_difference/mean": 0.3046937882900238, "step": 891, "step_time": 25.53580737503944 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.4149983990937471, "epoch": 0.00892, "grad_norm": 0.1450139284133911, "kl": 0.5059900982305408, "learning_rate": 7.99972871610433e-06, "loss": -0.0514, "step": 892, "step_time": 12.670299235003768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2124386094510555, "epoch": 0.00893, "frac_reward_zero_std": 0.0, "grad_norm": 0.049090974032878876, "kl": 0.6745345005765557, "learning_rate": 7.999728081902798e-06, "loss": -0.0826, "num_tokens": 22186613.0, "reward": 0.9234403371810913, "reward_std": 0.8868592381477356, "rewards/rollout_reward_func/mean": 0.9234403371810913, "rewards/rollout_reward_func/std": 0.8868591785430908, "sampling/importance_sampling_ratio/max": 1.0671300888061523, "sampling/importance_sampling_ratio/mean": 0.7660285830497742, "sampling/importance_sampling_ratio/min": 1.9215987776988186e-05, "sampling/sampling_logp_difference/max": 1.9126181602478027, "sampling/sampling_logp_difference/mean": 0.26383599638938904, "step": 893, "step_time": 23.95336549700005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2025038693100214, "epoch": 0.00894, "grad_norm": 0.04765232279896736, "kl": 0.6318558668717742, "learning_rate": 7.999727446960856e-06, "loss": -0.0826, "step": 894, "step_time": 12.225012392998906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9051657244563103, "epoch": 0.00895, "frac_reward_zero_std": 0.25, "grad_norm": 0.11450601369142532, "kl": 0.468753382563591, "learning_rate": 7.999726811278499e-06, "loss": -0.0517, "num_tokens": 22238121.0, "reward": 0.46105653047561646, "reward_std": 1.0256937742233276, "rewards/rollout_reward_func/mean": 0.46105653047561646, "rewards/rollout_reward_func/std": 1.0256937742233276, "sampling/importance_sampling_ratio/max": 1.514304757118225, "sampling/importance_sampling_ratio/mean": 0.6418240070343018, "sampling/importance_sampling_ratio/min": 1.6260711390714278e-06, "sampling/sampling_logp_difference/max": 2.4402072429656982, "sampling/sampling_logp_difference/mean": 0.37311995029449463, "step": 895, "step_time": 28.51065624502371 }, { "clip_ratio/high_max": 0.044791667722165585, "clip_ratio/high_mean": 0.022395833861082792, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022395833861082792, "entropy": 1.8944275341928005, "epoch": 0.00896, "grad_norm": 0.08805623650550842, "kl": 0.4600503742694855, "learning_rate": 7.99972617485573e-06, "loss": -0.0519, "step": 896, "step_time": 14.608031591953477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 5.839999675750732, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9741748124361038, "epoch": 0.00897, "frac_reward_zero_std": 0.0, "grad_norm": 0.1753031313419342, "kl": 0.36283285170793533, "learning_rate": 7.999725537692552e-06, "loss": -0.0973, "num_tokens": 22293743.0, "reward": 0.6052749752998352, "reward_std": 0.8347136974334717, "rewards/rollout_reward_func/mean": 0.6052749752998352, "rewards/rollout_reward_func/std": 0.8347136974334717, "sampling/importance_sampling_ratio/max": 1.3196947574615479, "sampling/importance_sampling_ratio/mean": 0.6414891481399536, "sampling/importance_sampling_ratio/min": 9.363110287097243e-09, "sampling/sampling_logp_difference/max": 2.2141213417053223, "sampling/sampling_logp_difference/mean": 0.3760554790496826, "step": 897, "step_time": 30.168496749043697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9651178997009993, "epoch": 0.00898, "grad_norm": 0.17605043947696686, "kl": 0.3542188312858343, "learning_rate": 7.99972489978896e-06, "loss": -0.0976, "step": 898, "step_time": 14.239986437023617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 4.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.979655921459198, "epoch": 0.00899, "frac_reward_zero_std": 0.0, "grad_norm": 0.08098568767309189, "kl": 0.35390456253662705, "learning_rate": 7.999724261144958e-06, "loss": -0.1, "num_tokens": 22352788.0, "reward": -0.005381390452384949, "reward_std": 0.7987860441207886, "rewards/rollout_reward_func/mean": -0.005381390452384949, "rewards/rollout_reward_func/std": 0.7987859845161438, "sampling/importance_sampling_ratio/max": 1.1891024112701416, "sampling/importance_sampling_ratio/mean": 0.676442563533783, "sampling/importance_sampling_ratio/min": 1.915584653033875e-05, "sampling/sampling_logp_difference/max": 2.266843557357788, "sampling/sampling_logp_difference/mean": 0.3262808918952942, "step": 899, "step_time": 36.8951707300148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.969397708773613, "epoch": 0.009, "grad_norm": 0.0722857117652893, "kl": 0.36077948193997145, "learning_rate": 7.999723621760544e-06, "loss": -0.1004, "step": 900, "step_time": 18.27800098602893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.518518447875977, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.176088022068143, "epoch": 0.00901, "frac_reward_zero_std": 0.25, "grad_norm": 0.027797898277640343, "kl": 0.23987339064478874, "learning_rate": 7.999722981635718e-06, "loss": -0.0587, "num_tokens": 22400502.0, "reward": 0.9359526634216309, "reward_std": 0.8702610731124878, "rewards/rollout_reward_func/mean": 0.9359526634216309, "rewards/rollout_reward_func/std": 0.870261013507843, "sampling/importance_sampling_ratio/max": 1.0940804481506348, "sampling/importance_sampling_ratio/mean": 0.8226872682571411, "sampling/importance_sampling_ratio/min": 8.531398520972289e-07, "sampling/sampling_logp_difference/max": 1.8362648487091064, "sampling/sampling_logp_difference/mean": 0.24410468339920044, "step": 901, "step_time": 27.139369357028045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1659327801316977, "epoch": 0.00902, "grad_norm": 0.021671384572982788, "kl": 0.2456695642322302, "learning_rate": 7.999722340770481e-06, "loss": -0.0585, "step": 902, "step_time": 13.324494375003269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0463107377290726, "epoch": 0.00903, "frac_reward_zero_std": 0.0, "grad_norm": 0.08649323880672455, "kl": 0.4179291184991598, "learning_rate": 7.999721699164835e-06, "loss": -0.0599, "num_tokens": 22444191.0, "reward": 0.5722689032554626, "reward_std": 0.8522387742996216, "rewards/rollout_reward_func/mean": 0.5722689032554626, "rewards/rollout_reward_func/std": 0.8522387742996216, "sampling/importance_sampling_ratio/max": 1.316908836364746, "sampling/importance_sampling_ratio/mean": 0.9020798206329346, "sampling/importance_sampling_ratio/min": 3.498301737181464e-07, "sampling/sampling_logp_difference/max": 2.4435739517211914, "sampling/sampling_logp_difference/mean": 0.2613544464111328, "step": 903, "step_time": 22.459865751006873 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.0444579813629389, "epoch": 0.00904, "grad_norm": 0.08562354743480682, "kl": 0.4192890115082264, "learning_rate": 7.999721056818777e-06, "loss": -0.06, "step": 904, "step_time": 12.798630031989887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9098898628726602, "epoch": 0.00905, "frac_reward_zero_std": 0.25, "grad_norm": 0.023268060758709908, "kl": 0.373754957690835, "learning_rate": 7.999720413732308e-06, "loss": -0.0656, "num_tokens": 22488930.0, "reward": 1.1730575561523438, "reward_std": 0.5740941762924194, "rewards/rollout_reward_func/mean": 1.1730575561523438, "rewards/rollout_reward_func/std": 0.5740941762924194, "sampling/importance_sampling_ratio/max": 1.1104846000671387, "sampling/importance_sampling_ratio/mean": 0.9173790812492371, "sampling/importance_sampling_ratio/min": 1.4728448149980977e-07, "sampling/sampling_logp_difference/max": 2.5615627765655518, "sampling/sampling_logp_difference/mean": 0.2389748990535736, "step": 905, "step_time": 22.68448076400091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9087456045672297, "epoch": 0.00906, "grad_norm": 0.022739173844456673, "kl": 0.3717344831675291, "learning_rate": 7.999719769905428e-06, "loss": -0.0657, "step": 906, "step_time": 12.253692741011037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.482758522033691, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0819529592990875, "epoch": 0.00907, "frac_reward_zero_std": 0.0, "grad_norm": 0.10788010060787201, "kl": 0.5425676535815, "learning_rate": 7.999719125338138e-06, "loss": -0.0572, "num_tokens": 22552338.0, "reward": 0.27790847420692444, "reward_std": 0.5883721709251404, "rewards/rollout_reward_func/mean": 0.27790847420692444, "rewards/rollout_reward_func/std": 0.5883722305297852, "sampling/importance_sampling_ratio/max": 1.735907793045044, "sampling/importance_sampling_ratio/mean": 0.8649970293045044, "sampling/importance_sampling_ratio/min": 1.1386229736842779e-08, "sampling/sampling_logp_difference/max": 2.4437122344970703, "sampling/sampling_logp_difference/mean": 0.2620721459388733, "step": 907, "step_time": 25.907179012981942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.081714054569602, "epoch": 0.00908, "grad_norm": 0.11015648394823074, "kl": 0.4947442524135113, "learning_rate": 7.999718480030437e-06, "loss": -0.0575, "step": 908, "step_time": 13.569080306013348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1894458308815956, "epoch": 0.00909, "frac_reward_zero_std": 0.0, "grad_norm": 0.13149942457675934, "kl": 0.38491266407072544, "learning_rate": 7.999717833982328e-06, "loss": -0.0624, "num_tokens": 22613654.0, "reward": 0.5797015428543091, "reward_std": 0.866083025932312, "rewards/rollout_reward_func/mean": 0.5797015428543091, "rewards/rollout_reward_func/std": 0.866083025932312, "sampling/importance_sampling_ratio/max": 1.1953765153884888, "sampling/importance_sampling_ratio/mean": 0.8331044912338257, "sampling/importance_sampling_ratio/min": 0.00013326562475413084, "sampling/sampling_logp_difference/max": 1.903050422668457, "sampling/sampling_logp_difference/mean": 0.22704778611660004, "step": 909, "step_time": 29.514869836973958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2038362883031368, "epoch": 0.0091, "grad_norm": 0.11161123216152191, "kl": 0.36648926697671413, "learning_rate": 7.99971718719381e-06, "loss": -0.063, "step": 910, "step_time": 13.927073231054237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 5.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.46691326610744, "epoch": 0.00911, "frac_reward_zero_std": 0.0, "grad_norm": 0.2086901068687439, "kl": 0.20625272206962109, "learning_rate": 7.999716539664878e-06, "loss": -0.0622, "num_tokens": 22667270.0, "reward": 0.619888961315155, "reward_std": 1.0157276391983032, "rewards/rollout_reward_func/mean": 0.619888961315155, "rewards/rollout_reward_func/std": 1.0157276391983032, "sampling/importance_sampling_ratio/max": 1.175982117652893, "sampling/importance_sampling_ratio/mean": 0.7908136248588562, "sampling/importance_sampling_ratio/min": 9.714859317000446e-08, "sampling/sampling_logp_difference/max": 2.3126919269561768, "sampling/sampling_logp_difference/mean": 0.24070250988006592, "step": 911, "step_time": 28.605589640967082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.4902139976620674, "epoch": 0.00912, "grad_norm": 0.1995689868927002, "kl": 0.20348487608134747, "learning_rate": 7.99971589139554e-06, "loss": -0.0631, "step": 912, "step_time": 13.374093799007824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.433333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8936551101505756, "epoch": 0.00913, "frac_reward_zero_std": 0.25, "grad_norm": 0.12435796856880188, "kl": 0.3914880473166704, "learning_rate": 7.999715242385793e-06, "loss": -0.0326, "num_tokens": 22720490.0, "reward": 0.8023593425750732, "reward_std": 0.6808167099952698, "rewards/rollout_reward_func/mean": 0.8023593425750732, "rewards/rollout_reward_func/std": 0.680816650390625, "sampling/importance_sampling_ratio/max": 1.2495287656784058, "sampling/importance_sampling_ratio/mean": 0.8454866409301758, "sampling/importance_sampling_ratio/min": 0.0012199489865452051, "sampling/sampling_logp_difference/max": 1.615882396697998, "sampling/sampling_logp_difference/mean": 0.16701045632362366, "step": 913, "step_time": 23.97167684804299 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03693181835114956, "entropy": 0.9598204288631678, "epoch": 0.00914, "grad_norm": 0.09725892543792725, "kl": 0.36001908453181386, "learning_rate": 7.999714592635635e-06, "loss": -0.0335, "step": 914, "step_time": 12.650433350965613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.34375, "completions/mean_terminated_length": 4.34375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6599871832877398, "epoch": 0.00915, "frac_reward_zero_std": 0.5, "grad_norm": 0.031629979610443115, "kl": 0.4225150514394045, "learning_rate": 7.999713942145069e-06, "loss": -0.0168, "num_tokens": 22761602.0, "reward": 0.79908686876297, "reward_std": 0.8557532429695129, "rewards/rollout_reward_func/mean": 0.79908686876297, "rewards/rollout_reward_func/std": 0.8557531237602234, "sampling/importance_sampling_ratio/max": 1.242873191833496, "sampling/importance_sampling_ratio/mean": 0.9879738688468933, "sampling/importance_sampling_ratio/min": 1.8335429558646865e-05, "sampling/sampling_logp_difference/max": 2.1763908863067627, "sampling/sampling_logp_difference/mean": 0.1505524218082428, "step": 915, "step_time": 18.923210231034318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6739536672830582, "epoch": 0.00916, "grad_norm": 0.03025910258293152, "kl": 0.4053526446223259, "learning_rate": 7.999713290914094e-06, "loss": -0.0169, "step": 916, "step_time": 10.993975737015717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 5.310344696044922, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3425044976174831, "epoch": 0.00917, "frac_reward_zero_std": 0.0, "grad_norm": 0.06849322468042374, "kl": 0.237494389526546, "learning_rate": 7.99971263894271e-06, "loss": -0.0821, "num_tokens": 22812411.0, "reward": 0.9388636946678162, "reward_std": 0.763993501663208, "rewards/rollout_reward_func/mean": 0.9388636946678162, "rewards/rollout_reward_func/std": 0.7639934420585632, "sampling/importance_sampling_ratio/max": 1.210137128829956, "sampling/importance_sampling_ratio/mean": 0.7983508110046387, "sampling/importance_sampling_ratio/min": 1.686162960368165e-07, "sampling/sampling_logp_difference/max": 2.0166878700256348, "sampling/sampling_logp_difference/mean": 0.3169727921485901, "step": 917, "step_time": 27.364542650000658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.365520840510726, "epoch": 0.00918, "grad_norm": 0.08540967106819153, "kl": 0.233984992839396, "learning_rate": 7.999711986230917e-06, "loss": -0.0818, "step": 918, "step_time": 13.952472798002418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3398821763694286, "epoch": 0.00919, "frac_reward_zero_std": 0.0, "grad_norm": 0.1258709579706192, "kl": 0.58558982424438, "learning_rate": 7.999711332778717e-06, "loss": -0.0751, "num_tokens": 22872083.0, "reward": 0.7658137083053589, "reward_std": 0.8729739785194397, "rewards/rollout_reward_func/mean": 0.7658137083053589, "rewards/rollout_reward_func/std": 0.8729739785194397, "sampling/importance_sampling_ratio/max": 1.2832634449005127, "sampling/importance_sampling_ratio/mean": 0.772681474685669, "sampling/importance_sampling_ratio/min": 5.03408000440686e-06, "sampling/sampling_logp_difference/max": 2.1276021003723145, "sampling/sampling_logp_difference/mean": 0.27098381519317627, "step": 919, "step_time": 26.513730060018133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013494318351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "entropy": 1.3761142753064632, "epoch": 0.0092, "grad_norm": 0.10617279261350632, "kl": 0.6098254434764385, "learning_rate": 7.999710678586108e-06, "loss": -0.0756, "step": 920, "step_time": 12.960760995018063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8703596889972687, "epoch": 0.00921, "frac_reward_zero_std": 0.0, "grad_norm": 0.08401909470558167, "kl": 0.28185683488845825, "learning_rate": 7.99971002365309e-06, "loss": -0.0706, "num_tokens": 22919267.0, "reward": 0.21829158067703247, "reward_std": 0.8175520896911621, "rewards/rollout_reward_func/mean": 0.21829158067703247, "rewards/rollout_reward_func/std": 0.8175520300865173, "sampling/importance_sampling_ratio/max": 2.207057476043701, "sampling/importance_sampling_ratio/mean": 0.728181004524231, "sampling/importance_sampling_ratio/min": 1.4977453943743058e-08, "sampling/sampling_logp_difference/max": 2.3187265396118164, "sampling/sampling_logp_difference/mean": 0.36182689666748047, "step": 921, "step_time": 22.40463401400484 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 1.912403766065836, "epoch": 0.00922, "grad_norm": 0.10547620058059692, "kl": 0.27421101182699203, "learning_rate": 7.999709367979666e-06, "loss": -0.0708, "step": 922, "step_time": 10.899046538950643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6542005687952042, "epoch": 0.00923, "frac_reward_zero_std": 0.25, "grad_norm": 0.0994405522942543, "kl": 0.4697846872732043, "learning_rate": 7.999708711565832e-06, "loss": -0.0678, "num_tokens": 22977548.0, "reward": 0.6707121729850769, "reward_std": 0.8408855199813843, "rewards/rollout_reward_func/mean": 0.6707121729850769, "rewards/rollout_reward_func/std": 0.8408855199813843, "sampling/importance_sampling_ratio/max": 1.2651971578598022, "sampling/importance_sampling_ratio/mean": 0.6962140202522278, "sampling/importance_sampling_ratio/min": 6.447912710427772e-07, "sampling/sampling_logp_difference/max": 2.144226551055908, "sampling/sampling_logp_difference/mean": 0.35457080602645874, "step": 923, "step_time": 31.446619893948082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6478070002049208, "epoch": 0.00924, "grad_norm": 0.10424241423606873, "kl": 0.49235820956528187, "learning_rate": 7.999708054411592e-06, "loss": -0.0678, "step": 924, "step_time": 14.213622194045456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 5.043478488922119, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.477547898888588, "epoch": 0.00925, "frac_reward_zero_std": 0.0, "grad_norm": 0.10433192551136017, "kl": 0.33939627557992935, "learning_rate": 7.999707396516944e-06, "loss": -0.0961, "num_tokens": 23025160.0, "reward": 0.38635581731796265, "reward_std": 0.8557977080345154, "rewards/rollout_reward_func/mean": 0.38635581731796265, "rewards/rollout_reward_func/std": 0.8557976484298706, "sampling/importance_sampling_ratio/max": 1.1823967695236206, "sampling/importance_sampling_ratio/mean": 0.5199365615844727, "sampling/importance_sampling_ratio/min": 8.217548952416109e-07, "sampling/sampling_logp_difference/max": 2.387608528137207, "sampling/sampling_logp_difference/mean": 0.43114978075027466, "step": 925, "step_time": 28.364487896033097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.463664025068283, "epoch": 0.00926, "grad_norm": 0.0995684489607811, "kl": 0.3491918481886387, "learning_rate": 7.999706737881888e-06, "loss": -0.0964, "step": 926, "step_time": 14.791800299019087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5442028678953648, "epoch": 0.00927, "frac_reward_zero_std": 0.0, "grad_norm": 0.07326677441596985, "kl": 0.3585684224963188, "learning_rate": 7.999706078506426e-06, "loss": -0.0831, "num_tokens": 23073277.0, "reward": 0.9790960550308228, "reward_std": 0.7558987736701965, "rewards/rollout_reward_func/mean": 0.9790960550308228, "rewards/rollout_reward_func/std": 0.7558987736701965, "sampling/importance_sampling_ratio/max": 1.2746491432189941, "sampling/importance_sampling_ratio/mean": 0.8028663992881775, "sampling/importance_sampling_ratio/min": 2.2782984387959004e-08, "sampling/sampling_logp_difference/max": 2.4084901809692383, "sampling/sampling_logp_difference/mean": 0.36680078506469727, "step": 927, "step_time": 27.55007411900442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5367631996050477, "epoch": 0.00928, "grad_norm": 0.07541690766811371, "kl": 0.3842451125383377, "learning_rate": 7.999705418390558e-06, "loss": -0.0833, "step": 928, "step_time": 14.282529410003917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.507721733301878, "epoch": 0.00929, "frac_reward_zero_std": 0.0, "grad_norm": 0.034740228205919266, "kl": 0.7373379748314619, "learning_rate": 7.999704757534282e-06, "loss": -0.0875, "num_tokens": 23121271.0, "reward": 0.4320027232170105, "reward_std": 0.8823693990707397, "rewards/rollout_reward_func/mean": 0.4320027232170105, "rewards/rollout_reward_func/std": 0.8823693990707397, "sampling/importance_sampling_ratio/max": 1.1126244068145752, "sampling/importance_sampling_ratio/mean": 0.7527333498001099, "sampling/importance_sampling_ratio/min": 2.9014276492489444e-07, "sampling/sampling_logp_difference/max": 2.263671398162842, "sampling/sampling_logp_difference/mean": 0.3239087462425232, "step": 929, "step_time": 25.489916169957723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4928417885676026, "epoch": 0.0093, "grad_norm": 0.03993317484855652, "kl": 0.7978508081287146, "learning_rate": 7.9997040959376e-06, "loss": -0.0874, "step": 930, "step_time": 12.497546758968383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.076923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0349615421146154, "epoch": 0.00931, "frac_reward_zero_std": 0.0, "grad_norm": 0.13249579071998596, "kl": 0.3720788098871708, "learning_rate": 7.999703433600512e-06, "loss": -0.0841, "num_tokens": 23168594.0, "reward": 0.9144363403320312, "reward_std": 0.8125824928283691, "rewards/rollout_reward_func/mean": 0.9144363403320312, "rewards/rollout_reward_func/std": 0.8125824332237244, "sampling/importance_sampling_ratio/max": 1.1136060953140259, "sampling/importance_sampling_ratio/mean": 0.6622442603111267, "sampling/importance_sampling_ratio/min": 8.074077300079807e-07, "sampling/sampling_logp_difference/max": 2.1104774475097656, "sampling/sampling_logp_difference/mean": 0.40995559096336365, "step": 931, "step_time": 25.72581218898995 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 1.9849471468478441, "epoch": 0.00932, "grad_norm": 0.06970085948705673, "kl": 0.4000098165124655, "learning_rate": 7.999702770523015e-06, "loss": -0.0844, "step": 932, "step_time": 12.200842151039978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.148148059844971, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4379305858165026, "epoch": 0.00933, "frac_reward_zero_std": 0.25, "grad_norm": 0.020825771614909172, "kl": 0.3522974755614996, "learning_rate": 7.999702106705114e-06, "loss": -0.065, "num_tokens": 23217994.0, "reward": 0.5866529941558838, "reward_std": 0.8598408699035645, "rewards/rollout_reward_func/mean": 0.5866529941558838, "rewards/rollout_reward_func/std": 0.8598408102989197, "sampling/importance_sampling_ratio/max": 1.1370054483413696, "sampling/importance_sampling_ratio/mean": 0.7820338606834412, "sampling/importance_sampling_ratio/min": 1.1130082384624984e-05, "sampling/sampling_logp_difference/max": 1.6506211757659912, "sampling/sampling_logp_difference/mean": 0.27756035327911377, "step": 933, "step_time": 26.83624503892497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4217787962406874, "epoch": 0.00934, "grad_norm": 0.02064044028520584, "kl": 0.3710722913965583, "learning_rate": 7.999701442146808e-06, "loss": -0.065, "step": 934, "step_time": 13.364714779978385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.700000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2122053373605013, "epoch": 0.00935, "frac_reward_zero_std": 0.25, "grad_norm": 0.2079433649778366, "kl": 0.36347050964832306, "learning_rate": 7.999700776848094e-06, "loss": -0.0417, "num_tokens": 23273066.0, "reward": 0.9108256697654724, "reward_std": 0.7949900031089783, "rewards/rollout_reward_func/mean": 0.9108256697654724, "rewards/rollout_reward_func/std": 0.7949899435043335, "sampling/importance_sampling_ratio/max": 1.1079844236373901, "sampling/importance_sampling_ratio/mean": 0.8002046346664429, "sampling/importance_sampling_ratio/min": 3.1416318961419165e-05, "sampling/sampling_logp_difference/max": 1.5503628253936768, "sampling/sampling_logp_difference/mean": 0.2371753454208374, "step": 935, "step_time": 25.002277696039528 }, { "clip_ratio/high_max": 0.03437500027939677, "clip_ratio/high_mean": 0.017187500139698386, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017187500139698386, "entropy": 1.1724393693730235, "epoch": 0.00936, "grad_norm": 0.0730540007352829, "kl": 0.3735422883182764, "learning_rate": 7.999700110808976e-06, "loss": -0.0426, "step": 936, "step_time": 13.897698022017721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9097963199019432, "epoch": 0.00937, "frac_reward_zero_std": 0.0, "grad_norm": 0.07423055917024612, "kl": 0.5621032528579235, "learning_rate": 7.999699444029451e-06, "loss": -0.0644, "num_tokens": 23323749.0, "reward": 0.2730479836463928, "reward_std": 0.9469143748283386, "rewards/rollout_reward_func/mean": 0.2730479836463928, "rewards/rollout_reward_func/std": 0.9469143748283386, "sampling/importance_sampling_ratio/max": 1.6393260955810547, "sampling/importance_sampling_ratio/mean": 0.7402635812759399, "sampling/importance_sampling_ratio/min": 3.633272127601117e-09, "sampling/sampling_logp_difference/max": 2.383903980255127, "sampling/sampling_logp_difference/mean": 0.41365692019462585, "step": 937, "step_time": 25.31196785598877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.899025585502386, "epoch": 0.00938, "grad_norm": 0.06828083097934723, "kl": 0.5430793762207031, "learning_rate": 7.999698776509521e-06, "loss": -0.0644, "step": 938, "step_time": 13.17781511505018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.814815044403076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.39539257530123, "epoch": 0.00939, "frac_reward_zero_std": 0.25, "grad_norm": 0.10142716020345688, "kl": 0.8186987917870283, "learning_rate": 7.999698108249188e-06, "loss": -0.0687, "num_tokens": 23374473.0, "reward": 0.603664219379425, "reward_std": 0.8732309937477112, "rewards/rollout_reward_func/mean": 0.603664219379425, "rewards/rollout_reward_func/std": 0.8732309341430664, "sampling/importance_sampling_ratio/max": 1.145056962966919, "sampling/importance_sampling_ratio/mean": 0.7349386811256409, "sampling/importance_sampling_ratio/min": 1.6830677296297836e-08, "sampling/sampling_logp_difference/max": 2.848050117492676, "sampling/sampling_logp_difference/mean": 0.32909977436065674, "step": 939, "step_time": 28.525608209019993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3850604044273496, "epoch": 0.0094, "grad_norm": 0.08738960325717926, "kl": 0.8115687053650618, "learning_rate": 7.999697439248448e-06, "loss": -0.069, "step": 940, "step_time": 14.063580364949303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5513448747806251, "epoch": 0.00941, "frac_reward_zero_std": 0.25, "grad_norm": 0.0178349781781435, "kl": 0.34275903925299644, "learning_rate": 7.999696769507304e-06, "loss": -0.0378, "num_tokens": 23413149.0, "reward": 1.2542948722839355, "reward_std": 0.5584802627563477, "rewards/rollout_reward_func/mean": 1.2542948722839355, "rewards/rollout_reward_func/std": 0.5584802627563477, "sampling/importance_sampling_ratio/max": 1.0488042831420898, "sampling/importance_sampling_ratio/mean": 0.9633607864379883, "sampling/importance_sampling_ratio/min": 4.4855144598621166e-10, "sampling/sampling_logp_difference/max": 2.19370436668396, "sampling/sampling_logp_difference/mean": 0.18914110958576202, "step": 941, "step_time": 18.189691209001467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.553339421749115, "epoch": 0.00942, "grad_norm": 0.018212856724858284, "kl": 0.34261488914489746, "learning_rate": 7.999696099025756e-06, "loss": -0.0378, "step": 942, "step_time": 9.82570913998643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5344260879792273, "epoch": 0.00943, "frac_reward_zero_std": 0.0, "grad_norm": 0.13560490310192108, "kl": 0.21701028244569898, "learning_rate": 7.999695427803802e-06, "loss": -0.0615, "num_tokens": 23470318.0, "reward": 0.49108922481536865, "reward_std": 0.9099771976470947, "rewards/rollout_reward_func/mean": 0.49108922481536865, "rewards/rollout_reward_func/std": 0.9099772572517395, "sampling/importance_sampling_ratio/max": 1.1247512102127075, "sampling/importance_sampling_ratio/mean": 0.7060083150863647, "sampling/importance_sampling_ratio/min": 1.3547885036047091e-09, "sampling/sampling_logp_difference/max": 2.5875496864318848, "sampling/sampling_logp_difference/mean": 0.3027051091194153, "step": 943, "step_time": 27.412563547026366 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.536679099779576, "epoch": 0.00944, "grad_norm": 0.1341986060142517, "kl": 0.21213499549776316, "learning_rate": 7.999694755841444e-06, "loss": -0.0619, "step": 944, "step_time": 12.869758806016762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.3214287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4691581595689058, "epoch": 0.00945, "frac_reward_zero_std": 0.25, "grad_norm": 0.026296349242329597, "kl": 0.5635986663401127, "learning_rate": 7.999694083138682e-06, "loss": -0.0767, "num_tokens": 23519212.0, "reward": 0.4408957362174988, "reward_std": 0.811499834060669, "rewards/rollout_reward_func/mean": 0.4408957362174988, "rewards/rollout_reward_func/std": 0.8114997744560242, "sampling/importance_sampling_ratio/max": 1.1084500551223755, "sampling/importance_sampling_ratio/mean": 0.7861619591712952, "sampling/importance_sampling_ratio/min": 1.1434191947046202e-05, "sampling/sampling_logp_difference/max": 2.02398681640625, "sampling/sampling_logp_difference/mean": 0.31682902574539185, "step": 945, "step_time": 25.160596724017523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4724883306771517, "epoch": 0.00946, "grad_norm": 0.02725011296570301, "kl": 0.5569944884628057, "learning_rate": 7.999693409695516e-06, "loss": -0.0767, "step": 946, "step_time": 13.083438233996276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9169216249138117, "epoch": 0.00947, "frac_reward_zero_std": 0.0, "grad_norm": 0.7712908387184143, "kl": 0.4219323918223381, "learning_rate": 7.999692735511946e-06, "loss": -0.0611, "num_tokens": 23583455.0, "reward": 0.6887768507003784, "reward_std": 0.8345611691474915, "rewards/rollout_reward_func/mean": 0.6887768507003784, "rewards/rollout_reward_func/std": 0.8345611691474915, "sampling/importance_sampling_ratio/max": 1.971645712852478, "sampling/importance_sampling_ratio/mean": 0.8072801828384399, "sampling/importance_sampling_ratio/min": 9.359838459843672e-10, "sampling/sampling_logp_difference/max": 3.3061959743499756, "sampling/sampling_logp_difference/mean": 0.45012417435646057, "step": 947, "step_time": 33.54249842799618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.028125000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028125000186264515, "entropy": 1.9641622193157673, "epoch": 0.00948, "grad_norm": 0.1720675230026245, "kl": 0.3920153072103858, "learning_rate": 7.999692060587974e-06, "loss": -0.0654, "step": 948, "step_time": 16.83225276801386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.5625, "completions/mean_terminated_length": 3.8000001907348633, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5675277076661587, "epoch": 0.00949, "frac_reward_zero_std": 0.25, "grad_norm": 0.030030567198991776, "kl": 0.3321639783680439, "learning_rate": 7.999691384923597e-06, "loss": -0.0448, "num_tokens": 23623314.0, "reward": 0.29316967725753784, "reward_std": 0.7810272574424744, "rewards/rollout_reward_func/mean": 0.29316967725753784, "rewards/rollout_reward_func/std": 0.7810271382331848, "sampling/importance_sampling_ratio/max": 1.0720078945159912, "sampling/importance_sampling_ratio/mean": 0.9646536111831665, "sampling/importance_sampling_ratio/min": 4.4254991848902137e-07, "sampling/sampling_logp_difference/max": 1.6017316579818726, "sampling/sampling_logp_difference/mean": 0.147699236869812, "step": 949, "step_time": 18.19434203102719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5739118456840515, "epoch": 0.0095, "grad_norm": 0.030959337949752808, "kl": 0.33309392631053925, "learning_rate": 7.999690708518818e-06, "loss": -0.0448, "step": 950, "step_time": 10.119400182069512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.5, "completions/mean_terminated_length": 4.129032135009766, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.40747478045523167, "epoch": 0.00951, "frac_reward_zero_std": 0.25, "grad_norm": 0.07694486528635025, "kl": 0.3969345558434725, "learning_rate": 7.999690031373634e-06, "loss": -0.0339, "num_tokens": 23664640.0, "reward": 0.4141799211502075, "reward_std": 0.5694112777709961, "rewards/rollout_reward_func/mean": 0.4141799211502075, "rewards/rollout_reward_func/std": 0.5694112777709961, "sampling/importance_sampling_ratio/max": 1.116501808166504, "sampling/importance_sampling_ratio/mean": 0.9769540429115295, "sampling/importance_sampling_ratio/min": 0.005812987219542265, "sampling/sampling_logp_difference/max": 1.7833845615386963, "sampling/sampling_logp_difference/mean": 0.08327144384384155, "step": 951, "step_time": 21.082814937923104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4136951379477978, "epoch": 0.00952, "grad_norm": 0.07931503653526306, "kl": 0.3754477519541979, "learning_rate": 7.999689353488049e-06, "loss": -0.0338, "step": 952, "step_time": 11.2273450520006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.192307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3281416790559888, "epoch": 0.00953, "frac_reward_zero_std": 0.0, "grad_norm": 0.21391065418720245, "kl": 0.49848566204309464, "learning_rate": 7.99968867486206e-06, "loss": -0.0803, "num_tokens": 23710848.0, "reward": 0.808108389377594, "reward_std": 0.7792830467224121, "rewards/rollout_reward_func/mean": 0.808108389377594, "rewards/rollout_reward_func/std": 0.7792830467224121, "sampling/importance_sampling_ratio/max": 1.1032315492630005, "sampling/importance_sampling_ratio/mean": 0.697894811630249, "sampling/importance_sampling_ratio/min": 1.1263261967542348e-06, "sampling/sampling_logp_difference/max": 2.0921852588653564, "sampling/sampling_logp_difference/mean": 0.29460805654525757, "step": 953, "step_time": 24.404375327023445 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.304582055658102, "epoch": 0.00954, "grad_norm": 0.16998198628425598, "kl": 0.4774926695972681, "learning_rate": 7.999687995495668e-06, "loss": -0.0811, "step": 954, "step_time": 12.380370520986617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.633333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.97039027325809, "epoch": 0.00955, "frac_reward_zero_std": 0.25, "grad_norm": 0.07916489243507385, "kl": 0.27305267192423344, "learning_rate": 7.999687315388874e-06, "loss": -0.0342, "num_tokens": 23761747.0, "reward": 0.31682541966438293, "reward_std": 0.6545910835266113, "rewards/rollout_reward_func/mean": 0.31682541966438293, "rewards/rollout_reward_func/std": 0.6545910835266113, "sampling/importance_sampling_ratio/max": 1.3758814334869385, "sampling/importance_sampling_ratio/mean": 0.8511897325515747, "sampling/importance_sampling_ratio/min": 7.65014483476989e-05, "sampling/sampling_logp_difference/max": 1.4193181991577148, "sampling/sampling_logp_difference/mean": 0.20090830326080322, "step": 955, "step_time": 29.648876125982497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9737186376005411, "epoch": 0.00956, "grad_norm": 0.07603607326745987, "kl": 0.2736922800540924, "learning_rate": 7.999686634541679e-06, "loss": -0.0343, "step": 956, "step_time": 16.488924856035737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 4.559999942779541, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9662402980029583, "epoch": 0.00957, "frac_reward_zero_std": 0.25, "grad_norm": 0.10446449369192123, "kl": 0.2303816918283701, "learning_rate": 7.99968595295408e-06, "loss": -0.0689, "num_tokens": 23822716.0, "reward": 0.5331829190254211, "reward_std": 0.8387282490730286, "rewards/rollout_reward_func/mean": 0.5331829190254211, "rewards/rollout_reward_func/std": 0.8387282490730286, "sampling/importance_sampling_ratio/max": 1.7638977766036987, "sampling/importance_sampling_ratio/mean": 0.781038761138916, "sampling/importance_sampling_ratio/min": 4.877610848552649e-08, "sampling/sampling_logp_difference/max": 2.2125282287597656, "sampling/sampling_logp_difference/mean": 0.37079155445098877, "step": 957, "step_time": 37.16386361501645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9802342019975185, "epoch": 0.00958, "grad_norm": 0.09079822897911072, "kl": 0.23370682448148727, "learning_rate": 7.99968527062608e-06, "loss": -0.0692, "step": 958, "step_time": 19.030482175992802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3796679973602295, "epoch": 0.00959, "frac_reward_zero_std": 0.0, "grad_norm": 0.04666095972061157, "kl": 0.5316657098010182, "learning_rate": 7.999684587557677e-06, "loss": -0.0912, "num_tokens": 23878466.0, "reward": 0.8457047939300537, "reward_std": 0.9039996862411499, "rewards/rollout_reward_func/mean": 0.8457047939300537, "rewards/rollout_reward_func/std": 0.9039996862411499, "sampling/importance_sampling_ratio/max": 1.1974998712539673, "sampling/importance_sampling_ratio/mean": 0.7699824571609497, "sampling/importance_sampling_ratio/min": 0.0002457245718687773, "sampling/sampling_logp_difference/max": 1.7090190649032593, "sampling/sampling_logp_difference/mean": 0.24694153666496277, "step": 959, "step_time": 33.87720065601752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3745914213359356, "epoch": 0.0096, "grad_norm": 0.046379681676626205, "kl": 0.561789708212018, "learning_rate": 7.999683903748873e-06, "loss": -0.0912, "step": 960, "step_time": 17.669675790006295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.029961647465825, "epoch": 0.00961, "frac_reward_zero_std": 0.0, "grad_norm": 0.07906144857406616, "kl": 0.3413408976048231, "learning_rate": 7.99968321919967e-06, "loss": -0.0724, "num_tokens": 23937035.0, "reward": 1.090146780014038, "reward_std": 0.741532027721405, "rewards/rollout_reward_func/mean": 1.090146780014038, "rewards/rollout_reward_func/std": 0.741532027721405, "sampling/importance_sampling_ratio/max": 1.1949299573898315, "sampling/importance_sampling_ratio/mean": 0.8495534062385559, "sampling/importance_sampling_ratio/min": 6.177918112371117e-05, "sampling/sampling_logp_difference/max": 1.9022161960601807, "sampling/sampling_logp_difference/mean": 0.22697538137435913, "step": 961, "step_time": 26.554325487988535 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.0353858405724168, "epoch": 0.00962, "grad_norm": 0.07436013221740723, "kl": 0.3466418460011482, "learning_rate": 7.999682533910062e-06, "loss": -0.0726, "step": 962, "step_time": 13.858551926969085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3285001199692488, "epoch": 0.00963, "frac_reward_zero_std": 0.0, "grad_norm": 0.2095644176006317, "kl": 0.4238136853091419, "learning_rate": 7.999681847880053e-06, "loss": -0.097, "num_tokens": 23988548.0, "reward": 0.7120437026023865, "reward_std": 0.8469513654708862, "rewards/rollout_reward_func/mean": 0.7120437026023865, "rewards/rollout_reward_func/std": 0.8469513058662415, "sampling/importance_sampling_ratio/max": 2.5026888847351074, "sampling/importance_sampling_ratio/mean": 0.8154935240745544, "sampling/importance_sampling_ratio/min": 1.8154429426431307e-06, "sampling/sampling_logp_difference/max": 2.299757480621338, "sampling/sampling_logp_difference/mean": 0.30564773082733154, "step": 963, "step_time": 32.443406994018005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "entropy": 1.3193218931555748, "epoch": 0.00964, "grad_norm": 0.17185664176940918, "kl": 0.4406636208295822, "learning_rate": 7.999681161109645e-06, "loss": -0.0982, "step": 964, "step_time": 15.98627221895731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.84375, "completions/mean_terminated_length": 4.483870983123779, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0174752054736018, "epoch": 0.00965, "frac_reward_zero_std": 0.5, "grad_norm": 0.03728172555565834, "kl": 0.3321052622050047, "learning_rate": 7.999680473598836e-06, "loss": -0.0322, "num_tokens": 24029956.0, "reward": 0.5728114247322083, "reward_std": 0.7050812840461731, "rewards/rollout_reward_func/mean": 0.5728114247322083, "rewards/rollout_reward_func/std": 0.7050812244415283, "sampling/importance_sampling_ratio/max": 1.0892176628112793, "sampling/importance_sampling_ratio/mean": 0.90156090259552, "sampling/importance_sampling_ratio/min": 1.1824251487269066e-05, "sampling/sampling_logp_difference/max": 2.2055938243865967, "sampling/sampling_logp_difference/mean": 0.22666305303573608, "step": 965, "step_time": 21.693135737004923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0136826038360596, "epoch": 0.00966, "grad_norm": 0.03685673698782921, "kl": 0.32627877965569496, "learning_rate": 7.999679785347625e-06, "loss": -0.0322, "step": 966, "step_time": 12.51814958199975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 4.84375, "completions/mean_terminated_length": 4.84375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8382244762033224, "epoch": 0.00967, "frac_reward_zero_std": 0.0, "grad_norm": 0.0260144155472517, "kl": 0.5850261356681585, "learning_rate": 7.999679096356015e-06, "loss": -0.0781, "num_tokens": 24071214.0, "reward": 1.0282644033432007, "reward_std": 0.7013033628463745, "rewards/rollout_reward_func/mean": 1.0282644033432007, "rewards/rollout_reward_func/std": 0.7013033628463745, "sampling/importance_sampling_ratio/max": 1.1062023639678955, "sampling/importance_sampling_ratio/mean": 0.8799580335617065, "sampling/importance_sampling_ratio/min": 0.00028737567481584847, "sampling/sampling_logp_difference/max": 2.329253673553467, "sampling/sampling_logp_difference/mean": 0.19827806949615479, "step": 967, "step_time": 18.574976382020395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8390453904867172, "epoch": 0.00968, "grad_norm": 0.023504123091697693, "kl": 0.5868586897850037, "learning_rate": 7.999678406624005e-06, "loss": -0.0781, "step": 968, "step_time": 9.932871181081282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 4.679999828338623, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1501291943714023, "epoch": 0.00969, "frac_reward_zero_std": 0.0, "grad_norm": 0.05824241042137146, "kl": 0.6162199564278126, "learning_rate": 7.999677716151594e-06, "loss": -0.0926, "num_tokens": 24137926.0, "reward": 0.41453737020492554, "reward_std": 0.828833818435669, "rewards/rollout_reward_func/mean": 0.41453737020492554, "rewards/rollout_reward_func/std": 0.8288337588310242, "sampling/importance_sampling_ratio/max": 1.3773653507232666, "sampling/importance_sampling_ratio/mean": 0.6782225370407104, "sampling/importance_sampling_ratio/min": 3.5236801920746075e-08, "sampling/sampling_logp_difference/max": 2.2958972454071045, "sampling/sampling_logp_difference/mean": 0.4615001678466797, "step": 969, "step_time": 32.44954676594352 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007932692533358932, "entropy": 2.1498346338048577, "epoch": 0.0097, "grad_norm": 0.05228882282972336, "kl": 0.629144330509007, "learning_rate": 7.999677024938784e-06, "loss": -0.0927, "step": 970, "step_time": 16.042850586993154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.258187361061573, "epoch": 0.00971, "frac_reward_zero_std": 0.25, "grad_norm": 0.08403174579143524, "kl": 0.4014084953814745, "learning_rate": 7.999676332985574e-06, "loss": -0.0657, "num_tokens": 24180774.0, "reward": 0.6910042762756348, "reward_std": 0.8392034769058228, "rewards/rollout_reward_func/mean": 0.6910042762756348, "rewards/rollout_reward_func/std": 0.8392034769058228, "sampling/importance_sampling_ratio/max": 1.1464744806289673, "sampling/importance_sampling_ratio/mean": 0.8426551222801208, "sampling/importance_sampling_ratio/min": 4.875280179561514e-10, "sampling/sampling_logp_difference/max": 2.649482011795044, "sampling/sampling_logp_difference/mean": 0.31858402490615845, "step": 971, "step_time": 21.85065249504987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2524576028808951, "epoch": 0.00972, "grad_norm": 0.07468491047620773, "kl": 0.38803022541105747, "learning_rate": 7.999675640291963e-06, "loss": -0.0661, "step": 972, "step_time": 11.90527337702224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1115829525515437, "epoch": 0.00973, "frac_reward_zero_std": 0.0, "grad_norm": 0.04001466557383537, "kl": 0.31086321361362934, "learning_rate": 7.999674946857955e-06, "loss": -0.0563, "num_tokens": 24242391.0, "reward": 0.5822052955627441, "reward_std": 0.8898717761039734, "rewards/rollout_reward_func/mean": 0.5822052955627441, "rewards/rollout_reward_func/std": 0.8898717164993286, "sampling/importance_sampling_ratio/max": 1.4765756130218506, "sampling/importance_sampling_ratio/mean": 0.8883424997329712, "sampling/importance_sampling_ratio/min": 0.00029968100716359913, "sampling/sampling_logp_difference/max": 1.830634593963623, "sampling/sampling_logp_difference/mean": 0.23216378688812256, "step": 973, "step_time": 30.834287474979647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1083308272063732, "epoch": 0.00974, "grad_norm": 0.03651030361652374, "kl": 0.32284104637801647, "learning_rate": 7.999674252683546e-06, "loss": -0.0562, "step": 974, "step_time": 16.010037897969596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.344827651977539, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2252789242193103, "epoch": 0.00975, "frac_reward_zero_std": 0.0, "grad_norm": 0.13411962985992432, "kl": 0.4821621812880039, "learning_rate": 7.999673557768738e-06, "loss": -0.0699, "num_tokens": 24291697.0, "reward": 0.36546796560287476, "reward_std": 0.8711330890655518, "rewards/rollout_reward_func/mean": 0.36546796560287476, "rewards/rollout_reward_func/std": 0.8711330890655518, "sampling/importance_sampling_ratio/max": 1.1437263488769531, "sampling/importance_sampling_ratio/mean": 0.818313479423523, "sampling/importance_sampling_ratio/min": 5.642543854378346e-08, "sampling/sampling_logp_difference/max": 2.4914448261260986, "sampling/sampling_logp_difference/mean": 0.3122180998325348, "step": 975, "step_time": 30.211551910004346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2301438124850392, "epoch": 0.00976, "grad_norm": 0.11681310832500458, "kl": 0.5037675462663174, "learning_rate": 7.999672862113531e-06, "loss": -0.0701, "step": 976, "step_time": 15.114760876022046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.603866333141923, "epoch": 0.00977, "frac_reward_zero_std": 0.25, "grad_norm": 0.01777075231075287, "kl": 0.26389795541763306, "learning_rate": 7.999672165717924e-06, "loss": -0.0399, "num_tokens": 24342796.0, "reward": 1.2037436962127686, "reward_std": 0.6407324075698853, "rewards/rollout_reward_func/mean": 1.2037436962127686, "rewards/rollout_reward_func/std": 0.6407324075698853, "sampling/importance_sampling_ratio/max": 1.1788493394851685, "sampling/importance_sampling_ratio/mean": 0.9416807889938354, "sampling/importance_sampling_ratio/min": 0.00016153091564774513, "sampling/sampling_logp_difference/max": 1.2233448028564453, "sampling/sampling_logp_difference/mean": 0.14069990813732147, "step": 977, "step_time": 28.50466184396646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6050926768220961, "epoch": 0.00978, "grad_norm": 0.03735882416367531, "kl": 0.26354565285146236, "learning_rate": 7.999671468581921e-06, "loss": -0.04, "step": 978, "step_time": 15.57936983901891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.458333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0290047265589237, "epoch": 0.00979, "frac_reward_zero_std": 0.0, "grad_norm": 0.0772705152630806, "kl": 0.21788005530834198, "learning_rate": 7.99967077070552e-06, "loss": -0.0819, "num_tokens": 24414477.0, "reward": 0.2113140970468521, "reward_std": 0.6545332074165344, "rewards/rollout_reward_func/mean": 0.2113140970468521, "rewards/rollout_reward_func/std": 0.6545332074165344, "sampling/importance_sampling_ratio/max": 1.1576826572418213, "sampling/importance_sampling_ratio/mean": 0.6174291372299194, "sampling/importance_sampling_ratio/min": 2.5372179024429897e-08, "sampling/sampling_logp_difference/max": 3.1392483711242676, "sampling/sampling_logp_difference/mean": 0.41779810190200806, "step": 979, "step_time": 36.39140966598643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0295609729364514, "epoch": 0.0098, "grad_norm": 0.08089490234851837, "kl": 0.2185215149074793, "learning_rate": 7.999670072088718e-06, "loss": -0.082, "step": 980, "step_time": 17.822507756995037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2226973581127822, "epoch": 0.00981, "frac_reward_zero_std": 0.0, "grad_norm": 0.09154073148965836, "kl": 0.4086402514949441, "learning_rate": 7.999669372731521e-06, "loss": -0.0593, "num_tokens": 24465985.0, "reward": 0.8362336754798889, "reward_std": 0.8270033597946167, "rewards/rollout_reward_func/mean": 0.8362336754798889, "rewards/rollout_reward_func/std": 0.8270033001899719, "sampling/importance_sampling_ratio/max": 1.489600658416748, "sampling/importance_sampling_ratio/mean": 0.8322529792785645, "sampling/importance_sampling_ratio/min": 1.9026167308311415e-08, "sampling/sampling_logp_difference/max": 2.8844192028045654, "sampling/sampling_logp_difference/mean": 0.2847195267677307, "step": 981, "step_time": 28.505229744012468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.222776618320495, "epoch": 0.00982, "grad_norm": 0.08953507989645004, "kl": 0.4204940628260374, "learning_rate": 7.999668672633923e-06, "loss": -0.0594, "step": 982, "step_time": 14.133715874020709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.15625, "completions/mean_terminated_length": 4.590909004211426, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6012264639139175, "epoch": 0.00983, "frac_reward_zero_std": 0.0, "grad_norm": 0.3769051432609558, "kl": 0.24583742022514343, "learning_rate": 7.99966797179593e-06, "loss": -0.1087, "num_tokens": 24529087.0, "reward": 0.15077193081378937, "reward_std": 0.9440815448760986, "rewards/rollout_reward_func/mean": 0.15077193081378937, "rewards/rollout_reward_func/std": 0.9440814852714539, "sampling/importance_sampling_ratio/max": 2.386925220489502, "sampling/importance_sampling_ratio/mean": 0.6670384407043457, "sampling/importance_sampling_ratio/min": 8.248449034908845e-08, "sampling/sampling_logp_difference/max": 2.395963430404663, "sampling/sampling_logp_difference/mean": 0.45946070551872253, "step": 983, "step_time": 31.960708459984744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5935132652521133, "epoch": 0.00984, "grad_norm": 0.09423355013132095, "kl": 0.31766564678400755, "learning_rate": 7.999667270217539e-06, "loss": -0.11, "step": 984, "step_time": 15.155104258126812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 5.038461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8976745610125363, "epoch": 0.00985, "frac_reward_zero_std": 0.0, "grad_norm": 0.1706695854663849, "kl": 1.2125058034434915, "learning_rate": 7.99966656789875e-06, "loss": -0.0838, "num_tokens": 24586222.0, "reward": 0.6900360584259033, "reward_std": 0.7976811528205872, "rewards/rollout_reward_func/mean": 0.6900360584259033, "rewards/rollout_reward_func/std": 0.7976811528205872, "sampling/importance_sampling_ratio/max": 1.1491283178329468, "sampling/importance_sampling_ratio/mean": 0.6606246829032898, "sampling/importance_sampling_ratio/min": 5.162207106224059e-09, "sampling/sampling_logp_difference/max": 2.3218629360198975, "sampling/sampling_logp_difference/mean": 0.44099152088165283, "step": 985, "step_time": 29.718255866959225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8964032046496868, "epoch": 0.00986, "grad_norm": 0.16330432891845703, "kl": 1.0885926373302937, "learning_rate": 7.999665864839563e-06, "loss": -0.0845, "step": 986, "step_time": 15.111461114982376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3939378429204226, "epoch": 0.00987, "frac_reward_zero_std": 0.25, "grad_norm": 0.07067158818244934, "kl": 0.4507772997021675, "learning_rate": 7.999665161039979e-06, "loss": -0.0591, "num_tokens": 24636004.0, "reward": 0.837064266204834, "reward_std": 0.8726196885108948, "rewards/rollout_reward_func/mean": 0.837064266204834, "rewards/rollout_reward_func/std": 0.8726196885108948, "sampling/importance_sampling_ratio/max": 1.130234956741333, "sampling/importance_sampling_ratio/mean": 0.8110643625259399, "sampling/importance_sampling_ratio/min": 4.0137398960027326e-10, "sampling/sampling_logp_difference/max": 2.388025999069214, "sampling/sampling_logp_difference/mean": 0.315324604511261, "step": 987, "step_time": 25.944608841004083 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.4068492255173624, "epoch": 0.00988, "grad_norm": 0.1278820037841797, "kl": 0.3921603448688984, "learning_rate": 7.9996644565e-06, "loss": -0.0596, "step": 988, "step_time": 12.412908466969384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2788025909103453, "epoch": 0.00989, "frac_reward_zero_std": 0.0, "grad_norm": 0.07173425704240799, "kl": 0.5949792135506868, "learning_rate": 7.999663751219622e-06, "loss": -0.0823, "num_tokens": 24690840.0, "reward": 0.8012639880180359, "reward_std": 0.8887642621994019, "rewards/rollout_reward_func/mean": 0.8012639880180359, "rewards/rollout_reward_func/std": 0.8887643218040466, "sampling/importance_sampling_ratio/max": 1.1214581727981567, "sampling/importance_sampling_ratio/mean": 0.732921302318573, "sampling/importance_sampling_ratio/min": 7.627018931088969e-06, "sampling/sampling_logp_difference/max": 2.2917826175689697, "sampling/sampling_logp_difference/mean": 0.32130587100982666, "step": 989, "step_time": 27.160623584029963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2882041023112833, "epoch": 0.0099, "grad_norm": 0.060948554426431656, "kl": 0.559480456635356, "learning_rate": 7.99966304519885e-06, "loss": -0.0826, "step": 990, "step_time": 13.799307061039144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.862069129943848, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.524212708696723, "epoch": 0.00991, "frac_reward_zero_std": 0.5, "grad_norm": 0.05354689434170723, "kl": 0.4923878964036703, "learning_rate": 7.99966233843768e-06, "loss": -0.0263, "num_tokens": 24733876.0, "reward": 0.26026761531829834, "reward_std": 0.8282477259635925, "rewards/rollout_reward_func/mean": 0.26026761531829834, "rewards/rollout_reward_func/std": 0.8282477259635925, "sampling/importance_sampling_ratio/max": 1.0843030214309692, "sampling/importance_sampling_ratio/mean": 0.6755706071853638, "sampling/importance_sampling_ratio/min": 4.5576143747894093e-05, "sampling/sampling_logp_difference/max": 1.9863581657409668, "sampling/sampling_logp_difference/mean": 0.35108429193496704, "step": 991, "step_time": 23.28077315693372 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.012276785913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020089285913854837, "entropy": 1.5473335664719343, "epoch": 0.00992, "grad_norm": 0.04491403326392174, "kl": 0.4516163310036063, "learning_rate": 7.999661630936115e-06, "loss": -0.0265, "step": 992, "step_time": 12.224999961035792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.472357926890254, "epoch": 0.00993, "frac_reward_zero_std": 0.25, "grad_norm": 0.09323031455278397, "kl": 1.2008825540542603, "learning_rate": 7.999660922694153e-06, "loss": -0.0365, "num_tokens": 24786677.0, "reward": 0.4562743306159973, "reward_std": 0.873878538608551, "rewards/rollout_reward_func/mean": 0.4562743306159973, "rewards/rollout_reward_func/std": 0.8738784790039062, "sampling/importance_sampling_ratio/max": 1.1120715141296387, "sampling/importance_sampling_ratio/mean": 0.7710137367248535, "sampling/importance_sampling_ratio/min": 6.567443961102981e-06, "sampling/sampling_logp_difference/max": 1.9326728582382202, "sampling/sampling_logp_difference/mean": 0.31022512912750244, "step": 993, "step_time": 26.45319288407336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.484700865112245, "epoch": 0.00994, "grad_norm": 0.07870698720216751, "kl": 1.019757941365242, "learning_rate": 7.999660213711798e-06, "loss": -0.0371, "step": 994, "step_time": 13.955292627972085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.46875, "completions/mean_terminated_length": 4.379310131072998, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0128598446026444, "epoch": 0.00995, "frac_reward_zero_std": 0.0, "grad_norm": 0.12534521520137787, "kl": 0.2885991185903549, "learning_rate": 7.999659503989044e-06, "loss": -0.0526, "num_tokens": 24847478.0, "reward": 0.7202211618423462, "reward_std": 0.7912712693214417, "rewards/rollout_reward_func/mean": 0.7202211618423462, "rewards/rollout_reward_func/std": 0.7912712097167969, "sampling/importance_sampling_ratio/max": 1.736283779144287, "sampling/importance_sampling_ratio/mean": 0.9005610942840576, "sampling/importance_sampling_ratio/min": 5.0928565542562865e-06, "sampling/sampling_logp_difference/max": 1.6584573984146118, "sampling/sampling_logp_difference/mean": 0.20663617551326752, "step": 995, "step_time": 29.840922518022126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.046716159209609, "epoch": 0.00996, "grad_norm": 0.14608724415302277, "kl": 0.28606214188039303, "learning_rate": 7.999658793525895e-06, "loss": -0.0532, "step": 996, "step_time": 15.359778779034968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.03125, "completions/mean_terminated_length": 4.677419185638428, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0093003436923027, "epoch": 0.00997, "frac_reward_zero_std": 0.0, "grad_norm": 0.13384690880775452, "kl": 0.524357171729207, "learning_rate": 7.999658082322352e-06, "loss": -0.0523, "num_tokens": 24903460.0, "reward": 0.49576905369758606, "reward_std": 0.7722134590148926, "rewards/rollout_reward_func/mean": 0.49576905369758606, "rewards/rollout_reward_func/std": 0.7722134590148926, "sampling/importance_sampling_ratio/max": 1.1739468574523926, "sampling/importance_sampling_ratio/mean": 0.8438267707824707, "sampling/importance_sampling_ratio/min": 7.042866491246969e-05, "sampling/sampling_logp_difference/max": 1.9002742767333984, "sampling/sampling_logp_difference/mean": 0.21260926127433777, "step": 997, "step_time": 29.590515170042636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011101973708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011101973708719015, "entropy": 1.1304868459701538, "epoch": 0.00998, "grad_norm": 0.14239975810050964, "kl": 0.48473382368683815, "learning_rate": 7.999657370378414e-06, "loss": -0.0533, "step": 998, "step_time": 16.06967582498328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.875, "completions/mean_terminated_length": 5.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.513737380504608, "epoch": 0.00999, "frac_reward_zero_std": 0.0, "grad_norm": 0.09892933815717697, "kl": 0.5463092476129532, "learning_rate": 7.999656657694079e-06, "loss": -0.0822, "num_tokens": 24956232.0, "reward": 0.2654304504394531, "reward_std": 0.9821290373802185, "rewards/rollout_reward_func/mean": 0.2654304504394531, "rewards/rollout_reward_func/std": 0.9821290373802185, "sampling/importance_sampling_ratio/max": 1.1700724363327026, "sampling/importance_sampling_ratio/mean": 0.45436596870422363, "sampling/importance_sampling_ratio/min": 4.720854374085093e-09, "sampling/sampling_logp_difference/max": 2.482985258102417, "sampling/sampling_logp_difference/mean": 0.5200001001358032, "step": 999, "step_time": 26.63719819902326 }, { "clip_ratio/high_max": 0.011684782803058624, "clip_ratio/high_mean": 0.005842391401529312, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013654891401529312, "entropy": 2.5585206896066666, "epoch": 0.01, "grad_norm": 0.10770517587661743, "kl": 0.5027860123664141, "learning_rate": 7.99965594426935e-06, "loss": -0.0819, "step": 1000, "step_time": 13.221240183978807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 10.25, "completions/mean_terminated_length": 7.238095283508301, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.4368470907211304, "epoch": 0.01001, "frac_reward_zero_std": 0.0, "grad_norm": 0.10753044486045837, "kl": 0.220701458863914, "learning_rate": 7.999655230104228e-06, "loss": -0.0931, "num_tokens": 25013269.0, "reward": 0.034616924822330475, "reward_std": 0.8267871141433716, "rewards/rollout_reward_func/mean": 0.034616924822330475, "rewards/rollout_reward_func/std": 0.8267870545387268, "sampling/importance_sampling_ratio/max": 1.1133301258087158, "sampling/importance_sampling_ratio/mean": 0.3450254201889038, "sampling/importance_sampling_ratio/min": 8.208929358488604e-08, "sampling/sampling_logp_difference/max": 2.420990467071533, "sampling/sampling_logp_difference/mean": 0.561111330986023, "step": 1001, "step_time": 34.00307848202647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4351938366889954, "epoch": 0.01002, "grad_norm": 0.11652161180973053, "kl": 0.2187406411394477, "learning_rate": 7.999654515198711e-06, "loss": -0.0929, "step": 1002, "step_time": 13.784607798967045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 5.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.273480474948883, "epoch": 0.01003, "frac_reward_zero_std": 0.0, "grad_norm": 0.07983560860157013, "kl": 0.22912293672561646, "learning_rate": 7.999653799552799e-06, "loss": -0.0832, "num_tokens": 25072076.0, "reward": 0.6684910655021667, "reward_std": 0.9325807094573975, "rewards/rollout_reward_func/mean": 0.6684910655021667, "rewards/rollout_reward_func/std": 0.9325807094573975, "sampling/importance_sampling_ratio/max": 1.0582847595214844, "sampling/importance_sampling_ratio/mean": 0.5348913073539734, "sampling/importance_sampling_ratio/min": 4.251266091159778e-06, "sampling/sampling_logp_difference/max": 2.225867748260498, "sampling/sampling_logp_difference/mean": 0.3728792667388916, "step": 1003, "step_time": 30.24646631194628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.248265229165554, "epoch": 0.01004, "grad_norm": 0.0775284618139267, "kl": 0.23284409195184708, "learning_rate": 7.999653083166494e-06, "loss": -0.0835, "step": 1004, "step_time": 13.694402160967002 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 5.559999942779541, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.532159373164177, "epoch": 0.01005, "frac_reward_zero_std": 0.0, "grad_norm": 0.19782020151615143, "kl": 0.3365274630486965, "learning_rate": 7.999652366039795e-06, "loss": -0.0527, "num_tokens": 25133479.0, "reward": 0.27915775775909424, "reward_std": 0.8473196625709534, "rewards/rollout_reward_func/mean": 0.27915775775909424, "rewards/rollout_reward_func/std": 0.8473196029663086, "sampling/importance_sampling_ratio/max": 1.1820564270019531, "sampling/importance_sampling_ratio/mean": 0.542805552482605, "sampling/importance_sampling_ratio/min": 1.0816205708863436e-08, "sampling/sampling_logp_difference/max": 2.5828497409820557, "sampling/sampling_logp_difference/mean": 0.4405112564563751, "step": 1005, "step_time": 33.50100282696076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4809859916567802, "epoch": 0.01006, "grad_norm": 0.15501032769680023, "kl": 0.36894582118839025, "learning_rate": 7.999651648172701e-06, "loss": -0.0538, "step": 1006, "step_time": 16.046363484027097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4009408801794052, "epoch": 0.01007, "frac_reward_zero_std": 0.0, "grad_norm": 0.23452141880989075, "kl": 0.2878057286143303, "learning_rate": 7.999650929565216e-06, "loss": -0.0737, "num_tokens": 25182078.0, "reward": 0.4954906105995178, "reward_std": 0.843955934047699, "rewards/rollout_reward_func/mean": 0.4954906105995178, "rewards/rollout_reward_func/std": 0.8439559936523438, "sampling/importance_sampling_ratio/max": 1.0634119510650635, "sampling/importance_sampling_ratio/mean": 0.7093083262443542, "sampling/importance_sampling_ratio/min": 9.601655619917437e-05, "sampling/sampling_logp_difference/max": 1.9535748958587646, "sampling/sampling_logp_difference/mean": 0.2337462604045868, "step": 1007, "step_time": 22.80010672204662 }, { "clip_ratio/high_max": 0.125, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 1.2296659052371979, "epoch": 0.01008, "grad_norm": 0.0572548434138298, "kl": 0.2919000927358866, "learning_rate": 7.999650210217335e-06, "loss": -0.0753, "step": 1008, "step_time": 11.565075374004664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.464285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3127407021820545, "epoch": 0.01009, "frac_reward_zero_std": 0.0, "grad_norm": 0.20816972851753235, "kl": 0.30015850253403187, "learning_rate": 7.999649490129062e-06, "loss": -0.0688, "num_tokens": 25235827.0, "reward": -0.030553504824638367, "reward_std": 0.7334412932395935, "rewards/rollout_reward_func/mean": -0.030553504824638367, "rewards/rollout_reward_func/std": 0.7334412932395935, "sampling/importance_sampling_ratio/max": 1.5170739889144897, "sampling/importance_sampling_ratio/mean": 0.8402962684631348, "sampling/importance_sampling_ratio/min": 0.00014124118024483323, "sampling/sampling_logp_difference/max": 1.582235336303711, "sampling/sampling_logp_difference/mean": 0.2246939241886139, "step": 1009, "step_time": 26.833243258995935 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0303819440305233, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0303819440305233, "entropy": 1.1573157384991646, "epoch": 0.0101, "grad_norm": 0.06254486739635468, "kl": 0.3132360205054283, "learning_rate": 7.999648769300397e-06, "loss": -0.0705, "step": 1010, "step_time": 13.836454290983966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1031665047630668, "epoch": 0.01011, "frac_reward_zero_std": 0.25, "grad_norm": 0.02769267000257969, "kl": 0.39613919518887997, "learning_rate": 7.999648047731338e-06, "loss": -0.0682, "num_tokens": 25288268.0, "reward": 0.564225971698761, "reward_std": 0.8433457016944885, "rewards/rollout_reward_func/mean": 0.564225971698761, "rewards/rollout_reward_func/std": 0.8433457016944885, "sampling/importance_sampling_ratio/max": 1.2814617156982422, "sampling/importance_sampling_ratio/mean": 0.8032741546630859, "sampling/importance_sampling_ratio/min": 0.0002310886193299666, "sampling/sampling_logp_difference/max": 1.8805806636810303, "sampling/sampling_logp_difference/mean": 0.22557476162910461, "step": 1011, "step_time": 31.525500422983896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.084548780694604, "epoch": 0.01012, "grad_norm": 0.02349921315908432, "kl": 0.4204964432865381, "learning_rate": 7.999647325421885e-06, "loss": -0.0684, "step": 1012, "step_time": 15.95904022900504 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.666666507720947, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7293214201927185, "epoch": 0.01013, "frac_reward_zero_std": 0.0, "grad_norm": 0.25780969858169556, "kl": 0.37125479988753796, "learning_rate": 7.999646602372042e-06, "loss": -0.0699, "num_tokens": 25346384.0, "reward": 0.6058610677719116, "reward_std": 0.906910240650177, "rewards/rollout_reward_func/mean": 0.6058610677719116, "rewards/rollout_reward_func/std": 0.9069101810455322, "sampling/importance_sampling_ratio/max": 1.4349218606948853, "sampling/importance_sampling_ratio/mean": 0.7444782853126526, "sampling/importance_sampling_ratio/min": 1.9624354224978546e-12, "sampling/sampling_logp_difference/max": 3.142092704772949, "sampling/sampling_logp_difference/mean": 0.38985177874565125, "step": 1013, "step_time": 27.872448764013825 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.6965338215231895, "epoch": 0.01014, "grad_norm": 0.07542229443788528, "kl": 0.37409085035324097, "learning_rate": 7.999645878581808e-06, "loss": -0.0705, "step": 1014, "step_time": 13.587014131975593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1488780975341797, "epoch": 0.01015, "frac_reward_zero_std": 0.0, "grad_norm": 0.04368795454502106, "kl": 0.7740523125976324, "learning_rate": 7.99964515405118e-06, "loss": -0.0896, "num_tokens": 25412403.0, "reward": 0.9309349060058594, "reward_std": 0.7366132736206055, "rewards/rollout_reward_func/mean": 0.9309349060058594, "rewards/rollout_reward_func/std": 0.7366132736206055, "sampling/importance_sampling_ratio/max": 1.1629129648208618, "sampling/importance_sampling_ratio/mean": 0.8272961378097534, "sampling/importance_sampling_ratio/min": 1.821873229346238e-05, "sampling/sampling_logp_difference/max": 1.7601732015609741, "sampling/sampling_logp_difference/mean": 0.25226837396621704, "step": 1015, "step_time": 34.3305019599793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1368798101320863, "epoch": 0.01016, "grad_norm": 0.04176396504044533, "kl": 0.7967377584427595, "learning_rate": 7.999644428780159e-06, "loss": -0.0897, "step": 1016, "step_time": 18.173197372962022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.222222328186035, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3106029191985726, "epoch": 0.01017, "frac_reward_zero_std": 0.0, "grad_norm": 0.08937916904687881, "kl": 0.3002934791147709, "learning_rate": 7.999643702768747e-06, "loss": -0.0417, "num_tokens": 25463447.0, "reward": 0.3787499666213989, "reward_std": 0.8842175006866455, "rewards/rollout_reward_func/mean": 0.3787499666213989, "rewards/rollout_reward_func/std": 0.8842174410820007, "sampling/importance_sampling_ratio/max": 1.308382272720337, "sampling/importance_sampling_ratio/mean": 0.7596741318702698, "sampling/importance_sampling_ratio/min": 7.898071999079548e-06, "sampling/sampling_logp_difference/max": 2.283466100692749, "sampling/sampling_logp_difference/mean": 0.3224884867668152, "step": 1017, "step_time": 24.2578164919978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.301754773594439, "epoch": 0.01018, "grad_norm": 0.09593816846609116, "kl": 0.2964946571737528, "learning_rate": 7.999642976016945e-06, "loss": -0.0416, "step": 1018, "step_time": 12.624610034981743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.655172348022461, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2894747946411371, "epoch": 0.01019, "frac_reward_zero_std": 0.0, "grad_norm": 0.023055177181959152, "kl": 0.39732776768505573, "learning_rate": 7.99964224852475e-06, "loss": -0.0873, "num_tokens": 25507165.0, "reward": 0.3939964175224304, "reward_std": 0.7928268313407898, "rewards/rollout_reward_func/mean": 0.3939964175224304, "rewards/rollout_reward_func/std": 0.7928268313407898, "sampling/importance_sampling_ratio/max": 1.1784088611602783, "sampling/importance_sampling_ratio/mean": 0.90118807554245, "sampling/importance_sampling_ratio/min": 4.279271692553266e-08, "sampling/sampling_logp_difference/max": 2.5433061122894287, "sampling/sampling_logp_difference/mean": 0.36397644877433777, "step": 1019, "step_time": 19.233597996033495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2872156854718924, "epoch": 0.0102, "grad_norm": 0.022012416273355484, "kl": 0.407942209392786, "learning_rate": 7.999641520292164e-06, "loss": -0.0873, "step": 1020, "step_time": 10.200100943009602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.41023494908586144, "epoch": 0.01021, "frac_reward_zero_std": 0.25, "grad_norm": 0.0286050233989954, "kl": 0.2979819905012846, "learning_rate": 7.999640791319187e-06, "loss": -0.0099, "num_tokens": 25559757.0, "reward": 1.0365087985992432, "reward_std": 0.5527331233024597, "rewards/rollout_reward_func/mean": 1.0365087985992432, "rewards/rollout_reward_func/std": 0.5527330636978149, "sampling/importance_sampling_ratio/max": 1.1225279569625854, "sampling/importance_sampling_ratio/mean": 0.976063072681427, "sampling/importance_sampling_ratio/min": 8.293800783576444e-05, "sampling/sampling_logp_difference/max": 1.335959553718567, "sampling/sampling_logp_difference/mean": 0.0846288651227951, "step": 1021, "step_time": 25.74241275404347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40944062173366547, "epoch": 0.01022, "grad_norm": 0.02903112582862377, "kl": 0.29965456388890743, "learning_rate": 7.999640061605819e-06, "loss": -0.0098, "step": 1022, "step_time": 14.094351366016781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.71875, "completions/mean_terminated_length": 4.3548383712768555, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5105734653770924, "epoch": 0.01023, "frac_reward_zero_std": 0.0, "grad_norm": 0.10600956529378891, "kl": 0.29764452390372753, "learning_rate": 7.999639331152063e-06, "loss": -0.0238, "num_tokens": 25612499.0, "reward": 0.12532562017440796, "reward_std": 0.6701933145523071, "rewards/rollout_reward_func/mean": 0.12532562017440796, "rewards/rollout_reward_func/std": 0.6701933145523071, "sampling/importance_sampling_ratio/max": 1.1154471635818481, "sampling/importance_sampling_ratio/mean": 0.9359226226806641, "sampling/importance_sampling_ratio/min": 0.0009230133146047592, "sampling/sampling_logp_difference/max": 1.8009154796600342, "sampling/sampling_logp_difference/mean": 0.11467942595481873, "step": 1023, "step_time": 23.015171970968368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.5088886232115328, "epoch": 0.01024, "grad_norm": 0.07599986344575882, "kl": 0.304377106949687, "learning_rate": 7.999638599957913e-06, "loss": -0.0241, "step": 1024, "step_time": 12.662964766001096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 4.043478488922119, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.859477423131466, "epoch": 0.01025, "frac_reward_zero_std": 0.0, "grad_norm": 0.08385756611824036, "kl": 0.244257771410048, "learning_rate": 7.999637868023374e-06, "loss": -0.087, "num_tokens": 25681634.0, "reward": 0.363259494304657, "reward_std": 0.869448184967041, "rewards/rollout_reward_func/mean": 0.363259494304657, "rewards/rollout_reward_func/std": 0.8694482445716858, "sampling/importance_sampling_ratio/max": 1.2684147357940674, "sampling/importance_sampling_ratio/mean": 0.7232117056846619, "sampling/importance_sampling_ratio/min": 7.606502094859025e-07, "sampling/sampling_logp_difference/max": 2.745006561279297, "sampling/sampling_logp_difference/mean": 0.36312538385391235, "step": 1025, "step_time": 34.06568338701618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8562577292323112, "epoch": 0.01026, "grad_norm": 0.07754570990800858, "kl": 0.25238298811018467, "learning_rate": 7.999637135348445e-06, "loss": -0.0871, "step": 1026, "step_time": 15.804064376978204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8537971614859998, "epoch": 0.01027, "frac_reward_zero_std": 0.0, "grad_norm": 0.04608182981610298, "kl": 0.2393227219581604, "learning_rate": 7.999636401933128e-06, "loss": -0.0528, "num_tokens": 25742455.0, "reward": 0.8767220973968506, "reward_std": 0.7889049053192139, "rewards/rollout_reward_func/mean": 0.8767220973968506, "rewards/rollout_reward_func/std": 0.7889048457145691, "sampling/importance_sampling_ratio/max": 1.1056331396102905, "sampling/importance_sampling_ratio/mean": 0.8739585876464844, "sampling/importance_sampling_ratio/min": 4.751557298732223e-06, "sampling/sampling_logp_difference/max": 1.7711451053619385, "sampling/sampling_logp_difference/mean": 0.21062803268432617, "step": 1027, "step_time": 27.130076366040157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.8487925855442882, "epoch": 0.01028, "grad_norm": 0.05584335699677467, "kl": 0.23444640450179577, "learning_rate": 7.999635667777419e-06, "loss": -0.0528, "step": 1028, "step_time": 14.062109091988532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.90625, "completions/mean_terminated_length": 4.548387050628662, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.607589312363416, "epoch": 0.01029, "frac_reward_zero_std": 0.25, "grad_norm": 0.06220438703894615, "kl": 0.2643270641565323, "learning_rate": 7.99963493288132e-06, "loss": -0.0457, "num_tokens": 25785573.0, "reward": 0.6184359788894653, "reward_std": 0.9346098303794861, "rewards/rollout_reward_func/mean": 0.6184359788894653, "rewards/rollout_reward_func/std": 0.9346098303794861, "sampling/importance_sampling_ratio/max": 1.1519911289215088, "sampling/importance_sampling_ratio/mean": 0.9471458196640015, "sampling/importance_sampling_ratio/min": 0.0005157383857294917, "sampling/sampling_logp_difference/max": 1.6820532083511353, "sampling/sampling_logp_difference/mean": 0.13978925347328186, "step": 1029, "step_time": 22.977891905989964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6085125091485679, "epoch": 0.0103, "grad_norm": 0.0672125369310379, "kl": 0.25852163694798946, "learning_rate": 7.999634197244832e-06, "loss": -0.0458, "step": 1030, "step_time": 12.293424344970845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4394264817237854, "epoch": 0.01031, "frac_reward_zero_std": 0.0, "grad_norm": 0.10023626685142517, "kl": 0.523641450330615, "learning_rate": 7.999633460867956e-06, "loss": -0.1008, "num_tokens": 25850814.0, "reward": 0.7455192804336548, "reward_std": 0.8782613277435303, "rewards/rollout_reward_func/mean": 0.7455192804336548, "rewards/rollout_reward_func/std": 0.8782613277435303, "sampling/importance_sampling_ratio/max": 1.093123435974121, "sampling/importance_sampling_ratio/mean": 0.7077658176422119, "sampling/importance_sampling_ratio/min": 3.412650585232768e-06, "sampling/sampling_logp_difference/max": 2.4854817390441895, "sampling/sampling_logp_difference/mean": 0.30328118801116943, "step": 1031, "step_time": 32.08520502896863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4326853435486555, "epoch": 0.01032, "grad_norm": 0.09809757024049759, "kl": 0.4999159835278988, "learning_rate": 7.99963272375069e-06, "loss": -0.1009, "step": 1032, "step_time": 15.1409194280277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.920000076293945, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9592608660459518, "epoch": 0.01033, "frac_reward_zero_std": 0.0, "grad_norm": 0.10829456895589828, "kl": 0.34046126157045364, "learning_rate": 7.999631985893037e-06, "loss": -0.0845, "num_tokens": 25900575.0, "reward": 0.639772355556488, "reward_std": 1.0116432905197144, "rewards/rollout_reward_func/mean": 0.639772355556488, "rewards/rollout_reward_func/std": 1.0116432905197144, "sampling/importance_sampling_ratio/max": 1.1082663536071777, "sampling/importance_sampling_ratio/mean": 0.6664931774139404, "sampling/importance_sampling_ratio/min": 2.8849651556583922e-08, "sampling/sampling_logp_difference/max": 2.111692428588867, "sampling/sampling_logp_difference/mean": 0.3829815983772278, "step": 1033, "step_time": 27.21582012900035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.958114042878151, "epoch": 0.01034, "grad_norm": 0.1153850182890892, "kl": 0.33973973244428635, "learning_rate": 7.999631247294993e-06, "loss": -0.0842, "step": 1034, "step_time": 13.562320559023647 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0795778706669807, "epoch": 0.01035, "frac_reward_zero_std": 0.25, "grad_norm": 0.05220704525709152, "kl": 0.2857478391379118, "learning_rate": 7.99963050795656e-06, "loss": -0.0629, "num_tokens": 25944843.0, "reward": 0.5335413217544556, "reward_std": 0.9376468062400818, "rewards/rollout_reward_func/mean": 0.5335413217544556, "rewards/rollout_reward_func/std": 0.937646746635437, "sampling/importance_sampling_ratio/max": 1.224579095840454, "sampling/importance_sampling_ratio/mean": 0.899223804473877, "sampling/importance_sampling_ratio/min": 5.832177066622535e-06, "sampling/sampling_logp_difference/max": 2.121814250946045, "sampling/sampling_logp_difference/mean": 0.24329477548599243, "step": 1035, "step_time": 24.426564073888585 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.0772689543664455, "epoch": 0.01036, "grad_norm": 0.0504940003156662, "kl": 0.28516583517193794, "learning_rate": 7.999629767877742e-06, "loss": -0.063, "step": 1036, "step_time": 12.435735139006283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6979668783023953, "epoch": 0.01037, "frac_reward_zero_std": 0.0, "grad_norm": 0.0784522145986557, "kl": 0.351235656067729, "learning_rate": 7.999629027058533e-06, "loss": -0.0387, "num_tokens": 26003277.0, "reward": 0.1141006126999855, "reward_std": 0.6216707229614258, "rewards/rollout_reward_func/mean": 0.1141006126999855, "rewards/rollout_reward_func/std": 0.621670663356781, "sampling/importance_sampling_ratio/max": 1.2959201335906982, "sampling/importance_sampling_ratio/mean": 0.9437395334243774, "sampling/importance_sampling_ratio/min": 4.153413465246558e-05, "sampling/sampling_logp_difference/max": 1.5611910820007324, "sampling/sampling_logp_difference/mean": 0.17587681114673615, "step": 1037, "step_time": 27.246234923019074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6985842538997531, "epoch": 0.01038, "grad_norm": 0.07896026968955994, "kl": 0.35157251730561256, "learning_rate": 7.999628285498937e-06, "loss": -0.0387, "step": 1038, "step_time": 14.368716592056444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.4347825050354, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9497490115463734, "epoch": 0.01039, "frac_reward_zero_std": 0.0, "grad_norm": 0.08438494056463242, "kl": 0.33453385066241026, "learning_rate": 7.999627543198954e-06, "loss": -0.0729, "num_tokens": 26057851.0, "reward": 0.34987694025039673, "reward_std": 0.8435917496681213, "rewards/rollout_reward_func/mean": 0.34987694025039673, "rewards/rollout_reward_func/std": 0.8435918092727661, "sampling/importance_sampling_ratio/max": 1.2509719133377075, "sampling/importance_sampling_ratio/mean": 0.6638823747634888, "sampling/importance_sampling_ratio/min": 1.1480596384672026e-07, "sampling/sampling_logp_difference/max": 2.292346715927124, "sampling/sampling_logp_difference/mean": 0.3970002233982086, "step": 1039, "step_time": 26.48770520198741 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "entropy": 1.9490411467850208, "epoch": 0.0104, "grad_norm": 0.07634957134723663, "kl": 0.3172117853537202, "learning_rate": 7.999626800158583e-06, "loss": -0.0731, "step": 1040, "step_time": 13.906647798023187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.814815044403076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.264735884964466, "epoch": 0.01041, "frac_reward_zero_std": 0.0, "grad_norm": 0.05892793461680412, "kl": 0.7367868311703205, "learning_rate": 7.999626056377823e-06, "loss": -0.0702, "num_tokens": 26117492.0, "reward": 0.713205099105835, "reward_std": 0.865324854850769, "rewards/rollout_reward_func/mean": 0.713205099105835, "rewards/rollout_reward_func/std": 0.8653249144554138, "sampling/importance_sampling_ratio/max": 1.2554055452346802, "sampling/importance_sampling_ratio/mean": 0.8152981996536255, "sampling/importance_sampling_ratio/min": 1.822947342589032e-05, "sampling/sampling_logp_difference/max": 2.139857053756714, "sampling/sampling_logp_difference/mean": 0.31003549695014954, "step": 1041, "step_time": 28.915927861002274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.267833492718637, "epoch": 0.01042, "grad_norm": 0.0631469339132309, "kl": 0.7355336872860789, "learning_rate": 7.99962531185668e-06, "loss": -0.0702, "step": 1042, "step_time": 14.032750296028098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6011845814064145, "epoch": 0.01043, "frac_reward_zero_std": 0.0, "grad_norm": 0.02480051852762699, "kl": 0.2953355275094509, "learning_rate": 7.999624566595146e-06, "loss": -0.0962, "num_tokens": 26171373.0, "reward": 0.811791718006134, "reward_std": 0.8458827137947083, "rewards/rollout_reward_func/mean": 0.811791718006134, "rewards/rollout_reward_func/std": 0.8458827137947083, "sampling/importance_sampling_ratio/max": 1.1574232578277588, "sampling/importance_sampling_ratio/mean": 0.795282781124115, "sampling/importance_sampling_ratio/min": 8.3381479498712e-09, "sampling/sampling_logp_difference/max": 2.4358181953430176, "sampling/sampling_logp_difference/mean": 0.36993101239204407, "step": 1043, "step_time": 29.58064519200707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6049338388256729, "epoch": 0.01044, "grad_norm": 0.024719564244151115, "kl": 0.2964727282524109, "learning_rate": 7.999623820593227e-06, "loss": -0.0961, "step": 1044, "step_time": 13.640011448966106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.279999732971191, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9774752939119935, "epoch": 0.01045, "frac_reward_zero_std": 0.0, "grad_norm": 0.015938807278871536, "kl": 0.2672564499080181, "learning_rate": 7.99962307385092e-06, "loss": -0.0942, "num_tokens": 26219243.0, "reward": 0.5984796285629272, "reward_std": 0.9965600967407227, "rewards/rollout_reward_func/mean": 0.5984796285629272, "rewards/rollout_reward_func/std": 0.9965600371360779, "sampling/importance_sampling_ratio/max": 1.1761327981948853, "sampling/importance_sampling_ratio/mean": 0.7001282572746277, "sampling/importance_sampling_ratio/min": 3.438502593766657e-09, "sampling/sampling_logp_difference/max": 2.442091464996338, "sampling/sampling_logp_difference/mean": 0.4183262586593628, "step": 1045, "step_time": 26.292319083033362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.977855141274631, "epoch": 0.01046, "grad_norm": 0.015354039147496223, "kl": 0.2709483224898577, "learning_rate": 7.999622326368228e-06, "loss": -0.0942, "step": 1046, "step_time": 12.681699017062783 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.458333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8341478519141674, "epoch": 0.01047, "frac_reward_zero_std": 0.0, "grad_norm": 0.048816781491041183, "kl": 0.4797322154045105, "learning_rate": 7.999621578145148e-06, "loss": -0.077, "num_tokens": 26265516.0, "reward": 0.578135073184967, "reward_std": 0.9666311144828796, "rewards/rollout_reward_func/mean": 0.578135073184967, "rewards/rollout_reward_func/std": 0.9666311144828796, "sampling/importance_sampling_ratio/max": 1.2030999660491943, "sampling/importance_sampling_ratio/mean": 0.7390655875205994, "sampling/importance_sampling_ratio/min": 5.766940702756074e-08, "sampling/sampling_logp_difference/max": 2.4289326667785645, "sampling/sampling_logp_difference/mean": 0.35738545656204224, "step": 1047, "step_time": 25.21401856097509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.833910258486867, "epoch": 0.01048, "grad_norm": 0.04820830747485161, "kl": 0.4938307795673609, "learning_rate": 7.999620829181686e-06, "loss": -0.0771, "step": 1048, "step_time": 12.480064693052555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.107142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8388865967281163, "epoch": 0.01049, "frac_reward_zero_std": 0.0, "grad_norm": 0.022397929802536964, "kl": 0.2981364391744137, "learning_rate": 7.999620079477833e-06, "loss": -0.0587, "num_tokens": 26312190.0, "reward": 1.1084498167037964, "reward_std": 0.8335981965065002, "rewards/rollout_reward_func/mean": 1.1084498167037964, "rewards/rollout_reward_func/std": 0.8335981369018555, "sampling/importance_sampling_ratio/max": 1.2045842409133911, "sampling/importance_sampling_ratio/mean": 0.8886469602584839, "sampling/importance_sampling_ratio/min": 0.00013766663323622197, "sampling/sampling_logp_difference/max": 1.8224669694900513, "sampling/sampling_logp_difference/mean": 0.19429033994674683, "step": 1049, "step_time": 27.60062896096497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8385606138035655, "epoch": 0.0105, "grad_norm": 0.021302802488207817, "kl": 0.3023253809660673, "learning_rate": 7.999619329033598e-06, "loss": -0.0588, "step": 1050, "step_time": 15.981967739993706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.15625, "completions/mean_terminated_length": 6.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.3886390924453735, "epoch": 0.01051, "frac_reward_zero_std": 0.0, "grad_norm": 0.12743279337882996, "kl": 0.11407631752081215, "learning_rate": 7.999618577848975e-06, "loss": -0.0674, "num_tokens": 26367711.0, "reward": -0.35548192262649536, "reward_std": 0.6817828416824341, "rewards/rollout_reward_func/mean": -0.35548192262649536, "rewards/rollout_reward_func/std": 0.6817828416824341, "sampling/importance_sampling_ratio/max": 1.886876106262207, "sampling/importance_sampling_ratio/mean": 0.2894814610481262, "sampling/importance_sampling_ratio/min": 8.157281428111673e-08, "sampling/sampling_logp_difference/max": 2.3707480430603027, "sampling/sampling_logp_difference/mean": 0.5245403051376343, "step": 1051, "step_time": 30.08113122897339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.389799490571022, "epoch": 0.01052, "grad_norm": 0.10819672048091888, "kl": 0.12089178478345275, "learning_rate": 7.999617825923968e-06, "loss": -0.068, "step": 1052, "step_time": 12.681326596997678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.076923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1120661227032542, "epoch": 0.01053, "frac_reward_zero_std": 0.0, "grad_norm": 0.059359874576330185, "kl": 0.347964771091938, "learning_rate": 7.999617073258575e-06, "loss": -0.0729, "num_tokens": 26417136.0, "reward": 0.7694432735443115, "reward_std": 0.8053447008132935, "rewards/rollout_reward_func/mean": 0.7694432735443115, "rewards/rollout_reward_func/std": 0.8053447008132935, "sampling/importance_sampling_ratio/max": 1.4264754056930542, "sampling/importance_sampling_ratio/mean": 0.8344230651855469, "sampling/importance_sampling_ratio/min": 5.0487390268472154e-08, "sampling/sampling_logp_difference/max": 2.0694384574890137, "sampling/sampling_logp_difference/mean": 0.21204398572444916, "step": 1053, "step_time": 28.96720981100225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1070093214511871, "epoch": 0.01054, "grad_norm": 0.06435319781303406, "kl": 0.37175029143691063, "learning_rate": 7.999616319852798e-06, "loss": -0.073, "step": 1054, "step_time": 14.266356890002498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.433333396911621, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9716001562774181, "epoch": 0.01055, "frac_reward_zero_std": 0.25, "grad_norm": 0.0391727052628994, "kl": 0.5182013865560293, "learning_rate": 7.999615565706636e-06, "loss": -0.0542, "num_tokens": 26460724.0, "reward": 0.638779878616333, "reward_std": 0.8878429532051086, "rewards/rollout_reward_func/mean": 0.638779878616333, "rewards/rollout_reward_func/std": 0.8878428936004639, "sampling/importance_sampling_ratio/max": 1.2493047714233398, "sampling/importance_sampling_ratio/mean": 0.786449134349823, "sampling/importance_sampling_ratio/min": 0.0004088733985554427, "sampling/sampling_logp_difference/max": 2.1537694931030273, "sampling/sampling_logp_difference/mean": 0.22711241245269775, "step": 1055, "step_time": 22.41965575600625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9656623024493456, "epoch": 0.01056, "grad_norm": 0.044729772955179214, "kl": 0.5104060415178537, "learning_rate": 7.999614810820089e-06, "loss": -0.0541, "step": 1056, "step_time": 12.013741030998062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.518518447875977, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5544773004949093, "epoch": 0.01057, "frac_reward_zero_std": 0.0, "grad_norm": 0.1665910929441452, "kl": 0.7701618727296591, "learning_rate": 7.999614055193157e-06, "loss": -0.074, "num_tokens": 26519268.0, "reward": 0.20189061760902405, "reward_std": 0.6640229225158691, "rewards/rollout_reward_func/mean": 0.20189061760902405, "rewards/rollout_reward_func/std": 0.6640229225158691, "sampling/importance_sampling_ratio/max": 1.2856037616729736, "sampling/importance_sampling_ratio/mean": 0.7080177068710327, "sampling/importance_sampling_ratio/min": 5.117909313412383e-05, "sampling/sampling_logp_difference/max": 2.087937355041504, "sampling/sampling_logp_difference/mean": 0.32302239537239075, "step": 1057, "step_time": 28.174330150999594 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.5518646985292435, "epoch": 0.01058, "grad_norm": 0.11514497548341751, "kl": 0.7282306775450706, "learning_rate": 7.999613298825842e-06, "loss": -0.0747, "step": 1058, "step_time": 13.95350359895383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1903227884322405, "epoch": 0.01059, "frac_reward_zero_std": 0.0, "grad_norm": 0.16179914772510529, "kl": 0.5923201041296124, "learning_rate": 7.999612541718143e-06, "loss": -0.0611, "num_tokens": 26582394.0, "reward": 0.5644488334655762, "reward_std": 0.7535356283187866, "rewards/rollout_reward_func/mean": 0.5644488334655762, "rewards/rollout_reward_func/std": 0.7535356283187866, "sampling/importance_sampling_ratio/max": 1.856963038444519, "sampling/importance_sampling_ratio/mean": 0.8940020799636841, "sampling/importance_sampling_ratio/min": 2.8282383368605224e-07, "sampling/sampling_logp_difference/max": 1.982658863067627, "sampling/sampling_logp_difference/mean": 0.2805423140525818, "step": 1059, "step_time": 31.587256184982834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1872525103390217, "epoch": 0.0106, "grad_norm": 0.1562565118074417, "kl": 0.6393430922180414, "learning_rate": 7.99961178387006e-06, "loss": -0.0608, "step": 1060, "step_time": 16.25551612401614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.23676354391500354, "epoch": 0.01061, "frac_reward_zero_std": 0.75, "grad_norm": 0.04762514308094978, "kl": 0.27119675651192665, "learning_rate": 7.999611025281593e-06, "loss": -0.0207, "num_tokens": 26633193.0, "reward": 1.2937867641448975, "reward_std": 0.38665100932121277, "rewards/rollout_reward_func/mean": 1.2937867641448975, "rewards/rollout_reward_func/std": 0.38665100932121277, "sampling/importance_sampling_ratio/max": 1.0701961517333984, "sampling/importance_sampling_ratio/mean": 0.9792904853820801, "sampling/importance_sampling_ratio/min": 0.00031002683681435883, "sampling/sampling_logp_difference/max": 1.0869003534317017, "sampling/sampling_logp_difference/mean": 0.06710917502641678, "step": 1061, "step_time": 21.72836178104626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23824567953124642, "epoch": 0.01062, "grad_norm": 0.0075493440963327885, "kl": 0.278557563200593, "learning_rate": 7.999610265952743e-06, "loss": -0.0208, "step": 1062, "step_time": 11.758658885984914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 4.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7536249933764338, "epoch": 0.01063, "frac_reward_zero_std": 0.25, "grad_norm": 0.021945824846625328, "kl": 0.45513145718723536, "learning_rate": 7.99960950588351e-06, "loss": -0.0601, "num_tokens": 26684320.0, "reward": 0.48931246995925903, "reward_std": 1.0174270868301392, "rewards/rollout_reward_func/mean": 0.48931246995925903, "rewards/rollout_reward_func/std": 1.0174270868301392, "sampling/importance_sampling_ratio/max": 1.098853588104248, "sampling/importance_sampling_ratio/mean": 0.6274430751800537, "sampling/importance_sampling_ratio/min": 0.00012858683476224542, "sampling/sampling_logp_difference/max": 2.170872449874878, "sampling/sampling_logp_difference/mean": 0.3087206482887268, "step": 1063, "step_time": 26.210703880002256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7535760956816375, "epoch": 0.01064, "grad_norm": 0.021673711016774178, "kl": 0.4572009574621916, "learning_rate": 7.999608745073893e-06, "loss": -0.0601, "step": 1064, "step_time": 13.683984178031096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5788431689143181, "epoch": 0.01065, "frac_reward_zero_std": 0.25, "grad_norm": 0.0185138750821352, "kl": 0.22434347309172153, "learning_rate": 7.999607983523892e-06, "loss": -0.0343, "num_tokens": 26733587.0, "reward": 0.6250417828559875, "reward_std": 0.8556649088859558, "rewards/rollout_reward_func/mean": 0.6250417828559875, "rewards/rollout_reward_func/std": 0.8556649684906006, "sampling/importance_sampling_ratio/max": 1.964563250541687, "sampling/importance_sampling_ratio/mean": 0.9822453856468201, "sampling/importance_sampling_ratio/min": 8.210518899431918e-06, "sampling/sampling_logp_difference/max": 2.01294207572937, "sampling/sampling_logp_difference/mean": 0.1392151415348053, "step": 1065, "step_time": 28.47803044799366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5785665037110448, "epoch": 0.01066, "grad_norm": 0.01529177837073803, "kl": 0.22452315222471952, "learning_rate": 7.999607221233511e-06, "loss": -0.0343, "step": 1066, "step_time": 15.177515269984724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.78125, "completions/mean_terminated_length": 4.033333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5377941285260022, "epoch": 0.01067, "frac_reward_zero_std": 0.25, "grad_norm": 0.2359813153743744, "kl": 0.3274244926869869, "learning_rate": 7.999606458202746e-06, "loss": -0.0183, "num_tokens": 26787171.0, "reward": 0.9530245065689087, "reward_std": 0.7617155909538269, "rewards/rollout_reward_func/mean": 0.9530245065689087, "rewards/rollout_reward_func/std": 0.7617155909538269, "sampling/importance_sampling_ratio/max": 2.0047802925109863, "sampling/importance_sampling_ratio/mean": 0.9883561134338379, "sampling/importance_sampling_ratio/min": 5.407884600572288e-05, "sampling/sampling_logp_difference/max": 1.6707454919815063, "sampling/sampling_logp_difference/mean": 0.1486213207244873, "step": 1067, "step_time": 25.602051017980557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5476522101089358, "epoch": 0.01068, "grad_norm": 0.25227460265159607, "kl": 0.32276763021945953, "learning_rate": 7.999605694431597e-06, "loss": -0.0193, "step": 1068, "step_time": 13.711854700988624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.741799701936543, "epoch": 0.01069, "frac_reward_zero_std": 0.0, "grad_norm": 0.13932518661022186, "kl": 0.4807331208139658, "learning_rate": 7.99960492992007e-06, "loss": -0.0668, "num_tokens": 26834364.0, "reward": 0.8916168808937073, "reward_std": 0.8400612473487854, "rewards/rollout_reward_func/mean": 0.8916168808937073, "rewards/rollout_reward_func/std": 0.8400612473487854, "sampling/importance_sampling_ratio/max": 1.142200231552124, "sampling/importance_sampling_ratio/mean": 0.8335471153259277, "sampling/importance_sampling_ratio/min": 0.000559779058676213, "sampling/sampling_logp_difference/max": 1.7908302545547485, "sampling/sampling_logp_difference/mean": 0.1926230788230896, "step": 1069, "step_time": 25.41831825202098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.7677491214126348, "epoch": 0.0107, "grad_norm": 0.038504112511873245, "kl": 0.5097724217921495, "learning_rate": 7.999604164668158e-06, "loss": -0.0671, "step": 1070, "step_time": 12.757163136993768 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 4.290322303771973, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7931600231677294, "epoch": 0.01071, "frac_reward_zero_std": 0.25, "grad_norm": 0.05752637982368469, "kl": 0.5124847516417503, "learning_rate": 7.999603398675864e-06, "loss": -0.038, "num_tokens": 26886822.0, "reward": 0.8821181058883667, "reward_std": 0.7527596950531006, "rewards/rollout_reward_func/mean": 0.8821181058883667, "rewards/rollout_reward_func/std": 0.7527596950531006, "sampling/importance_sampling_ratio/max": 1.1759997606277466, "sampling/importance_sampling_ratio/mean": 0.8500691652297974, "sampling/importance_sampling_ratio/min": 4.056526449858211e-05, "sampling/sampling_logp_difference/max": 2.0412111282348633, "sampling/sampling_logp_difference/mean": 0.1963983178138733, "step": 1071, "step_time": 24.710049846966285 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.8072717031463981, "epoch": 0.01072, "grad_norm": 0.057970043271780014, "kl": 0.5024426132440567, "learning_rate": 7.99960263194319e-06, "loss": -0.0379, "step": 1072, "step_time": 13.463768168992829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.592592716217041, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0608705151826143, "epoch": 0.01073, "frac_reward_zero_std": 0.25, "grad_norm": 0.03237394988536835, "kl": 0.2821674421429634, "learning_rate": 7.999601864470135e-06, "loss": -0.066, "num_tokens": 26942811.0, "reward": 0.1744016855955124, "reward_std": 0.6708880662918091, "rewards/rollout_reward_func/mean": 0.1744016855955124, "rewards/rollout_reward_func/std": 0.6708880662918091, "sampling/importance_sampling_ratio/max": 1.2307742834091187, "sampling/importance_sampling_ratio/mean": 0.7996739745140076, "sampling/importance_sampling_ratio/min": 2.484045944584068e-05, "sampling/sampling_logp_difference/max": 2.229781150817871, "sampling/sampling_logp_difference/mean": 0.20867863297462463, "step": 1073, "step_time": 30.050742761057336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.062740284949541, "epoch": 0.01074, "grad_norm": 0.03170390427112579, "kl": 0.2791458796709776, "learning_rate": 7.999601096256697e-06, "loss": -0.0659, "step": 1074, "step_time": 15.085557401005644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.0625, "completions/mean_terminated_length": 4.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.24188508093357086, "epoch": 0.01075, "frac_reward_zero_std": 0.5, "grad_norm": 0.03869382664561272, "kl": 0.451209656894207, "learning_rate": 7.999600327302877e-06, "loss": -0.0356, "num_tokens": 26984051.0, "reward": 1.0679638385772705, "reward_std": 0.34278637170791626, "rewards/rollout_reward_func/mean": 1.0679638385772705, "rewards/rollout_reward_func/std": 0.34278634190559387, "sampling/importance_sampling_ratio/max": 1.3101118803024292, "sampling/importance_sampling_ratio/mean": 1.0135740041732788, "sampling/importance_sampling_ratio/min": 0.07863125950098038, "sampling/sampling_logp_difference/max": 1.5195308923721313, "sampling/sampling_logp_difference/mean": 0.04783829674124718, "step": 1075, "step_time": 17.618566463002935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24218585900962353, "epoch": 0.01076, "grad_norm": 0.039455339312553406, "kl": 0.44222186878323555, "learning_rate": 7.999599557608678e-06, "loss": -0.0356, "step": 1076, "step_time": 10.059188786981395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 4.703703880310059, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1198929660022259, "epoch": 0.01077, "frac_reward_zero_std": 0.0, "grad_norm": 0.15522101521492004, "kl": 0.6647283639758825, "learning_rate": 7.9995987871741e-06, "loss": -0.0436, "num_tokens": 27043376.0, "reward": 0.5977281928062439, "reward_std": 0.8315749168395996, "rewards/rollout_reward_func/mean": 0.5977281928062439, "rewards/rollout_reward_func/std": 0.8315749168395996, "sampling/importance_sampling_ratio/max": 1.2534550428390503, "sampling/importance_sampling_ratio/mean": 0.7757146954536438, "sampling/importance_sampling_ratio/min": 0.00020188612688798457, "sampling/sampling_logp_difference/max": 1.8720004558563232, "sampling/sampling_logp_difference/mean": 0.2435052990913391, "step": 1077, "step_time": 30.263356393988943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1309077693149447, "epoch": 0.01078, "grad_norm": 0.15596319735050201, "kl": 0.6347361132502556, "learning_rate": 7.999598015999138e-06, "loss": -0.0444, "step": 1078, "step_time": 15.497010669962037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.107116480357945, "epoch": 0.01079, "frac_reward_zero_std": 0.0, "grad_norm": 0.10127949714660645, "kl": 0.4092725906521082, "learning_rate": 7.999597244083798e-06, "loss": -0.073, "num_tokens": 27088702.0, "reward": 1.0849759578704834, "reward_std": 0.6725935339927673, "rewards/rollout_reward_func/mean": 1.0849759578704834, "rewards/rollout_reward_func/std": 0.6725935339927673, "sampling/importance_sampling_ratio/max": 1.1736737489700317, "sampling/importance_sampling_ratio/mean": 0.8643243312835693, "sampling/importance_sampling_ratio/min": 1.3503164097983245e-07, "sampling/sampling_logp_difference/max": 2.2509446144104004, "sampling/sampling_logp_difference/mean": 0.27903640270233154, "step": 1079, "step_time": 24.82269374097814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1100114602595568, "epoch": 0.0108, "grad_norm": 0.11058591306209564, "kl": 0.39062653109431267, "learning_rate": 7.999596471428079e-06, "loss": -0.0728, "step": 1080, "step_time": 13.38771851695492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 4.42307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.395514603704214, "epoch": 0.01081, "frac_reward_zero_std": 0.0, "grad_norm": 0.1010318323969841, "kl": 0.5396357458084822, "learning_rate": 7.999595698031978e-06, "loss": -0.0495, "num_tokens": 27154726.0, "reward": 0.7410896420478821, "reward_std": 0.7816513180732727, "rewards/rollout_reward_func/mean": 0.7410896420478821, "rewards/rollout_reward_func/std": 0.7816513180732727, "sampling/importance_sampling_ratio/max": 1.1959702968597412, "sampling/importance_sampling_ratio/mean": 0.8283277750015259, "sampling/importance_sampling_ratio/min": 5.314394729794003e-05, "sampling/sampling_logp_difference/max": 1.7560102939605713, "sampling/sampling_logp_difference/mean": 0.2476344108581543, "step": 1081, "step_time": 28.794151611975394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.3999626319855452, "epoch": 0.01082, "grad_norm": 0.159662663936615, "kl": 0.5155203118920326, "learning_rate": 7.999594923895498e-06, "loss": -0.0502, "step": 1082, "step_time": 14.286089654022362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.692215601913631, "epoch": 0.01083, "frac_reward_zero_std": 0.0, "grad_norm": 0.07360640913248062, "kl": 0.3383703278377652, "learning_rate": 7.999594149018638e-06, "loss": -0.0856, "num_tokens": 27207288.0, "reward": 0.44831281900405884, "reward_std": 0.9634280800819397, "rewards/rollout_reward_func/mean": 0.44831281900405884, "rewards/rollout_reward_func/std": 0.9634280204772949, "sampling/importance_sampling_ratio/max": 1.3137660026550293, "sampling/importance_sampling_ratio/mean": 0.7237979173660278, "sampling/importance_sampling_ratio/min": 2.5280866289278947e-09, "sampling/sampling_logp_difference/max": 2.4504761695861816, "sampling/sampling_logp_difference/mean": 0.33990222215652466, "step": 1083, "step_time": 27.989333646022715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6902548065409064, "epoch": 0.01084, "grad_norm": 0.06622300297021866, "kl": 0.3330006040632725, "learning_rate": 7.9995933734014e-06, "loss": -0.0857, "step": 1084, "step_time": 13.58254193002358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 4.4782609939575195, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7161580994725227, "epoch": 0.01085, "frac_reward_zero_std": 0.0, "grad_norm": 0.16394944489002228, "kl": 0.1667672023177147, "learning_rate": 7.99959259704378e-06, "loss": -0.0686, "num_tokens": 27265913.0, "reward": 0.2680191397666931, "reward_std": 0.843768298625946, "rewards/rollout_reward_func/mean": 0.2680191397666931, "rewards/rollout_reward_func/std": 0.843768298625946, "sampling/importance_sampling_ratio/max": 1.2733932733535767, "sampling/importance_sampling_ratio/mean": 0.7503199577331543, "sampling/importance_sampling_ratio/min": 5.220789844884166e-08, "sampling/sampling_logp_difference/max": 2.2092645168304443, "sampling/sampling_logp_difference/mean": 0.33978545665740967, "step": 1085, "step_time": 29.06513265304966 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.7152606304734945, "epoch": 0.01086, "grad_norm": 0.07973279803991318, "kl": 0.16717864852398634, "learning_rate": 7.999591819945785e-06, "loss": -0.069, "step": 1086, "step_time": 13.857661771995481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2159977853298187, "epoch": 0.01087, "frac_reward_zero_std": 0.25, "grad_norm": 0.18742269277572632, "kl": 0.2592754624783993, "learning_rate": 7.999591042107408e-06, "loss": -0.0536, "num_tokens": 27318787.0, "reward": 0.5463196039199829, "reward_std": 0.8825015425682068, "rewards/rollout_reward_func/mean": 0.5463196039199829, "rewards/rollout_reward_func/std": 0.8825016021728516, "sampling/importance_sampling_ratio/max": 1.2010352611541748, "sampling/importance_sampling_ratio/mean": 0.7673697471618652, "sampling/importance_sampling_ratio/min": 0.0005118182743899524, "sampling/sampling_logp_difference/max": 1.8944296836853027, "sampling/sampling_logp_difference/mean": 0.20749954879283905, "step": 1087, "step_time": 28.3814659609925 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.1950358040630817, "epoch": 0.01088, "grad_norm": 0.11309702694416046, "kl": 0.2600354701280594, "learning_rate": 7.999590263528655e-06, "loss": -0.0546, "step": 1088, "step_time": 13.547534166980768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.967741966247559, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7982019260525703, "epoch": 0.01089, "frac_reward_zero_std": 0.0, "grad_norm": 0.06084809452295303, "kl": 0.4089621603488922, "learning_rate": 7.999589484209522e-06, "loss": -0.0434, "num_tokens": 27369898.0, "reward": 0.8032878637313843, "reward_std": 0.8400328159332275, "rewards/rollout_reward_func/mean": 0.8032878637313843, "rewards/rollout_reward_func/std": 0.8400328159332275, "sampling/importance_sampling_ratio/max": 1.1570603847503662, "sampling/importance_sampling_ratio/mean": 0.8876146674156189, "sampling/importance_sampling_ratio/min": 0.0006335093057714403, "sampling/sampling_logp_difference/max": 1.1865172386169434, "sampling/sampling_logp_difference/mean": 0.14228373765945435, "step": 1089, "step_time": 24.798988381022355 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.7994012543931603, "epoch": 0.0109, "grad_norm": 0.056541334837675095, "kl": 0.3749306295067072, "learning_rate": 7.999588704150011e-06, "loss": -0.0435, "step": 1090, "step_time": 12.368538202019408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.6875, "completions/mean_terminated_length": 5.82608699798584, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.494956612586975, "epoch": 0.01091, "frac_reward_zero_std": 0.0, "grad_norm": 0.08483348041772842, "kl": 0.21890167985111475, "learning_rate": 7.999587923350123e-06, "loss": -0.0851, "num_tokens": 27432343.0, "reward": 0.2707487940788269, "reward_std": 0.836334764957428, "rewards/rollout_reward_func/mean": 0.2707487940788269, "rewards/rollout_reward_func/std": 0.836334764957428, "sampling/importance_sampling_ratio/max": 1.3180878162384033, "sampling/importance_sampling_ratio/mean": 0.5918010473251343, "sampling/importance_sampling_ratio/min": 1.6206037622978897e-09, "sampling/sampling_logp_difference/max": 2.1700563430786133, "sampling/sampling_logp_difference/mean": 0.4442150890827179, "step": 1091, "step_time": 31.772279447002802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5033877193927765, "epoch": 0.01092, "grad_norm": 0.08077457547187805, "kl": 0.21359989419579506, "learning_rate": 7.999587141809856e-06, "loss": -0.0852, "step": 1092, "step_time": 14.618438191944733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 4.4782609939575195, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8837150819599628, "epoch": 0.01093, "frac_reward_zero_std": 0.0, "grad_norm": 0.08996737003326416, "kl": 0.9144596029073, "learning_rate": 7.999586359529212e-06, "loss": -0.0603, "num_tokens": 27491480.0, "reward": 0.25884389877319336, "reward_std": 0.8802130222320557, "rewards/rollout_reward_func/mean": 0.25884389877319336, "rewards/rollout_reward_func/std": 0.8802130222320557, "sampling/importance_sampling_ratio/max": 1.3615739345550537, "sampling/importance_sampling_ratio/mean": 0.6634594202041626, "sampling/importance_sampling_ratio/min": 6.520522788378003e-07, "sampling/sampling_logp_difference/max": 1.9200713634490967, "sampling/sampling_logp_difference/mean": 0.38267210125923157, "step": 1093, "step_time": 30.170831618044758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.887624528259039, "epoch": 0.01094, "grad_norm": 0.08403024077415466, "kl": 0.7900398690253496, "learning_rate": 7.99958557650819e-06, "loss": -0.0606, "step": 1094, "step_time": 14.972953175019938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.319999694824219, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8827758319675922, "epoch": 0.01095, "frac_reward_zero_std": 0.0, "grad_norm": 0.04931695759296417, "kl": 0.3767146486788988, "learning_rate": 7.999584792746792e-06, "loss": -0.097, "num_tokens": 27554990.0, "reward": 0.7284295558929443, "reward_std": 0.8109264373779297, "rewards/rollout_reward_func/mean": 0.7284295558929443, "rewards/rollout_reward_func/std": 0.8109263777732849, "sampling/importance_sampling_ratio/max": 1.2339473962783813, "sampling/importance_sampling_ratio/mean": 0.7486103773117065, "sampling/importance_sampling_ratio/min": 6.181704037544478e-08, "sampling/sampling_logp_difference/max": 2.028778314590454, "sampling/sampling_logp_difference/mean": 0.4078971743583679, "step": 1095, "step_time": 30.028783896996174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8811013605445623, "epoch": 0.01096, "grad_norm": 0.0449095256626606, "kl": 0.37754162587225437, "learning_rate": 7.999584008245017e-06, "loss": -0.097, "step": 1096, "step_time": 15.304669872974046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2628335002809763, "epoch": 0.01097, "frac_reward_zero_std": 0.25, "grad_norm": 0.11159075051546097, "kl": 0.31291194818913937, "learning_rate": 7.999583223002866e-06, "loss": -0.0445, "num_tokens": 27605829.0, "reward": 0.9277898073196411, "reward_std": 0.6684461236000061, "rewards/rollout_reward_func/mean": 0.9277898073196411, "rewards/rollout_reward_func/std": 0.6684461236000061, "sampling/importance_sampling_ratio/max": 1.1673873662948608, "sampling/importance_sampling_ratio/mean": 0.8587594032287598, "sampling/importance_sampling_ratio/min": 8.65684455675364e-07, "sampling/sampling_logp_difference/max": 1.9583892822265625, "sampling/sampling_logp_difference/mean": 0.28229862451553345, "step": 1097, "step_time": 23.367973623040598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2715274123474956, "epoch": 0.01098, "grad_norm": 0.05453598126769066, "kl": 0.31935273110866547, "learning_rate": 7.999582437020337e-06, "loss": -0.0449, "step": 1098, "step_time": 12.777575129992329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.709677219390869, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9417966073378921, "epoch": 0.01099, "frac_reward_zero_std": 0.25, "grad_norm": 0.07362730801105499, "kl": 0.37050160951912403, "learning_rate": 7.999581650297431e-06, "loss": -0.0354, "num_tokens": 27650857.0, "reward": 0.6681634187698364, "reward_std": 0.7526491284370422, "rewards/rollout_reward_func/mean": 0.6681634187698364, "rewards/rollout_reward_func/std": 0.7526490688323975, "sampling/importance_sampling_ratio/max": 1.1969783306121826, "sampling/importance_sampling_ratio/mean": 0.8526241183280945, "sampling/importance_sampling_ratio/min": 0.00034044834319502115, "sampling/sampling_logp_difference/max": 1.6752550601959229, "sampling/sampling_logp_difference/mean": 0.1809091567993164, "step": 1099, "step_time": 20.454656545014586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9408228443935513, "epoch": 0.011, "grad_norm": 0.07099852710962296, "kl": 0.3533950708806515, "learning_rate": 7.999580862834148e-06, "loss": -0.0356, "step": 1100, "step_time": 11.045758683030726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5201077554374933, "epoch": 0.01101, "frac_reward_zero_std": 0.0, "grad_norm": 0.12619683146476746, "kl": 0.6545114442706108, "learning_rate": 7.999580074630491e-06, "loss": -0.0527, "num_tokens": 27705246.0, "reward": 0.6139380931854248, "reward_std": 0.9030216932296753, "rewards/rollout_reward_func/mean": 0.6139380931854248, "rewards/rollout_reward_func/std": 0.9030216336250305, "sampling/importance_sampling_ratio/max": 1.2141258716583252, "sampling/importance_sampling_ratio/mean": 0.7836637496948242, "sampling/importance_sampling_ratio/min": 3.320790042948829e-08, "sampling/sampling_logp_difference/max": 2.3461251258850098, "sampling/sampling_logp_difference/mean": 0.36397290229797363, "step": 1101, "step_time": 27.703111323033227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5229118866845965, "epoch": 0.01102, "grad_norm": 0.11101428419351578, "kl": 0.589977165684104, "learning_rate": 7.999579285686457e-06, "loss": -0.0533, "step": 1102, "step_time": 14.397864099009894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.274555753916502, "epoch": 0.01103, "frac_reward_zero_std": 0.75, "grad_norm": 0.008371083997189999, "kl": 0.2891363110393286, "learning_rate": 7.99957849600205e-06, "loss": -0.0193, "num_tokens": 27750203.0, "reward": 1.3567571640014648, "reward_std": 0.4593237638473511, "rewards/rollout_reward_func/mean": 1.3567571640014648, "rewards/rollout_reward_func/std": 0.4593237340450287, "sampling/importance_sampling_ratio/max": 1.1484237909317017, "sampling/importance_sampling_ratio/mean": 1.0155956745147705, "sampling/importance_sampling_ratio/min": 0.0002427429863018915, "sampling/sampling_logp_difference/max": 1.4304654598236084, "sampling/sampling_logp_difference/mean": 0.06533681601285934, "step": 1103, "step_time": 18.556671021011425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27685591764748096, "epoch": 0.01104, "grad_norm": 0.00827211607247591, "kl": 0.287651302292943, "learning_rate": 7.999577705577265e-06, "loss": -0.0193, "step": 1104, "step_time": 11.175704643042991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7967025935649872, "epoch": 0.01105, "frac_reward_zero_std": 0.25, "grad_norm": 0.05055079609155655, "kl": 0.5816978290677071, "learning_rate": 7.999576914412106e-06, "loss": -0.0283, "num_tokens": 27796582.0, "reward": 0.6792356371879578, "reward_std": 0.8754976391792297, "rewards/rollout_reward_func/mean": 0.6792356371879578, "rewards/rollout_reward_func/std": 0.8754976987838745, "sampling/importance_sampling_ratio/max": 1.129196286201477, "sampling/importance_sampling_ratio/mean": 0.9019234776496887, "sampling/importance_sampling_ratio/min": 2.1726444174419157e-05, "sampling/sampling_logp_difference/max": 1.6686499118804932, "sampling/sampling_logp_difference/mean": 0.14869104325771332, "step": 1105, "step_time": 25.21929497597739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7977808229625225, "epoch": 0.01106, "grad_norm": 0.04898470267653465, "kl": 0.549048276618123, "learning_rate": 7.999576122506571e-06, "loss": -0.0283, "step": 1106, "step_time": 13.868713645962998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.633333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0428810007870197, "epoch": 0.01107, "frac_reward_zero_std": 0.0, "grad_norm": 0.11546846479177475, "kl": 0.5373518634587526, "learning_rate": 7.999575329860661e-06, "loss": -0.0786, "num_tokens": 27842126.0, "reward": 0.6382906436920166, "reward_std": 0.7533772587776184, "rewards/rollout_reward_func/mean": 0.6382906436920166, "rewards/rollout_reward_func/std": 0.7533772587776184, "sampling/importance_sampling_ratio/max": 1.086676836013794, "sampling/importance_sampling_ratio/mean": 0.7975103855133057, "sampling/importance_sampling_ratio/min": 2.535248029289505e-07, "sampling/sampling_logp_difference/max": 2.283233880996704, "sampling/sampling_logp_difference/mean": 0.22685506939888, "step": 1107, "step_time": 22.950352660031058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.0534856002777815, "epoch": 0.01108, "grad_norm": 0.07724656909704208, "kl": 0.5135297402739525, "learning_rate": 7.999574536474376e-06, "loss": -0.079, "step": 1108, "step_time": 13.48606336498051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4558852966874838, "epoch": 0.01109, "frac_reward_zero_std": 0.5, "grad_norm": 0.08068346977233887, "kl": 0.5328901410102844, "learning_rate": 7.999573742347718e-06, "loss": -0.028, "num_tokens": 27882930.0, "reward": 1.3055241107940674, "reward_std": 0.24002529680728912, "rewards/rollout_reward_func/mean": 1.3055241107940674, "rewards/rollout_reward_func/std": 0.24002531170845032, "sampling/importance_sampling_ratio/max": 1.164634346961975, "sampling/importance_sampling_ratio/mean": 0.9420111179351807, "sampling/importance_sampling_ratio/min": 0.003862243378534913, "sampling/sampling_logp_difference/max": 1.9980273246765137, "sampling/sampling_logp_difference/mean": 0.09902510046958923, "step": 1109, "step_time": 19.23816637697746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4628714732825756, "epoch": 0.0111, "grad_norm": 0.07615643739700317, "kl": 0.5162745527923107, "learning_rate": 7.999572947480686e-06, "loss": -0.0286, "step": 1110, "step_time": 11.03586586000165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.962963104248047, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7007253505289555, "epoch": 0.01111, "frac_reward_zero_std": 0.25, "grad_norm": 0.11668775230646133, "kl": 0.2174767404794693, "learning_rate": 7.999572151873277e-06, "loss": -0.0632, "num_tokens": 27931690.0, "reward": 0.5237236618995667, "reward_std": 0.9269609451293945, "rewards/rollout_reward_func/mean": 0.5237236618995667, "rewards/rollout_reward_func/std": 0.9269609451293945, "sampling/importance_sampling_ratio/max": 1.2202728986740112, "sampling/importance_sampling_ratio/mean": 0.6982954740524292, "sampling/importance_sampling_ratio/min": 1.728476672724355e-05, "sampling/sampling_logp_difference/max": 1.736557126045227, "sampling/sampling_logp_difference/mean": 0.3141605854034424, "step": 1111, "step_time": 24.741466114966897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6981408521533012, "epoch": 0.01112, "grad_norm": 0.12029024213552475, "kl": 0.21676550339907408, "learning_rate": 7.999571355525498e-06, "loss": -0.0632, "step": 1112, "step_time": 12.699076534976484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.4375, "completions/mean_terminated_length": 4.4375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6095091048628092, "epoch": 0.01113, "frac_reward_zero_std": 0.0, "grad_norm": 0.12763604521751404, "kl": 0.41244934499263763, "learning_rate": 7.999570558437343e-06, "loss": -0.0527, "num_tokens": 27983194.0, "reward": 1.088507056236267, "reward_std": 0.6716748476028442, "rewards/rollout_reward_func/mean": 1.088507056236267, "rewards/rollout_reward_func/std": 0.6716748476028442, "sampling/importance_sampling_ratio/max": 1.399689793586731, "sampling/importance_sampling_ratio/mean": 0.9248620271682739, "sampling/importance_sampling_ratio/min": 0.021966148167848587, "sampling/sampling_logp_difference/max": 1.6708526611328125, "sampling/sampling_logp_difference/mean": 0.10949663817882538, "step": 1113, "step_time": 22.962009278940968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6067034155130386, "epoch": 0.01114, "grad_norm": 0.12669211626052856, "kl": 0.43235333263874054, "learning_rate": 7.999569760608814e-06, "loss": -0.053, "step": 1114, "step_time": 12.978003452997655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 4.703703880310059, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6370867565274239, "epoch": 0.01115, "frac_reward_zero_std": 0.0, "grad_norm": 0.04663031920790672, "kl": 0.442406490445137, "learning_rate": 7.999568962039914e-06, "loss": -0.0678, "num_tokens": 28033930.0, "reward": 0.36121076345443726, "reward_std": 0.7931832075119019, "rewards/rollout_reward_func/mean": 0.36121076345443726, "rewards/rollout_reward_func/std": 0.7931831479072571, "sampling/importance_sampling_ratio/max": 1.2018189430236816, "sampling/importance_sampling_ratio/mean": 0.7054392099380493, "sampling/importance_sampling_ratio/min": 3.44725776812993e-08, "sampling/sampling_logp_difference/max": 2.3873419761657715, "sampling/sampling_logp_difference/mean": 0.3738034963607788, "step": 1115, "step_time": 29.0253672109975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.6417580619454384, "epoch": 0.01116, "grad_norm": 0.04670179635286331, "kl": 0.46590590476989746, "learning_rate": 7.99956816273064e-06, "loss": -0.0678, "step": 1116, "step_time": 15.66085549001582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 5.793103218078613, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6933664232492447, "epoch": 0.01117, "frac_reward_zero_std": 0.0, "grad_norm": 0.07593940943479538, "kl": 0.4086558409035206, "learning_rate": 7.999567362680992e-06, "loss": -0.0523, "num_tokens": 28096671.0, "reward": 0.25601521134376526, "reward_std": 0.8259345889091492, "rewards/rollout_reward_func/mean": 0.25601521134376526, "rewards/rollout_reward_func/std": 0.8259345889091492, "sampling/importance_sampling_ratio/max": 1.495574712753296, "sampling/importance_sampling_ratio/mean": 0.6978200078010559, "sampling/importance_sampling_ratio/min": 3.274247137596831e-05, "sampling/sampling_logp_difference/max": 1.6602245569229126, "sampling/sampling_logp_difference/mean": 0.28683173656463623, "step": 1117, "step_time": 31.624937303946353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6971088126301765, "epoch": 0.01118, "grad_norm": 0.0721430703997612, "kl": 0.3941487278789282, "learning_rate": 7.999566561890972e-06, "loss": -0.0523, "step": 1118, "step_time": 15.823750888986979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.645161151885986, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7844058014452457, "epoch": 0.01119, "frac_reward_zero_std": 0.0, "grad_norm": 0.09218008071184158, "kl": 0.42582755722105503, "learning_rate": 7.999565760360578e-06, "loss": -0.0618, "num_tokens": 28154293.0, "reward": 0.7742316722869873, "reward_std": 0.7867228984832764, "rewards/rollout_reward_func/mean": 0.7742316722869873, "rewards/rollout_reward_func/std": 0.7867229580879211, "sampling/importance_sampling_ratio/max": 1.2084195613861084, "sampling/importance_sampling_ratio/mean": 0.8581212759017944, "sampling/importance_sampling_ratio/min": 9.720453090267256e-05, "sampling/sampling_logp_difference/max": 1.430261492729187, "sampling/sampling_logp_difference/mean": 0.17500519752502441, "step": 1119, "step_time": 24.39861817000201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.790983397513628, "epoch": 0.0112, "grad_norm": 0.10472710430622101, "kl": 0.41921994648873806, "learning_rate": 7.999564958089814e-06, "loss": -0.0622, "step": 1120, "step_time": 12.512401064974256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.34615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5743271615356207, "epoch": 0.01121, "frac_reward_zero_std": 0.25, "grad_norm": 0.09141779690980911, "kl": 0.3727414524182677, "learning_rate": 7.999564155078678e-06, "loss": -0.0475, "num_tokens": 28211391.0, "reward": 0.467988520860672, "reward_std": 0.8665144443511963, "rewards/rollout_reward_func/mean": 0.467988520860672, "rewards/rollout_reward_func/std": 0.8665144443511963, "sampling/importance_sampling_ratio/max": 1.3343961238861084, "sampling/importance_sampling_ratio/mean": 0.762763261795044, "sampling/importance_sampling_ratio/min": 4.988465889255167e-07, "sampling/sampling_logp_difference/max": 1.8583488464355469, "sampling/sampling_logp_difference/mean": 0.32322463393211365, "step": 1121, "step_time": 31.554811713984236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5983696803450584, "epoch": 0.01122, "grad_norm": 0.0999358594417572, "kl": 0.3564037987962365, "learning_rate": 7.999563351327168e-06, "loss": -0.0478, "step": 1122, "step_time": 16.168752119963756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 5.192307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.033863492310047, "epoch": 0.01123, "frac_reward_zero_std": 0.0, "grad_norm": 0.03071599267423153, "kl": 0.3667858634144068, "learning_rate": 7.999562546835289e-06, "loss": -0.065, "num_tokens": 28269620.0, "reward": 0.19233065843582153, "reward_std": 0.7946234345436096, "rewards/rollout_reward_func/mean": 0.19233065843582153, "rewards/rollout_reward_func/std": 0.7946233749389648, "sampling/importance_sampling_ratio/max": 1.2893526554107666, "sampling/importance_sampling_ratio/mean": 0.7034240961074829, "sampling/importance_sampling_ratio/min": 1.4271864579029625e-08, "sampling/sampling_logp_difference/max": 1.947873592376709, "sampling/sampling_logp_difference/mean": 0.4361262619495392, "step": 1123, "step_time": 30.231971978006186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0421373769640923, "epoch": 0.01124, "grad_norm": 0.030638597905635834, "kl": 0.3586860513314605, "learning_rate": 7.999561741603036e-06, "loss": -0.065, "step": 1124, "step_time": 15.561151372996392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2757777161896229, "epoch": 0.01125, "frac_reward_zero_std": 0.0, "grad_norm": 0.0540999174118042, "kl": 0.23798677511513233, "learning_rate": 7.999560935630413e-06, "loss": -0.0713, "num_tokens": 28320048.0, "reward": 0.5641242265701294, "reward_std": 0.6736428737640381, "rewards/rollout_reward_func/mean": 0.5641242265701294, "rewards/rollout_reward_func/std": 0.6736428737640381, "sampling/importance_sampling_ratio/max": 1.157816767692566, "sampling/importance_sampling_ratio/mean": 0.8873739242553711, "sampling/importance_sampling_ratio/min": 4.668596602641628e-07, "sampling/sampling_logp_difference/max": 2.1773319244384766, "sampling/sampling_logp_difference/mean": 0.2940559983253479, "step": 1125, "step_time": 24.820956396026304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2938492372632027, "epoch": 0.01126, "grad_norm": 0.05514319986104965, "kl": 0.23735192604362965, "learning_rate": 7.99956012891742e-06, "loss": -0.0713, "step": 1126, "step_time": 13.522300774988253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 5.318181991577148, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5893696770071983, "epoch": 0.01127, "frac_reward_zero_std": 0.0, "grad_norm": 0.09655987471342087, "kl": 0.5714166443794966, "learning_rate": 7.999559321464054e-06, "loss": -0.074, "num_tokens": 28381044.0, "reward": 0.1863614022731781, "reward_std": 0.7397279143333435, "rewards/rollout_reward_func/mean": 0.1863614022731781, "rewards/rollout_reward_func/std": 0.7397278547286987, "sampling/importance_sampling_ratio/max": 1.2397985458374023, "sampling/importance_sampling_ratio/mean": 0.5398560166358948, "sampling/importance_sampling_ratio/min": 1.3648316254943893e-08, "sampling/sampling_logp_difference/max": 2.089930534362793, "sampling/sampling_logp_difference/mean": 0.44748353958129883, "step": 1127, "step_time": 28.96176437701797 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.594654582440853, "epoch": 0.01128, "grad_norm": 0.08690794557332993, "kl": 0.5497425682842731, "learning_rate": 7.999558513270316e-06, "loss": -0.0741, "step": 1128, "step_time": 14.483130699023604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 4.42307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.459850611165166, "epoch": 0.01129, "frac_reward_zero_std": 0.25, "grad_norm": 0.10162422060966492, "kl": 0.2760089756920934, "learning_rate": 7.999557704336211e-06, "loss": -0.0443, "num_tokens": 28432538.0, "reward": 0.7264350652694702, "reward_std": 0.9784923195838928, "rewards/rollout_reward_func/mean": 0.7264350652694702, "rewards/rollout_reward_func/std": 0.9784923195838928, "sampling/importance_sampling_ratio/max": 1.3587480783462524, "sampling/importance_sampling_ratio/mean": 0.7440637946128845, "sampling/importance_sampling_ratio/min": 1.7789433570669644e-07, "sampling/sampling_logp_difference/max": 2.360417127609253, "sampling/sampling_logp_difference/mean": 0.2804567813873291, "step": 1129, "step_time": 26.565370002994314 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.4610159993171692, "epoch": 0.0113, "grad_norm": 0.09611433744430542, "kl": 0.2735385373234749, "learning_rate": 7.999556894661735e-06, "loss": -0.0443, "step": 1130, "step_time": 13.636857707024319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.866666793823242, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5627721287310123, "epoch": 0.01131, "frac_reward_zero_std": 0.0, "grad_norm": 0.08336160331964493, "kl": 0.2320869155228138, "learning_rate": 7.999556084246888e-06, "loss": -0.0791, "num_tokens": 28482165.0, "reward": 0.40352383255958557, "reward_std": 0.9398354887962341, "rewards/rollout_reward_func/mean": 0.40352383255958557, "rewards/rollout_reward_func/std": 0.9398354291915894, "sampling/importance_sampling_ratio/max": 1.208203911781311, "sampling/importance_sampling_ratio/mean": 0.7423546314239502, "sampling/importance_sampling_ratio/min": 3.493760959827341e-05, "sampling/sampling_logp_difference/max": 2.3671517372131348, "sampling/sampling_logp_difference/mean": 0.297619104385376, "step": 1131, "step_time": 29.671799562958768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5636191684752703, "epoch": 0.01132, "grad_norm": 0.0828373059630394, "kl": 0.22727334685623646, "learning_rate": 7.999555273091671e-06, "loss": -0.0792, "step": 1132, "step_time": 15.083587536995765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.700000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7707128375768661, "epoch": 0.01133, "frac_reward_zero_std": 0.0, "grad_norm": 0.2041316032409668, "kl": 0.7038651369512081, "learning_rate": 7.999554461196083e-06, "loss": -0.082, "num_tokens": 28528326.0, "reward": 0.7559110522270203, "reward_std": 0.7944782376289368, "rewards/rollout_reward_func/mean": 0.7559110522270203, "rewards/rollout_reward_func/std": 0.7944782972335815, "sampling/importance_sampling_ratio/max": 1.2871606349945068, "sampling/importance_sampling_ratio/mean": 0.7407619953155518, "sampling/importance_sampling_ratio/min": 1.1842318770050042e-07, "sampling/sampling_logp_difference/max": 2.575093984603882, "sampling/sampling_logp_difference/mean": 0.4071342349052429, "step": 1133, "step_time": 24.948817854980007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7694233935326338, "epoch": 0.01134, "grad_norm": 0.19759483635425568, "kl": 0.6323520801961422, "learning_rate": 7.999553648560128e-06, "loss": -0.0833, "step": 1134, "step_time": 13.298804245976498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.666666507720947, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6182121466845274, "epoch": 0.01135, "frac_reward_zero_std": 0.0, "grad_norm": 0.028832213953137398, "kl": 0.3662097752094269, "learning_rate": 7.999552835183802e-06, "loss": -0.0796, "num_tokens": 28588555.0, "reward": 0.6499607563018799, "reward_std": 0.8363470435142517, "rewards/rollout_reward_func/mean": 0.6499607563018799, "rewards/rollout_reward_func/std": 0.8363470435142517, "sampling/importance_sampling_ratio/max": 1.1458053588867188, "sampling/importance_sampling_ratio/mean": 0.7127428650856018, "sampling/importance_sampling_ratio/min": 5.20586991115124e-06, "sampling/sampling_logp_difference/max": 2.1391701698303223, "sampling/sampling_logp_difference/mean": 0.3241958022117615, "step": 1135, "step_time": 29.72760653606383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6141938837245107, "epoch": 0.01136, "grad_norm": 0.029522396624088287, "kl": 0.3820768240839243, "learning_rate": 7.99955202106711e-06, "loss": -0.0795, "step": 1136, "step_time": 15.664361657982226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6352473199367523, "epoch": 0.01137, "frac_reward_zero_std": 0.0, "grad_norm": 0.12971346080303192, "kl": 0.8793640621006489, "learning_rate": 7.999551206210045e-06, "loss": -0.0599, "num_tokens": 28645934.0, "reward": 0.815144419670105, "reward_std": 0.7730497717857361, "rewards/rollout_reward_func/mean": 0.815144419670105, "rewards/rollout_reward_func/std": 0.7730497717857361, "sampling/importance_sampling_ratio/max": 1.3399434089660645, "sampling/importance_sampling_ratio/mean": 0.7630950212478638, "sampling/importance_sampling_ratio/min": 2.180862708200948e-07, "sampling/sampling_logp_difference/max": 2.1989500522613525, "sampling/sampling_logp_difference/mean": 0.3648661971092224, "step": 1137, "step_time": 29.918927995080594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6357666198164225, "epoch": 0.01138, "grad_norm": 0.14260877668857574, "kl": 0.8741324041038752, "learning_rate": 7.999550390612613e-06, "loss": -0.0602, "step": 1138, "step_time": 16.391673772974173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2219742145389318, "epoch": 0.01139, "frac_reward_zero_std": 0.0, "grad_norm": 0.17430110275745392, "kl": 0.5223731528967619, "learning_rate": 7.999549574274814e-06, "loss": -0.08, "num_tokens": 28703644.0, "reward": 0.9829816818237305, "reward_std": 0.7332189083099365, "rewards/rollout_reward_func/mean": 0.9829816818237305, "rewards/rollout_reward_func/std": 0.7332189083099365, "sampling/importance_sampling_ratio/max": 1.3788530826568604, "sampling/importance_sampling_ratio/mean": 0.8288230895996094, "sampling/importance_sampling_ratio/min": 6.48748346065986e-06, "sampling/sampling_logp_difference/max": 2.260298013687134, "sampling/sampling_logp_difference/mean": 0.26007670164108276, "step": 1139, "step_time": 29.115087403042708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2312775943428278, "epoch": 0.0114, "grad_norm": 0.15741077065467834, "kl": 0.5174311809241772, "learning_rate": 7.999548757196645e-06, "loss": -0.0808, "step": 1140, "step_time": 15.683221704006428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6932601314038038, "epoch": 0.01141, "frac_reward_zero_std": 0.25, "grad_norm": 0.08516411483287811, "kl": 0.6219026036560535, "learning_rate": 7.999547939378108e-06, "loss": -0.0664, "num_tokens": 28754158.0, "reward": 0.643324613571167, "reward_std": 0.8905876278877258, "rewards/rollout_reward_func/mean": 0.643324613571167, "rewards/rollout_reward_func/std": 0.8905876874923706, "sampling/importance_sampling_ratio/max": 1.1166819334030151, "sampling/importance_sampling_ratio/mean": 0.6965505480766296, "sampling/importance_sampling_ratio/min": 3.7507624783827964e-10, "sampling/sampling_logp_difference/max": 2.269167900085449, "sampling/sampling_logp_difference/mean": 0.3776492476463318, "step": 1141, "step_time": 26.942625088995555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7088230717927217, "epoch": 0.01142, "grad_norm": 0.09709306061267853, "kl": 0.6436809059232473, "learning_rate": 7.999547120819201e-06, "loss": -0.0669, "step": 1142, "step_time": 13.524776939942967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8808791749179363, "epoch": 0.01143, "frac_reward_zero_std": 0.25, "grad_norm": 0.041502755135297775, "kl": 0.7749587614089251, "learning_rate": 7.99954630151993e-06, "loss": -0.063, "num_tokens": 28799058.0, "reward": 1.1137733459472656, "reward_std": 0.7271196246147156, "rewards/rollout_reward_func/mean": 1.1137733459472656, "rewards/rollout_reward_func/std": 0.7271196246147156, "sampling/importance_sampling_ratio/max": 1.1143178939819336, "sampling/importance_sampling_ratio/mean": 0.8962124586105347, "sampling/importance_sampling_ratio/min": 2.892007557875331e-07, "sampling/sampling_logp_difference/max": 2.862084150314331, "sampling/sampling_logp_difference/mean": 0.17926745116710663, "step": 1143, "step_time": 22.932456126000034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8888403521850705, "epoch": 0.01144, "grad_norm": 0.04314439371228218, "kl": 0.8199003394693136, "learning_rate": 7.99954548148029e-06, "loss": -0.0631, "step": 1144, "step_time": 11.40511423695716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.185185432434082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3736863322556019, "epoch": 0.01145, "frac_reward_zero_std": 0.0, "grad_norm": 0.03471096605062485, "kl": 0.3701706063002348, "learning_rate": 7.999544660700282e-06, "loss": -0.0747, "num_tokens": 28856960.0, "reward": 0.6775282621383667, "reward_std": 0.8269968032836914, "rewards/rollout_reward_func/mean": 0.6775282621383667, "rewards/rollout_reward_func/std": 0.8269967436790466, "sampling/importance_sampling_ratio/max": 1.3419514894485474, "sampling/importance_sampling_ratio/mean": 0.778494119644165, "sampling/importance_sampling_ratio/min": 8.394346878048964e-06, "sampling/sampling_logp_difference/max": 1.9298536777496338, "sampling/sampling_logp_difference/mean": 0.2878992259502411, "step": 1145, "step_time": 33.326084500033176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.3795894030481577, "epoch": 0.01146, "grad_norm": 0.03636859729886055, "kl": 0.3669969104230404, "learning_rate": 7.999543839179908e-06, "loss": -0.0747, "step": 1146, "step_time": 15.756995538045885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 4.043478488922119, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.944906341843307, "epoch": 0.01147, "frac_reward_zero_std": 0.0, "grad_norm": 0.06705453991889954, "kl": 0.5090720057487488, "learning_rate": 7.999543016919168e-06, "loss": -0.0791, "num_tokens": 28921507.0, "reward": 0.5649512410163879, "reward_std": 0.8857100605964661, "rewards/rollout_reward_func/mean": 0.5649512410163879, "rewards/rollout_reward_func/std": 0.8857100605964661, "sampling/importance_sampling_ratio/max": 1.5771644115447998, "sampling/importance_sampling_ratio/mean": 0.7501393556594849, "sampling/importance_sampling_ratio/min": 3.1099855846150604e-07, "sampling/sampling_logp_difference/max": 1.958561658859253, "sampling/sampling_logp_difference/mean": 0.34808099269866943, "step": 1147, "step_time": 32.690501651057275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9495217576622963, "epoch": 0.01148, "grad_norm": 0.065544992685318, "kl": 0.5288780322298408, "learning_rate": 7.99954219391806e-06, "loss": -0.0792, "step": 1148, "step_time": 16.8804549210181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.799999713897705, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.028648978099227, "epoch": 0.01149, "frac_reward_zero_std": 0.0, "grad_norm": 0.12279189378023148, "kl": 0.2857389412820339, "learning_rate": 7.999541370176585e-06, "loss": -0.0738, "num_tokens": 28968382.0, "reward": -0.23292408883571625, "reward_std": 0.5415806174278259, "rewards/rollout_reward_func/mean": -0.23292408883571625, "rewards/rollout_reward_func/std": 0.5415806174278259, "sampling/importance_sampling_ratio/max": 1.1482524871826172, "sampling/importance_sampling_ratio/mean": 0.6090469360351562, "sampling/importance_sampling_ratio/min": 4.4907242369163214e-08, "sampling/sampling_logp_difference/max": 2.039273262023926, "sampling/sampling_logp_difference/mean": 0.37145769596099854, "step": 1149, "step_time": 26.86522878994583 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 2.0189263690263033, "epoch": 0.0115, "grad_norm": 0.11932036280632019, "kl": 0.2862999141216278, "learning_rate": 7.999540545694743e-06, "loss": -0.0742, "step": 1150, "step_time": 12.49395702194306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7381248511373997, "epoch": 0.01151, "frac_reward_zero_std": 0.0, "grad_norm": 0.18305085599422455, "kl": 0.5496522597968578, "learning_rate": 7.999539720472537e-06, "loss": -0.0813, "num_tokens": 29026913.0, "reward": 0.6838446855545044, "reward_std": 0.8757977485656738, "rewards/rollout_reward_func/mean": 0.6838446855545044, "rewards/rollout_reward_func/std": 0.8757978081703186, "sampling/importance_sampling_ratio/max": 1.377885103225708, "sampling/importance_sampling_ratio/mean": 0.7393343448638916, "sampling/importance_sampling_ratio/min": 9.11524411506548e-10, "sampling/sampling_logp_difference/max": 2.5456061363220215, "sampling/sampling_logp_difference/mean": 0.44416022300720215, "step": 1151, "step_time": 29.84505172300851 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 1.729506753385067, "epoch": 0.01152, "grad_norm": 0.1355818808078766, "kl": 0.6294887755066156, "learning_rate": 7.999538894509965e-06, "loss": -0.0821, "step": 1152, "step_time": 15.03859735498554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 5.359999656677246, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2307929322123528, "epoch": 0.01153, "frac_reward_zero_std": 0.0, "grad_norm": 0.08178479969501495, "kl": 0.3396254535764456, "learning_rate": 7.999538067807026e-06, "loss": -0.0851, "num_tokens": 29092562.0, "reward": 0.32614952325820923, "reward_std": 0.903400719165802, "rewards/rollout_reward_func/mean": 0.32614952325820923, "rewards/rollout_reward_func/std": 0.903400719165802, "sampling/importance_sampling_ratio/max": 1.0816587209701538, "sampling/importance_sampling_ratio/mean": 0.5488303303718567, "sampling/importance_sampling_ratio/min": 1.099873303544996e-09, "sampling/sampling_logp_difference/max": 2.5549373626708984, "sampling/sampling_logp_difference/mean": 0.4006088674068451, "step": 1153, "step_time": 31.882570879970444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 2.2250209152698517, "epoch": 0.01154, "grad_norm": 0.0828624963760376, "kl": 0.34476254880428314, "learning_rate": 7.999537240363722e-06, "loss": -0.0851, "step": 1154, "step_time": 14.139531674067257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.928571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3635074067860842, "epoch": 0.01155, "frac_reward_zero_std": 0.0, "grad_norm": 0.22508887946605682, "kl": 0.4554061535745859, "learning_rate": 7.999536412180054e-06, "loss": -0.0793, "num_tokens": 29143561.0, "reward": 0.6641652584075928, "reward_std": 0.8810175061225891, "rewards/rollout_reward_func/mean": 0.6641652584075928, "rewards/rollout_reward_func/std": 0.8810175061225891, "sampling/importance_sampling_ratio/max": 1.3157501220703125, "sampling/importance_sampling_ratio/mean": 0.7339812517166138, "sampling/importance_sampling_ratio/min": 0.00012265043915249407, "sampling/sampling_logp_difference/max": 2.0937116146087646, "sampling/sampling_logp_difference/mean": 0.26212650537490845, "step": 1155, "step_time": 25.15598736301763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.3574711345136166, "epoch": 0.01156, "grad_norm": 0.11285072565078735, "kl": 0.43243440985679626, "learning_rate": 7.99953558325602e-06, "loss": -0.0802, "step": 1156, "step_time": 11.803849271993386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.279999732971191, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.206005111336708, "epoch": 0.01157, "frac_reward_zero_std": 0.0, "grad_norm": 0.47113072872161865, "kl": 1.6671564299613237, "learning_rate": 7.999534753591622e-06, "loss": -0.0992, "num_tokens": 29198437.0, "reward": 0.5203148126602173, "reward_std": 0.8717857003211975, "rewards/rollout_reward_func/mean": 0.5203148126602173, "rewards/rollout_reward_func/std": 0.8717857599258423, "sampling/importance_sampling_ratio/max": 1.1012309789657593, "sampling/importance_sampling_ratio/mean": 0.5793471336364746, "sampling/importance_sampling_ratio/min": 4.036655525396782e-07, "sampling/sampling_logp_difference/max": 2.3375658988952637, "sampling/sampling_logp_difference/mean": 0.4255361557006836, "step": 1157, "step_time": 28.701183262019185 }, { "clip_ratio/high_max": 0.045138888992369175, "clip_ratio/high_mean": 0.022569444496184587, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022569444496184587, "entropy": 2.17034250497818, "epoch": 0.01158, "grad_norm": 0.13535061478614807, "kl": 1.424778993241489, "learning_rate": 7.999533923186858e-06, "loss": -0.1016, "step": 1158, "step_time": 13.899260736012366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.700000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9847514359280467, "epoch": 0.01159, "frac_reward_zero_std": 0.0, "grad_norm": 0.058308109641075134, "kl": 0.2826336231082678, "learning_rate": 7.99953309204173e-06, "loss": -0.0813, "num_tokens": 29247567.0, "reward": 0.8937000036239624, "reward_std": 0.8198422789573669, "rewards/rollout_reward_func/mean": 0.8937000036239624, "rewards/rollout_reward_func/std": 0.8198422789573669, "sampling/importance_sampling_ratio/max": 1.0959010124206543, "sampling/importance_sampling_ratio/mean": 0.8484035730361938, "sampling/importance_sampling_ratio/min": 0.0005356877809390426, "sampling/sampling_logp_difference/max": 2.0158753395080566, "sampling/sampling_logp_difference/mean": 0.22031252086162567, "step": 1159, "step_time": 26.161127339000814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.97753069922328, "epoch": 0.0116, "grad_norm": 0.04654529690742493, "kl": 0.2925539165735245, "learning_rate": 7.999532260156239e-06, "loss": -0.0815, "step": 1160, "step_time": 13.368530728999758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0999517599120736, "epoch": 0.01161, "frac_reward_zero_std": 0.0, "grad_norm": 0.11147695779800415, "kl": 0.6544353673234582, "learning_rate": 7.999531427530382e-06, "loss": -0.0925, "num_tokens": 29312171.0, "reward": 0.7272863984107971, "reward_std": 0.8115964531898499, "rewards/rollout_reward_func/mean": 0.7272863984107971, "rewards/rollout_reward_func/std": 0.8115964531898499, "sampling/importance_sampling_ratio/max": 1.1088327169418335, "sampling/importance_sampling_ratio/mean": 0.5826045870780945, "sampling/importance_sampling_ratio/min": 5.077064884062565e-07, "sampling/sampling_logp_difference/max": 2.148916006088257, "sampling/sampling_logp_difference/mean": 0.3893384337425232, "step": 1161, "step_time": 31.95774592793896 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 2.1006786851212382, "epoch": 0.01162, "grad_norm": 0.07080685347318649, "kl": 0.5904875406995416, "learning_rate": 7.999530594164162e-06, "loss": -0.0931, "step": 1162, "step_time": 16.401492162956856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.100873644463718, "epoch": 0.01163, "frac_reward_zero_std": 0.0, "grad_norm": 0.20386303961277008, "kl": 0.7890944499522448, "learning_rate": 7.999529760057578e-06, "loss": -0.0662, "num_tokens": 29370285.0, "reward": 1.1474183797836304, "reward_std": 0.4854794442653656, "rewards/rollout_reward_func/mean": 1.1474183797836304, "rewards/rollout_reward_func/std": 0.4854794442653656, "sampling/importance_sampling_ratio/max": 1.3839365243911743, "sampling/importance_sampling_ratio/mean": 0.855322539806366, "sampling/importance_sampling_ratio/min": 4.334300228947541e-06, "sampling/sampling_logp_difference/max": 1.8243603706359863, "sampling/sampling_logp_difference/mean": 0.23055881261825562, "step": 1163, "step_time": 25.101404172950424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0889451373368502, "epoch": 0.01164, "grad_norm": 0.24031934142112732, "kl": 0.7644859459251165, "learning_rate": 7.999528925210631e-06, "loss": -0.067, "step": 1164, "step_time": 13.56599345698487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6048184605315328, "epoch": 0.01165, "frac_reward_zero_std": 0.5, "grad_norm": 0.03922462463378906, "kl": 0.2826161775738001, "learning_rate": 7.999528089623321e-06, "loss": -0.0421, "num_tokens": 29421019.0, "reward": 0.5439841151237488, "reward_std": 0.8785635232925415, "rewards/rollout_reward_func/mean": 0.5439841151237488, "rewards/rollout_reward_func/std": 0.8785635232925415, "sampling/importance_sampling_ratio/max": 1.0594605207443237, "sampling/importance_sampling_ratio/mean": 0.7293480038642883, "sampling/importance_sampling_ratio/min": 1.3556680222848172e-08, "sampling/sampling_logp_difference/max": 2.595201015472412, "sampling/sampling_logp_difference/mean": 0.4070305824279785, "step": 1165, "step_time": 27.629843003029237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.606285865418613, "epoch": 0.01166, "grad_norm": 0.044649045914411545, "kl": 0.2769528403878212, "learning_rate": 7.999527253295647e-06, "loss": -0.0423, "step": 1166, "step_time": 14.483627002045978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 5.193548202514648, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2556651309132576, "epoch": 0.01167, "frac_reward_zero_std": 0.0, "grad_norm": 0.04519518464803696, "kl": 0.450175404548645, "learning_rate": 7.99952641622761e-06, "loss": -0.0832, "num_tokens": 29466737.0, "reward": 0.6807021498680115, "reward_std": 0.8733326196670532, "rewards/rollout_reward_func/mean": 0.6807021498680115, "rewards/rollout_reward_func/std": 0.8733326196670532, "sampling/importance_sampling_ratio/max": 1.1379643678665161, "sampling/importance_sampling_ratio/mean": 0.7681499719619751, "sampling/importance_sampling_ratio/min": 0.00026525664725340903, "sampling/sampling_logp_difference/max": 1.9341226816177368, "sampling/sampling_logp_difference/mean": 0.2339651882648468, "step": 1167, "step_time": 30.30842731698067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.256263840943575, "epoch": 0.01168, "grad_norm": 0.047005198895931244, "kl": 0.45525331795215607, "learning_rate": 7.999525578419213e-06, "loss": -0.0831, "step": 1168, "step_time": 15.146493628970347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.239999771118164, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.755420264787972, "epoch": 0.01169, "frac_reward_zero_std": 0.0, "grad_norm": 0.052884023636579514, "kl": 0.3425041064620018, "learning_rate": 7.999524739870451e-06, "loss": -0.0715, "num_tokens": 29525383.0, "reward": 0.5051231384277344, "reward_std": 0.875386655330658, "rewards/rollout_reward_func/mean": 0.5051231384277344, "rewards/rollout_reward_func/std": 0.875386655330658, "sampling/importance_sampling_ratio/max": 1.251879334449768, "sampling/importance_sampling_ratio/mean": 0.7003111839294434, "sampling/importance_sampling_ratio/min": 2.192851752624847e-06, "sampling/sampling_logp_difference/max": 1.989992380142212, "sampling/sampling_logp_difference/mean": 0.3672078847885132, "step": 1169, "step_time": 26.9310657659953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7601290242746472, "epoch": 0.0117, "grad_norm": 0.0511772520840168, "kl": 0.33866982720792294, "learning_rate": 7.999523900581328e-06, "loss": -0.0716, "step": 1170, "step_time": 13.747375592036406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7977804318070412, "epoch": 0.01171, "frac_reward_zero_std": 0.0, "grad_norm": 0.12125994265079498, "kl": 0.41713523864746094, "learning_rate": 7.999523060551842e-06, "loss": -0.0456, "num_tokens": 29583868.0, "reward": 0.1853494942188263, "reward_std": 0.7290804386138916, "rewards/rollout_reward_func/mean": 0.1853494942188263, "rewards/rollout_reward_func/std": 0.7290804386138916, "sampling/importance_sampling_ratio/max": 1.217477798461914, "sampling/importance_sampling_ratio/mean": 0.7078735828399658, "sampling/importance_sampling_ratio/min": 9.632948660964757e-09, "sampling/sampling_logp_difference/max": 1.9625272750854492, "sampling/sampling_logp_difference/mean": 0.3921039402484894, "step": 1171, "step_time": 27.942986281996127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7855716161429882, "epoch": 0.01172, "grad_norm": 0.12229787558317184, "kl": 0.4234994500875473, "learning_rate": 7.999522219781996e-06, "loss": -0.046, "step": 1172, "step_time": 14.598038583993912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.379310131072998, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4961004182696342, "epoch": 0.01173, "frac_reward_zero_std": 0.0, "grad_norm": 0.08375044912099838, "kl": 0.30473888758569956, "learning_rate": 7.999521378271788e-06, "loss": -0.0847, "num_tokens": 29638674.0, "reward": 0.41079479455947876, "reward_std": 0.8231473565101624, "rewards/rollout_reward_func/mean": 0.41079479455947876, "rewards/rollout_reward_func/std": 0.8231473565101624, "sampling/importance_sampling_ratio/max": 1.3382558822631836, "sampling/importance_sampling_ratio/mean": 0.7594799995422363, "sampling/importance_sampling_ratio/min": 1.4666810557173449e-06, "sampling/sampling_logp_difference/max": 1.9803024530410767, "sampling/sampling_logp_difference/mean": 0.29335981607437134, "step": 1173, "step_time": 29.258942888962338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.4906298890709877, "epoch": 0.01174, "grad_norm": 0.03935740143060684, "kl": 0.31133562605828047, "learning_rate": 7.99952053602122e-06, "loss": -0.085, "step": 1174, "step_time": 15.543980657996144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2048192285001278, "epoch": 0.01175, "frac_reward_zero_std": 0.0, "grad_norm": 0.09931350499391556, "kl": 0.6097894366830587, "learning_rate": 7.999519693030286e-06, "loss": -0.0629, "num_tokens": 29702604.0, "reward": 0.9668591618537903, "reward_std": 0.6516844630241394, "rewards/rollout_reward_func/mean": 0.9668591618537903, "rewards/rollout_reward_func/std": 0.6516844630241394, "sampling/importance_sampling_ratio/max": 1.4617923498153687, "sampling/importance_sampling_ratio/mean": 0.8395920991897583, "sampling/importance_sampling_ratio/min": 2.4153294475581788e-08, "sampling/sampling_logp_difference/max": 2.3232369422912598, "sampling/sampling_logp_difference/mean": 0.31018733978271484, "step": 1175, "step_time": 25.791594128008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.206980524584651, "epoch": 0.01176, "grad_norm": 0.10237502306699753, "kl": 0.6143670827150345, "learning_rate": 7.999518849298995e-06, "loss": -0.0631, "step": 1176, "step_time": 13.738177584978985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.880000114440918, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1452485909685493, "epoch": 0.01177, "frac_reward_zero_std": 0.0, "grad_norm": 0.12186715751886368, "kl": 0.5045125056058168, "learning_rate": 7.999518004827344e-06, "loss": -0.0842, "num_tokens": 29760274.0, "reward": 0.5501746535301208, "reward_std": 0.9414211511611938, "rewards/rollout_reward_func/mean": 0.5501746535301208, "rewards/rollout_reward_func/std": 0.9414211511611938, "sampling/importance_sampling_ratio/max": 1.1199333667755127, "sampling/importance_sampling_ratio/mean": 0.6180053949356079, "sampling/importance_sampling_ratio/min": 1.9889798252847868e-08, "sampling/sampling_logp_difference/max": 2.3545327186584473, "sampling/sampling_logp_difference/mean": 0.45072999596595764, "step": 1177, "step_time": 33.36003744797199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 2.150829080492258, "epoch": 0.01178, "grad_norm": 0.11465168744325638, "kl": 0.4995476845651865, "learning_rate": 7.99951715961533e-06, "loss": -0.0845, "step": 1178, "step_time": 16.195590812974842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2324710953980684, "epoch": 0.01179, "frac_reward_zero_std": 0.0, "grad_norm": 0.09808818995952606, "kl": 0.30485634692013264, "learning_rate": 7.999516313662957e-06, "loss": -0.0722, "num_tokens": 29808050.0, "reward": 0.7788550853729248, "reward_std": 0.8875980973243713, "rewards/rollout_reward_func/mean": 0.7788550853729248, "rewards/rollout_reward_func/std": 0.8875980973243713, "sampling/importance_sampling_ratio/max": 1.0636351108551025, "sampling/importance_sampling_ratio/mean": 0.8351342678070068, "sampling/importance_sampling_ratio/min": 2.3626533618426038e-07, "sampling/sampling_logp_difference/max": 1.9747728109359741, "sampling/sampling_logp_difference/mean": 0.3497483730316162, "step": 1179, "step_time": 25.10980400297558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 1.2796379923820496, "epoch": 0.0118, "grad_norm": 0.06566229462623596, "kl": 0.3081505922600627, "learning_rate": 7.999515466970224e-06, "loss": -0.0727, "step": 1180, "step_time": 12.255630770989228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.192307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7230886593461037, "epoch": 0.01181, "frac_reward_zero_std": 0.0, "grad_norm": 0.06273259222507477, "kl": 0.22054286114871502, "learning_rate": 7.99951461953713e-06, "loss": -0.0634, "num_tokens": 29870140.0, "reward": 0.3945254683494568, "reward_std": 0.7813011407852173, "rewards/rollout_reward_func/mean": 0.3945254683494568, "rewards/rollout_reward_func/std": 0.7813012003898621, "sampling/importance_sampling_ratio/max": 1.1563186645507812, "sampling/importance_sampling_ratio/mean": 0.7554506063461304, "sampling/importance_sampling_ratio/min": 4.919899865285515e-08, "sampling/sampling_logp_difference/max": 2.4959330558776855, "sampling/sampling_logp_difference/mean": 0.35663923621177673, "step": 1181, "step_time": 29.520389226992847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.728789297863841, "epoch": 0.01182, "grad_norm": 0.06578200310468674, "kl": 0.22008861601352692, "learning_rate": 7.999513771363676e-06, "loss": -0.0633, "step": 1182, "step_time": 15.518012410029769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 5.037036895751953, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0813280940055847, "epoch": 0.01183, "frac_reward_zero_std": 0.0, "grad_norm": 0.07928889244794846, "kl": 0.4837647173553705, "learning_rate": 7.999512922449865e-06, "loss": -0.0516, "num_tokens": 29921852.0, "reward": 0.57100510597229, "reward_std": 0.9052812457084656, "rewards/rollout_reward_func/mean": 0.57100510597229, "rewards/rollout_reward_func/std": 0.9052811861038208, "sampling/importance_sampling_ratio/max": 1.1998878717422485, "sampling/importance_sampling_ratio/mean": 0.6592109799385071, "sampling/importance_sampling_ratio/min": 8.400677415920654e-07, "sampling/sampling_logp_difference/max": 2.432903289794922, "sampling/sampling_logp_difference/mean": 0.397929310798645, "step": 1183, "step_time": 26.714264549984364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.079191029071808, "epoch": 0.01184, "grad_norm": 0.08033516258001328, "kl": 0.4518809802830219, "learning_rate": 7.999512072795693e-06, "loss": -0.0519, "step": 1184, "step_time": 14.240458952990593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2648612698540092, "epoch": 0.01185, "frac_reward_zero_std": 0.0, "grad_norm": 0.027504967525601387, "kl": 0.2878111321479082, "learning_rate": 7.999511222401162e-06, "loss": -0.0613, "num_tokens": 29977453.0, "reward": 0.7349871397018433, "reward_std": 0.912380576133728, "rewards/rollout_reward_func/mean": 0.7349871397018433, "rewards/rollout_reward_func/std": 0.912380576133728, "sampling/importance_sampling_ratio/max": 1.1089905500411987, "sampling/importance_sampling_ratio/mean": 0.7570399045944214, "sampling/importance_sampling_ratio/min": 3.2290287776959303e-07, "sampling/sampling_logp_difference/max": 2.370418071746826, "sampling/sampling_logp_difference/mean": 0.30891624093055725, "step": 1185, "step_time": 27.527329556061886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2607661662623286, "epoch": 0.01186, "grad_norm": 0.029109356924891472, "kl": 0.29401449766010046, "learning_rate": 7.999510371266273e-06, "loss": -0.0612, "step": 1186, "step_time": 13.796446716994978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.91304349899292, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2862821742892265, "epoch": 0.01187, "frac_reward_zero_std": 0.25, "grad_norm": 0.13154169917106628, "kl": 0.2454141452908516, "learning_rate": 7.999509519391025e-06, "loss": -0.068, "num_tokens": 30030123.0, "reward": 0.4146604537963867, "reward_std": 0.886073887348175, "rewards/rollout_reward_func/mean": 0.4146604537963867, "rewards/rollout_reward_func/std": 0.886073887348175, "sampling/importance_sampling_ratio/max": 1.1725620031356812, "sampling/importance_sampling_ratio/mean": 0.6101367473602295, "sampling/importance_sampling_ratio/min": 3.770861678464854e-10, "sampling/sampling_logp_difference/max": 2.392514705657959, "sampling/sampling_logp_difference/mean": 0.48672741651535034, "step": 1187, "step_time": 29.57502379300422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2786015048623085, "epoch": 0.01188, "grad_norm": 0.12741173803806305, "kl": 0.2503729881718755, "learning_rate": 7.999508666775417e-06, "loss": -0.0683, "step": 1188, "step_time": 13.43472750496585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6520036030560732, "epoch": 0.01189, "frac_reward_zero_std": 0.0, "grad_norm": 0.04651067033410072, "kl": 0.2721628900617361, "learning_rate": 7.999507813419454e-06, "loss": -0.042, "num_tokens": 30087015.0, "reward": 0.4344579577445984, "reward_std": 0.9240584969520569, "rewards/rollout_reward_func/mean": 0.4344579577445984, "rewards/rollout_reward_func/std": 0.9240584969520569, "sampling/importance_sampling_ratio/max": 1.1641978025436401, "sampling/importance_sampling_ratio/mean": 0.7125180959701538, "sampling/importance_sampling_ratio/min": 2.9582569283093108e-08, "sampling/sampling_logp_difference/max": 2.5900797843933105, "sampling/sampling_logp_difference/mean": 0.3622360825538635, "step": 1189, "step_time": 26.63760803997866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.6406160183250904, "epoch": 0.0119, "grad_norm": 0.0463806688785553, "kl": 0.27009117044508457, "learning_rate": 7.999506959323131e-06, "loss": -0.0421, "step": 1190, "step_time": 13.558931508014211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4488564915955067, "epoch": 0.01191, "frac_reward_zero_std": 0.0, "grad_norm": 0.12436281144618988, "kl": 0.21183054707944393, "learning_rate": 7.999506104486449e-06, "loss": -0.0217, "num_tokens": 30140178.0, "reward": 0.42440205812454224, "reward_std": 0.8867207169532776, "rewards/rollout_reward_func/mean": 0.42440205812454224, "rewards/rollout_reward_func/std": 0.8867207169532776, "sampling/importance_sampling_ratio/max": 1.1422181129455566, "sampling/importance_sampling_ratio/mean": 0.7216401696205139, "sampling/importance_sampling_ratio/min": 3.9359424590657e-06, "sampling/sampling_logp_difference/max": 2.2157182693481445, "sampling/sampling_logp_difference/mean": 0.28443336486816406, "step": 1191, "step_time": 32.97776186102419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.459282866679132, "epoch": 0.01192, "grad_norm": 0.14408405125141144, "kl": 0.2093802271410823, "learning_rate": 7.99950524890941e-06, "loss": -0.0222, "step": 1192, "step_time": 16.993150832044194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 5.192307949066162, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1997949350625277, "epoch": 0.01193, "frac_reward_zero_std": 0.0, "grad_norm": 0.11006693542003632, "kl": 0.3029487617313862, "learning_rate": 7.999504392592015e-06, "loss": -0.0762, "num_tokens": 30203106.0, "reward": 0.28745368123054504, "reward_std": 0.8514378070831299, "rewards/rollout_reward_func/mean": 0.28745368123054504, "rewards/rollout_reward_func/std": 0.8514378070831299, "sampling/importance_sampling_ratio/max": 1.4291800260543823, "sampling/importance_sampling_ratio/mean": 0.620856761932373, "sampling/importance_sampling_ratio/min": 2.0748279894178268e-06, "sampling/sampling_logp_difference/max": 2.102796792984009, "sampling/sampling_logp_difference/mean": 0.4176936745643616, "step": 1193, "step_time": 29.177789744047914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1975214444100857, "epoch": 0.01194, "grad_norm": 0.10517391562461853, "kl": 0.3066371027380228, "learning_rate": 7.999503535534264e-06, "loss": -0.0766, "step": 1194, "step_time": 13.794382953987224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.159999847412109, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6453955313190818, "epoch": 0.01195, "frac_reward_zero_std": 0.0, "grad_norm": 0.0714726373553276, "kl": 0.8401844967156649, "learning_rate": 7.999502677736154e-06, "loss": -0.0831, "num_tokens": 30259072.0, "reward": 0.447262704372406, "reward_std": 0.9579675793647766, "rewards/rollout_reward_func/mean": 0.447262704372406, "rewards/rollout_reward_func/std": 0.9579675793647766, "sampling/importance_sampling_ratio/max": 1.1131048202514648, "sampling/importance_sampling_ratio/mean": 0.6173595786094666, "sampling/importance_sampling_ratio/min": 0.0004152908513788134, "sampling/sampling_logp_difference/max": 2.1274807453155518, "sampling/sampling_logp_difference/mean": 0.30394306778907776, "step": 1195, "step_time": 28.542182606033748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6354880090802908, "epoch": 0.01196, "grad_norm": 0.06367386877536774, "kl": 0.8824049215763807, "learning_rate": 7.999501819197687e-06, "loss": -0.0832, "step": 1196, "step_time": 14.422949196014088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9843959622085094, "epoch": 0.01197, "frac_reward_zero_std": 0.25, "grad_norm": 0.017693275585770607, "kl": 0.21476277336478233, "learning_rate": 7.999500959918865e-06, "loss": -0.064, "num_tokens": 30312368.0, "reward": 1.0879831314086914, "reward_std": 0.751436173915863, "rewards/rollout_reward_func/mean": 1.0879831314086914, "rewards/rollout_reward_func/std": 0.7514361143112183, "sampling/importance_sampling_ratio/max": 1.2340039014816284, "sampling/importance_sampling_ratio/mean": 0.8824689388275146, "sampling/importance_sampling_ratio/min": 1.5811810953891836e-05, "sampling/sampling_logp_difference/max": 1.7573063373565674, "sampling/sampling_logp_difference/mean": 0.21225441992282867, "step": 1197, "step_time": 27.64797363098478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9833015650510788, "epoch": 0.01198, "grad_norm": 0.016673097386956215, "kl": 0.21488841623067856, "learning_rate": 7.999500099899686e-06, "loss": -0.0641, "step": 1198, "step_time": 13.593689811008517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.521922285668552, "epoch": 0.01199, "frac_reward_zero_std": 0.0, "grad_norm": 0.022582849487662315, "kl": 0.3393735168501735, "learning_rate": 7.999499239140151e-06, "loss": -0.0937, "num_tokens": 30365197.0, "reward": 0.6375814080238342, "reward_std": 0.9036562442779541, "rewards/rollout_reward_func/mean": 0.6375814080238342, "rewards/rollout_reward_func/std": 0.9036562442779541, "sampling/importance_sampling_ratio/max": 1.1451982259750366, "sampling/importance_sampling_ratio/mean": 0.7876056432723999, "sampling/importance_sampling_ratio/min": 1.9628092218226811e-07, "sampling/sampling_logp_difference/max": 2.77103853225708, "sampling/sampling_logp_difference/mean": 0.355323851108551, "step": 1199, "step_time": 24.452873441041447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5187247265130281, "epoch": 0.012, "grad_norm": 0.02208641916513443, "kl": 0.3388615632429719, "learning_rate": 7.99949837764026e-06, "loss": -0.0937, "step": 1200, "step_time": 11.725977600901388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.3214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1712525030598044, "epoch": 0.01201, "frac_reward_zero_std": 0.25, "grad_norm": 0.02000686153769493, "kl": 0.3010570593178272, "learning_rate": 7.999497515400014e-06, "loss": -0.0486, "num_tokens": 30418641.0, "reward": 0.9147769212722778, "reward_std": 0.7821258306503296, "rewards/rollout_reward_func/mean": 0.9147769212722778, "rewards/rollout_reward_func/std": 0.7821258306503296, "sampling/importance_sampling_ratio/max": 1.1110726594924927, "sampling/importance_sampling_ratio/mean": 0.882010817527771, "sampling/importance_sampling_ratio/min": 6.429996535217697e-10, "sampling/sampling_logp_difference/max": 2.0842957496643066, "sampling/sampling_logp_difference/mean": 0.3141152262687683, "step": 1201, "step_time": 23.33445214002859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1704126498661935, "epoch": 0.01202, "grad_norm": 0.020444242283701897, "kl": 0.3055995758622885, "learning_rate": 7.99949665241941e-06, "loss": -0.0486, "step": 1202, "step_time": 12.187058514013188 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.814815044403076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8537814170122147, "epoch": 0.01203, "frac_reward_zero_std": 0.0, "grad_norm": 0.10384354740381241, "kl": 0.44420350715518, "learning_rate": 7.999495788698454e-06, "loss": -0.0656, "num_tokens": 30478706.0, "reward": 0.5362403988838196, "reward_std": 0.7979575395584106, "rewards/rollout_reward_func/mean": 0.5362403988838196, "rewards/rollout_reward_func/std": 0.7979575395584106, "sampling/importance_sampling_ratio/max": 1.2169064283370972, "sampling/importance_sampling_ratio/mean": 0.6688659191131592, "sampling/importance_sampling_ratio/min": 5.118212357047014e-05, "sampling/sampling_logp_difference/max": 2.3516392707824707, "sampling/sampling_logp_difference/mean": 0.36840495467185974, "step": 1203, "step_time": 27.22608237696113 }, { "clip_ratio/high_max": 0.021059782709926367, "clip_ratio/high_mean": 0.010529891354963183, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010529891354963183, "entropy": 1.8482940327376127, "epoch": 0.01204, "grad_norm": 0.1103176474571228, "kl": 0.4434547480195761, "learning_rate": 7.999494924237141e-06, "loss": -0.0656, "step": 1204, "step_time": 13.828485570993507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 5.037036895751953, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.710063274949789, "epoch": 0.01205, "frac_reward_zero_std": 0.25, "grad_norm": 0.1668229103088379, "kl": 0.5791033785790205, "learning_rate": 7.999494059035475e-06, "loss": -0.0743, "num_tokens": 30539960.0, "reward": 0.6623070240020752, "reward_std": 0.868299126625061, "rewards/rollout_reward_func/mean": 0.6623070240020752, "rewards/rollout_reward_func/std": 0.868299126625061, "sampling/importance_sampling_ratio/max": 1.6145490407943726, "sampling/importance_sampling_ratio/mean": 0.7087398767471313, "sampling/importance_sampling_ratio/min": 1.9023693312192336e-05, "sampling/sampling_logp_difference/max": 1.979546308517456, "sampling/sampling_logp_difference/mean": 0.3246166706085205, "step": 1205, "step_time": 34.67902971195872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6946041211485863, "epoch": 0.01206, "grad_norm": 0.12578003108501434, "kl": 0.6089415103197098, "learning_rate": 7.999493193093452e-06, "loss": -0.0752, "step": 1206, "step_time": 16.740052907000063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3361884141340852, "epoch": 0.01207, "frac_reward_zero_std": 0.25, "grad_norm": 0.08486081659793854, "kl": 0.19131870847195387, "learning_rate": 7.999492326411075e-06, "loss": -0.0639, "num_tokens": 30598024.0, "reward": 0.4089256525039673, "reward_std": 0.8475357890129089, "rewards/rollout_reward_func/mean": 0.4089256525039673, "rewards/rollout_reward_func/std": 0.8475357294082642, "sampling/importance_sampling_ratio/max": 1.1268168687820435, "sampling/importance_sampling_ratio/mean": 0.7800925970077515, "sampling/importance_sampling_ratio/min": 1.2747614164254628e-05, "sampling/sampling_logp_difference/max": 1.4281501770019531, "sampling/sampling_logp_difference/mean": 0.25521111488342285, "step": 1207, "step_time": 28.24246009500348 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.314791931770742, "epoch": 0.01208, "grad_norm": 0.05479757860302925, "kl": 0.19305296521633863, "learning_rate": 7.999491458988344e-06, "loss": -0.0643, "step": 1208, "step_time": 13.60730486200191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4002601811662316, "epoch": 0.01209, "frac_reward_zero_std": 0.0, "grad_norm": 0.1042499765753746, "kl": 0.20665345434099436, "learning_rate": 7.999490590825262e-06, "loss": -0.0836, "num_tokens": 30649160.0, "reward": 0.5727343559265137, "reward_std": 1.0184639692306519, "rewards/rollout_reward_func/mean": 0.5727343559265137, "rewards/rollout_reward_func/std": 1.0184639692306519, "sampling/importance_sampling_ratio/max": 1.5929045677185059, "sampling/importance_sampling_ratio/mean": 0.7952097058296204, "sampling/importance_sampling_ratio/min": 5.567660991800949e-06, "sampling/sampling_logp_difference/max": 2.142817974090576, "sampling/sampling_logp_difference/mean": 0.2930096983909607, "step": 1209, "step_time": 29.663552272948436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3816926423460245, "epoch": 0.0121, "grad_norm": 0.08633142709732056, "kl": 0.20849639270454645, "learning_rate": 7.999489721921822e-06, "loss": -0.0842, "step": 1210, "step_time": 13.529099131992552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9790845117531717, "epoch": 0.01211, "frac_reward_zero_std": 0.0, "grad_norm": 0.04171008989214897, "kl": 0.3036065101623535, "learning_rate": 7.999488852278032e-06, "loss": -0.0575, "num_tokens": 30704937.0, "reward": 0.3158425986766815, "reward_std": 0.7398615479469299, "rewards/rollout_reward_func/mean": 0.3158425986766815, "rewards/rollout_reward_func/std": 0.7398614883422852, "sampling/importance_sampling_ratio/max": 1.1439590454101562, "sampling/importance_sampling_ratio/mean": 0.9117237329483032, "sampling/importance_sampling_ratio/min": 2.9825564240582025e-08, "sampling/sampling_logp_difference/max": 1.8964592218399048, "sampling/sampling_logp_difference/mean": 0.24976882338523865, "step": 1211, "step_time": 25.633641686028568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9758967328816652, "epoch": 0.01212, "grad_norm": 0.04221814125776291, "kl": 0.29822794906795025, "learning_rate": 7.999487981893887e-06, "loss": -0.0576, "step": 1212, "step_time": 13.305247670999961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.78125, "completions/mean_terminated_length": 4.033333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5627277749590576, "epoch": 0.01213, "frac_reward_zero_std": 0.25, "grad_norm": 0.05040208250284195, "kl": 1.106504051014781, "learning_rate": 7.999487110769388e-06, "loss": -0.0536, "num_tokens": 30751603.0, "reward": 1.032221794128418, "reward_std": 0.7269097566604614, "rewards/rollout_reward_func/mean": 1.032221794128418, "rewards/rollout_reward_func/std": 0.7269096970558167, "sampling/importance_sampling_ratio/max": 1.1185539960861206, "sampling/importance_sampling_ratio/mean": 0.8787513971328735, "sampling/importance_sampling_ratio/min": 0.001880740630440414, "sampling/sampling_logp_difference/max": 1.7987251281738281, "sampling/sampling_logp_difference/mean": 0.12713094055652618, "step": 1213, "step_time": 22.837411869957577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.562241833191365, "epoch": 0.01214, "grad_norm": 0.0572972409427166, "kl": 1.1905736830085516, "learning_rate": 7.999486238904537e-06, "loss": -0.0535, "step": 1214, "step_time": 12.382148621953093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.107142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9254865408875048, "epoch": 0.01215, "frac_reward_zero_std": 0.25, "grad_norm": 0.058329638093709946, "kl": 0.8646571505814791, "learning_rate": 7.999485366299335e-06, "loss": -0.0677, "num_tokens": 30802700.0, "reward": 1.036309838294983, "reward_std": 0.8172475099563599, "rewards/rollout_reward_func/mean": 1.036309838294983, "rewards/rollout_reward_func/std": 0.8172474503517151, "sampling/importance_sampling_ratio/max": 1.0290441513061523, "sampling/importance_sampling_ratio/mean": 0.7955892086029053, "sampling/importance_sampling_ratio/min": 0.0002700081968214363, "sampling/sampling_logp_difference/max": 2.2173898220062256, "sampling/sampling_logp_difference/mean": 0.1994684338569641, "step": 1215, "step_time": 23.907699187053367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9263042255770415, "epoch": 0.01216, "grad_norm": 0.047595180571079254, "kl": 0.7830945355817676, "learning_rate": 7.999484492953778e-06, "loss": -0.068, "step": 1216, "step_time": 12.89445193100255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 4.366666793823242, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6842519398778677, "epoch": 0.01217, "frac_reward_zero_std": 0.25, "grad_norm": 0.01034089270979166, "kl": 0.32969539053738117, "learning_rate": 7.99948361886787e-06, "loss": -0.0448, "num_tokens": 30857537.0, "reward": 1.1278941631317139, "reward_std": 0.6739829182624817, "rewards/rollout_reward_func/mean": 1.1278941631317139, "rewards/rollout_reward_func/std": 0.6739829182624817, "sampling/importance_sampling_ratio/max": 1.0518351793289185, "sampling/importance_sampling_ratio/mean": 0.8968210816383362, "sampling/importance_sampling_ratio/min": 1.5257827726600226e-05, "sampling/sampling_logp_difference/max": 1.2847354412078857, "sampling/sampling_logp_difference/mean": 0.16855499148368835, "step": 1217, "step_time": 26.397920503048226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.685121797490865, "epoch": 0.01218, "grad_norm": 0.009582807309925556, "kl": 0.33479224517941475, "learning_rate": 7.99948274404161e-06, "loss": -0.0448, "step": 1218, "step_time": 13.800946111994563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.185185432434082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4452418321743608, "epoch": 0.01219, "frac_reward_zero_std": 0.0, "grad_norm": 0.14498978853225708, "kl": 0.20337572786957026, "learning_rate": 7.999481868474998e-06, "loss": -0.0824, "num_tokens": 30909799.0, "reward": 0.7004912495613098, "reward_std": 0.7244558334350586, "rewards/rollout_reward_func/mean": 0.7004912495613098, "rewards/rollout_reward_func/std": 0.7244558930397034, "sampling/importance_sampling_ratio/max": 1.1611275672912598, "sampling/importance_sampling_ratio/mean": 0.8318792581558228, "sampling/importance_sampling_ratio/min": 4.828931423617178e-07, "sampling/sampling_logp_difference/max": 1.759293556213379, "sampling/sampling_logp_difference/mean": 0.31505072116851807, "step": 1219, "step_time": 24.361306443082867 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.427313252352178, "epoch": 0.0122, "grad_norm": 0.09082330018281937, "kl": 0.20730648934841156, "learning_rate": 7.999480992168033e-06, "loss": -0.0829, "step": 1220, "step_time": 12.72269488600432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8656597919762135, "epoch": 0.01221, "frac_reward_zero_std": 0.25, "grad_norm": 0.11538010835647583, "kl": 0.25266470201313496, "learning_rate": 7.99948011512072e-06, "loss": -0.0354, "num_tokens": 30956496.0, "reward": 0.5253021717071533, "reward_std": 0.9575760960578918, "rewards/rollout_reward_func/mean": 0.5253021717071533, "rewards/rollout_reward_func/std": 0.9575760364532471, "sampling/importance_sampling_ratio/max": 1.8534531593322754, "sampling/importance_sampling_ratio/mean": 0.9199742674827576, "sampling/importance_sampling_ratio/min": 5.558575867325999e-05, "sampling/sampling_logp_difference/max": 1.7185466289520264, "sampling/sampling_logp_difference/mean": 0.20014508068561554, "step": 1221, "step_time": 24.611626184050692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.862997711636126, "epoch": 0.01222, "grad_norm": 0.11163728684186935, "kl": 0.2529229335486889, "learning_rate": 7.999479237333053e-06, "loss": -0.0354, "step": 1222, "step_time": 13.35647313503432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.96875, "completions/mean_terminated_length": 4.233333587646484, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6809639097191393, "epoch": 0.01223, "frac_reward_zero_std": 0.75, "grad_norm": 0.014061852358281612, "kl": 0.25394371896982193, "learning_rate": 7.999478358805037e-06, "loss": -0.0207, "num_tokens": 31000231.0, "reward": 0.6850413680076599, "reward_std": 0.9401091933250427, "rewards/rollout_reward_func/mean": 0.6850413680076599, "rewards/rollout_reward_func/std": 0.9401091933250427, "sampling/importance_sampling_ratio/max": 1.1190531253814697, "sampling/importance_sampling_ratio/mean": 0.93189537525177, "sampling/importance_sampling_ratio/min": 4.903030799141561e-07, "sampling/sampling_logp_difference/max": 2.2896578311920166, "sampling/sampling_logp_difference/mean": 0.1544726938009262, "step": 1223, "step_time": 18.749184348038398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6804157001897693, "epoch": 0.01224, "grad_norm": 0.013243469409644604, "kl": 0.2549999747425318, "learning_rate": 7.999477479536669e-06, "loss": -0.0206, "step": 1224, "step_time": 10.246617606986547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 4.21875, "completions/mean_terminated_length": 4.21875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.47478109947405756, "epoch": 0.01225, "frac_reward_zero_std": 0.25, "grad_norm": 0.11725137382745743, "kl": 0.7200828269124031, "learning_rate": 7.99947659952795e-06, "loss": -0.0569, "num_tokens": 31060189.0, "reward": 1.0381300449371338, "reward_std": 0.5481088757514954, "rewards/rollout_reward_func/mean": 1.0381300449371338, "rewards/rollout_reward_func/std": 0.5481088161468506, "sampling/importance_sampling_ratio/max": 1.0540636777877808, "sampling/importance_sampling_ratio/mean": 0.880059540271759, "sampling/importance_sampling_ratio/min": 8.204890946217347e-06, "sampling/sampling_logp_difference/max": 2.033184289932251, "sampling/sampling_logp_difference/mean": 0.15100277960300446, "step": 1225, "step_time": 25.02713855195907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.4800907790195197, "epoch": 0.01226, "grad_norm": 0.0520629808306694, "kl": 0.7762834466993809, "learning_rate": 7.999475718778881e-06, "loss": -0.0573, "step": 1226, "step_time": 14.505144086055225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.53125, "completions/mean_terminated_length": 4.161290168762207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5244288756512105, "epoch": 0.01227, "frac_reward_zero_std": 0.25, "grad_norm": 0.030054660513997078, "kl": 0.37347302958369255, "learning_rate": 7.999474837289462e-06, "loss": -0.0194, "num_tokens": 31102931.0, "reward": 0.5702133178710938, "reward_std": 0.752170741558075, "rewards/rollout_reward_func/mean": 0.5702133178710938, "rewards/rollout_reward_func/std": 0.752170741558075, "sampling/importance_sampling_ratio/max": 1.0778074264526367, "sampling/importance_sampling_ratio/mean": 0.9440829157829285, "sampling/importance_sampling_ratio/min": 5.730130148151602e-09, "sampling/sampling_logp_difference/max": 1.9825786352157593, "sampling/sampling_logp_difference/mean": 0.148529052734375, "step": 1227, "step_time": 22.321015299035935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5262879864312708, "epoch": 0.01228, "grad_norm": 0.03539319336414337, "kl": 0.3978155180811882, "learning_rate": 7.999473955059694e-06, "loss": -0.0193, "step": 1228, "step_time": 12.587215286912397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7252755942754447, "epoch": 0.01229, "frac_reward_zero_std": 0.5, "grad_norm": 0.025513838976621628, "kl": 0.3771744444966316, "learning_rate": 7.999473072089576e-06, "loss": -0.0432, "num_tokens": 31148290.0, "reward": 0.9708561301231384, "reward_std": 0.6011243462562561, "rewards/rollout_reward_func/mean": 0.9708561301231384, "rewards/rollout_reward_func/std": 0.6011244058609009, "sampling/importance_sampling_ratio/max": 1.0462881326675415, "sampling/importance_sampling_ratio/mean": 0.9235402345657349, "sampling/importance_sampling_ratio/min": 1.5425813444380765e-06, "sampling/sampling_logp_difference/max": 2.420654296875, "sampling/sampling_logp_difference/mean": 0.17150773108005524, "step": 1229, "step_time": 21.751084984018235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7249279632233083, "epoch": 0.0123, "grad_norm": 0.02233234792947769, "kl": 0.3624932412058115, "learning_rate": 7.999472188379108e-06, "loss": -0.0433, "step": 1230, "step_time": 11.289095685002394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.46875, "completions/mean_terminated_length": 4.379310131072998, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7921346672810614, "epoch": 0.01231, "frac_reward_zero_std": 0.25, "grad_norm": 0.014222775585949421, "kl": 0.3145897202193737, "learning_rate": 7.999471303928292e-06, "loss": -0.0633, "num_tokens": 31210647.0, "reward": 0.9029569029808044, "reward_std": 0.7528658509254456, "rewards/rollout_reward_func/mean": 0.9029569029808044, "rewards/rollout_reward_func/std": 0.7528658509254456, "sampling/importance_sampling_ratio/max": 1.2382242679595947, "sampling/importance_sampling_ratio/mean": 0.8786742687225342, "sampling/importance_sampling_ratio/min": 7.406350050587207e-05, "sampling/sampling_logp_difference/max": 1.7947659492492676, "sampling/sampling_logp_difference/mean": 0.22558197379112244, "step": 1231, "step_time": 28.961543378041824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7935010972432792, "epoch": 0.01232, "grad_norm": 0.014106698334217072, "kl": 0.31137985549867153, "learning_rate": 7.999470418737126e-06, "loss": -0.0633, "step": 1232, "step_time": 15.176173072890379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1528248256072402, "epoch": 0.01233, "frac_reward_zero_std": 0.0, "grad_norm": 0.024835174903273582, "kl": 0.36050926707684994, "learning_rate": 7.999469532805612e-06, "loss": -0.0638, "num_tokens": 31272458.0, "reward": 0.3520815670490265, "reward_std": 0.7627394795417786, "rewards/rollout_reward_func/mean": 0.3520815670490265, "rewards/rollout_reward_func/std": 0.7627394795417786, "sampling/importance_sampling_ratio/max": 1.1154571771621704, "sampling/importance_sampling_ratio/mean": 0.8038694858551025, "sampling/importance_sampling_ratio/min": 2.2224590168207214e-08, "sampling/sampling_logp_difference/max": 2.5092711448669434, "sampling/sampling_logp_difference/mean": 0.3037737011909485, "step": 1233, "step_time": 29.92558117301087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1565404529683292, "epoch": 0.01234, "grad_norm": 0.023142224177718163, "kl": 0.3516709506511688, "learning_rate": 7.999468646133747e-06, "loss": -0.0638, "step": 1234, "step_time": 14.615316615992924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3373371781781316, "epoch": 0.01235, "frac_reward_zero_std": 0.0, "grad_norm": 0.025342121720314026, "kl": 0.3859459478408098, "learning_rate": 7.999467758721536e-06, "loss": -0.073, "num_tokens": 31334581.0, "reward": 0.18531036376953125, "reward_std": 0.7124979496002197, "rewards/rollout_reward_func/mean": 0.18531036376953125, "rewards/rollout_reward_func/std": 0.712497889995575, "sampling/importance_sampling_ratio/max": 1.060667634010315, "sampling/importance_sampling_ratio/mean": 0.7674006819725037, "sampling/importance_sampling_ratio/min": 1.9217013687011786e-05, "sampling/sampling_logp_difference/max": 1.883750319480896, "sampling/sampling_logp_difference/mean": 0.22752851247787476, "step": 1235, "step_time": 30.133605832001194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3377464283257723, "epoch": 0.01236, "grad_norm": 0.03030630573630333, "kl": 0.37839688546955585, "learning_rate": 7.999466870568978e-06, "loss": -0.0731, "step": 1236, "step_time": 14.561257049936103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.107142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9594486150890589, "epoch": 0.01237, "frac_reward_zero_std": 0.0, "grad_norm": 0.039510492235422134, "kl": 0.2996851038187742, "learning_rate": 7.999465981676069e-06, "loss": -0.0415, "num_tokens": 31392266.0, "reward": 0.5716915130615234, "reward_std": 0.876197874546051, "rewards/rollout_reward_func/mean": 0.5716915130615234, "rewards/rollout_reward_func/std": 0.876197874546051, "sampling/importance_sampling_ratio/max": 1.0785545110702515, "sampling/importance_sampling_ratio/mean": 0.8529155254364014, "sampling/importance_sampling_ratio/min": 6.025972609791097e-09, "sampling/sampling_logp_difference/max": 2.2924296855926514, "sampling/sampling_logp_difference/mean": 0.2862446904182434, "step": 1237, "step_time": 24.59510062399204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9629601901397109, "epoch": 0.01238, "grad_norm": 0.047089654952287674, "kl": 0.2961181215941906, "learning_rate": 7.999465092042814e-06, "loss": -0.0416, "step": 1238, "step_time": 12.902474065020215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 4.59375, "completions/mean_terminated_length": 4.59375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6164593789726496, "epoch": 0.01239, "frac_reward_zero_std": 0.25, "grad_norm": 0.03126506879925728, "kl": 0.6896755117923021, "learning_rate": 7.999464201669212e-06, "loss": -0.0555, "num_tokens": 31428292.0, "reward": 0.8913070559501648, "reward_std": 0.7848905920982361, "rewards/rollout_reward_func/mean": 0.8913070559501648, "rewards/rollout_reward_func/std": 0.7848905920982361, "sampling/importance_sampling_ratio/max": 1.0481616258621216, "sampling/importance_sampling_ratio/mean": 0.93163001537323, "sampling/importance_sampling_ratio/min": 4.3744184949900955e-06, "sampling/sampling_logp_difference/max": 2.1298718452453613, "sampling/sampling_logp_difference/mean": 0.18317872285842896, "step": 1239, "step_time": 12.110628110007383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6205817814916372, "epoch": 0.0124, "grad_norm": 0.028495388105511665, "kl": 0.6670202687382698, "learning_rate": 7.999463310555263e-06, "loss": -0.0555, "step": 1240, "step_time": 6.823709691991098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5557794277556241, "epoch": 0.01241, "frac_reward_zero_std": 0.25, "grad_norm": 0.12250138074159622, "kl": 0.2836908809840679, "learning_rate": 7.999462418700966e-06, "loss": -0.0343, "num_tokens": 31483612.0, "reward": 1.0569407939910889, "reward_std": 0.6596825122833252, "rewards/rollout_reward_func/mean": 1.0569407939910889, "rewards/rollout_reward_func/std": 0.6596825122833252, "sampling/importance_sampling_ratio/max": 1.205315351486206, "sampling/importance_sampling_ratio/mean": 0.9226169586181641, "sampling/importance_sampling_ratio/min": 0.0003279933880548924, "sampling/sampling_logp_difference/max": 1.466567873954773, "sampling/sampling_logp_difference/mean": 0.11028286814689636, "step": 1241, "step_time": 26.850618007039884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.5634440425783396, "epoch": 0.01242, "grad_norm": 0.02937089279294014, "kl": 0.2829643711447716, "learning_rate": 7.999461526106323e-06, "loss": -0.0347, "step": 1242, "step_time": 13.933679815003416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.482758522033691, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.762713959440589, "epoch": 0.01243, "frac_reward_zero_std": 0.0, "grad_norm": 0.013653329573571682, "kl": 0.34549641236662865, "learning_rate": 7.999460632771332e-06, "loss": -0.0388, "num_tokens": 31534395.0, "reward": 0.4609614908695221, "reward_std": 0.8303871750831604, "rewards/rollout_reward_func/mean": 0.4609614908695221, "rewards/rollout_reward_func/std": 0.8303872346878052, "sampling/importance_sampling_ratio/max": 1.0865213871002197, "sampling/importance_sampling_ratio/mean": 0.8670123815536499, "sampling/importance_sampling_ratio/min": 9.988209058064967e-05, "sampling/sampling_logp_difference/max": 1.7595715522766113, "sampling/sampling_logp_difference/mean": 0.14382052421569824, "step": 1243, "step_time": 23.718430795008317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7632426247000694, "epoch": 0.01244, "grad_norm": 0.014451298862695694, "kl": 0.3372999597340822, "learning_rate": 7.999459738695996e-06, "loss": -0.0388, "step": 1244, "step_time": 11.908761997066904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.222222328186035, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7326065069064498, "epoch": 0.01245, "frac_reward_zero_std": 0.25, "grad_norm": 0.10621494054794312, "kl": 0.6940189450979233, "learning_rate": 7.999458843880313e-06, "loss": -0.0651, "num_tokens": 31588568.0, "reward": 0.604876697063446, "reward_std": 0.8200282454490662, "rewards/rollout_reward_func/mean": 0.604876697063446, "rewards/rollout_reward_func/std": 0.8200282454490662, "sampling/importance_sampling_ratio/max": 1.0934802293777466, "sampling/importance_sampling_ratio/mean": 0.7251678705215454, "sampling/importance_sampling_ratio/min": 3.012594174123251e-09, "sampling/sampling_logp_difference/max": 1.9059499502182007, "sampling/sampling_logp_difference/mean": 0.35662683844566345, "step": 1245, "step_time": 25.415978344040923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7473620837554336, "epoch": 0.01246, "grad_norm": 0.10314030200242996, "kl": 0.6662653926759958, "learning_rate": 7.999457948324285e-06, "loss": -0.0654, "step": 1246, "step_time": 13.304109025018988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1664094403386116, "epoch": 0.01247, "frac_reward_zero_std": 0.0, "grad_norm": 0.26037928462028503, "kl": 1.0321468971669674, "learning_rate": 7.99945705202791e-06, "loss": -0.0557, "num_tokens": 31633306.0, "reward": 0.40359967947006226, "reward_std": 0.9001310467720032, "rewards/rollout_reward_func/mean": 0.40359967947006226, "rewards/rollout_reward_func/std": 0.9001310467720032, "sampling/importance_sampling_ratio/max": 1.6802735328674316, "sampling/importance_sampling_ratio/mean": 0.8901987075805664, "sampling/importance_sampling_ratio/min": 9.801119915664458e-09, "sampling/sampling_logp_difference/max": 2.2080271244049072, "sampling/sampling_logp_difference/mean": 0.29864683747291565, "step": 1247, "step_time": 23.132308319036383 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.164851845242083, "epoch": 0.01248, "grad_norm": 0.21625177562236786, "kl": 0.8888695575296879, "learning_rate": 7.999456154991191e-06, "loss": -0.0575, "step": 1248, "step_time": 12.41514119299245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 5.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5433075688779354, "epoch": 0.01249, "frac_reward_zero_std": 0.0, "grad_norm": 0.07364523410797119, "kl": 0.4273579064756632, "learning_rate": 7.999455257214127e-06, "loss": -0.065, "num_tokens": 31687157.0, "reward": 0.7403556108474731, "reward_std": 0.7689730525016785, "rewards/rollout_reward_func/mean": 0.7403556108474731, "rewards/rollout_reward_func/std": 0.7689729928970337, "sampling/importance_sampling_ratio/max": 1.076941967010498, "sampling/importance_sampling_ratio/mean": 0.7626111507415771, "sampling/importance_sampling_ratio/min": 5.311543427310994e-10, "sampling/sampling_logp_difference/max": 2.7793688774108887, "sampling/sampling_logp_difference/mean": 0.3894239068031311, "step": 1249, "step_time": 26.506826332974015 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "entropy": 1.5606033625081182, "epoch": 0.0125, "grad_norm": 0.0608273521065712, "kl": 0.3897369746118784, "learning_rate": 7.999454358696716e-06, "loss": -0.0654, "step": 1250, "step_time": 13.800747366040014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0509518571197987, "epoch": 0.01251, "frac_reward_zero_std": 0.0, "grad_norm": 0.08367928117513657, "kl": 0.7167922742664814, "learning_rate": 7.999453459438961e-06, "loss": -0.0857, "num_tokens": 31747165.0, "reward": 0.37658125162124634, "reward_std": 0.6300945281982422, "rewards/rollout_reward_func/mean": 0.37658125162124634, "rewards/rollout_reward_func/std": 0.6300944685935974, "sampling/importance_sampling_ratio/max": 1.1194705963134766, "sampling/importance_sampling_ratio/mean": 0.8365262746810913, "sampling/importance_sampling_ratio/min": 1.4577432011719793e-05, "sampling/sampling_logp_difference/max": 2.1955032348632812, "sampling/sampling_logp_difference/mean": 0.27801626920700073, "step": 1251, "step_time": 32.07735617499566 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 1.0667220624163747, "epoch": 0.01252, "grad_norm": 0.06831930577754974, "kl": 0.6338173355907202, "learning_rate": 7.999452559440863e-06, "loss": -0.0858, "step": 1252, "step_time": 16.298376963939518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4620849583297968, "epoch": 0.01253, "frac_reward_zero_std": 0.25, "grad_norm": 0.014572063460946083, "kl": 0.3158673606812954, "learning_rate": 7.999451658702418e-06, "loss": -0.0648, "num_tokens": 31798358.0, "reward": 0.4753444790840149, "reward_std": 0.8659231662750244, "rewards/rollout_reward_func/mean": 0.4753444790840149, "rewards/rollout_reward_func/std": 0.8659232258796692, "sampling/importance_sampling_ratio/max": 1.0844871997833252, "sampling/importance_sampling_ratio/mean": 0.7813266515731812, "sampling/importance_sampling_ratio/min": 4.0067988038572366e-08, "sampling/sampling_logp_difference/max": 2.189329147338867, "sampling/sampling_logp_difference/mean": 0.34238138794898987, "step": 1253, "step_time": 25.324461414042162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4707123450934887, "epoch": 0.01254, "grad_norm": 0.015484670177102089, "kl": 0.31435886584222317, "learning_rate": 7.999450757223633e-06, "loss": -0.0648, "step": 1254, "step_time": 12.71523431505193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 4.730769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4434481291100383, "epoch": 0.01255, "frac_reward_zero_std": 0.0, "grad_norm": 0.050098806619644165, "kl": 0.31925547076389194, "learning_rate": 7.9994498550045e-06, "loss": -0.0824, "num_tokens": 31861742.0, "reward": 0.3499888777732849, "reward_std": 0.7652558088302612, "rewards/rollout_reward_func/mean": 0.3499888777732849, "rewards/rollout_reward_func/std": 0.7652558088302612, "sampling/importance_sampling_ratio/max": 1.903880000114441, "sampling/importance_sampling_ratio/mean": 0.7545264959335327, "sampling/importance_sampling_ratio/min": 8.886484283721074e-05, "sampling/sampling_logp_difference/max": 1.959661602973938, "sampling/sampling_logp_difference/mean": 0.3036043047904968, "step": 1255, "step_time": 35.311353182041785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4574363753199577, "epoch": 0.01256, "grad_norm": 0.04966659098863602, "kl": 0.31746550695970654, "learning_rate": 7.999448952045025e-06, "loss": -0.0823, "step": 1256, "step_time": 18.798374808044173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.33470225520432, "epoch": 0.01257, "frac_reward_zero_std": 0.25, "grad_norm": 0.02504400722682476, "kl": 0.44286156073212624, "learning_rate": 7.999448048345206e-06, "loss": -0.0541, "num_tokens": 31915364.0, "reward": 0.9606766700744629, "reward_std": 0.7538092136383057, "rewards/rollout_reward_func/mean": 0.9606766700744629, "rewards/rollout_reward_func/std": 0.7538092732429504, "sampling/importance_sampling_ratio/max": 1.0956050157546997, "sampling/importance_sampling_ratio/mean": 0.811593770980835, "sampling/importance_sampling_ratio/min": 1.6090474508345665e-09, "sampling/sampling_logp_difference/max": 2.9883670806884766, "sampling/sampling_logp_difference/mean": 0.2976455092430115, "step": 1257, "step_time": 25.986907152022468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3355730678886175, "epoch": 0.01258, "grad_norm": 0.023714657872915268, "kl": 0.431119030341506, "learning_rate": 7.999447143905043e-06, "loss": -0.0541, "step": 1258, "step_time": 12.672273781063268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.965517044067383, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6238751150667667, "epoch": 0.01259, "frac_reward_zero_std": 0.25, "grad_norm": 0.05795522406697273, "kl": 0.32319523952901363, "learning_rate": 7.99944623872454e-06, "loss": -0.0805, "num_tokens": 31966561.0, "reward": 0.8726373314857483, "reward_std": 0.8380013704299927, "rewards/rollout_reward_func/mean": 0.8726373314857483, "rewards/rollout_reward_func/std": 0.8380014300346375, "sampling/importance_sampling_ratio/max": 1.0707371234893799, "sampling/importance_sampling_ratio/mean": 0.7468210458755493, "sampling/importance_sampling_ratio/min": 3.310191459604539e-06, "sampling/sampling_logp_difference/max": 2.206740379333496, "sampling/sampling_logp_difference/mean": 0.351232647895813, "step": 1259, "step_time": 27.220610767952166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.622360902838409, "epoch": 0.0126, "grad_norm": 0.056684646755456924, "kl": 0.32601687498390675, "learning_rate": 7.999445332803692e-06, "loss": -0.0805, "step": 1260, "step_time": 14.865573181974469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 4.761904716491699, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1068890914320946, "epoch": 0.01261, "frac_reward_zero_std": 0.0, "grad_norm": 0.028676502406597137, "kl": 0.3461852655746043, "learning_rate": 7.999444426142501e-06, "loss": -0.1034, "num_tokens": 32026721.0, "reward": 0.6578989028930664, "reward_std": 0.993954062461853, "rewards/rollout_reward_func/mean": 0.6578989028930664, "rewards/rollout_reward_func/std": 0.993954062461853, "sampling/importance_sampling_ratio/max": 1.1093969345092773, "sampling/importance_sampling_ratio/mean": 0.6135625839233398, "sampling/importance_sampling_ratio/min": 1.7969578891552374e-07, "sampling/sampling_logp_difference/max": 2.501493453979492, "sampling/sampling_logp_difference/mean": 0.41220617294311523, "step": 1261, "step_time": 29.883549282007152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.10370625089854, "epoch": 0.01262, "grad_norm": 0.026851536706089973, "kl": 0.33371306979097426, "learning_rate": 7.999443518740967e-06, "loss": -0.1036, "step": 1262, "step_time": 12.594525177031755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.130434989929199, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.083997407928109, "epoch": 0.01263, "frac_reward_zero_std": 0.0, "grad_norm": 0.1479387730360031, "kl": 0.498453626409173, "learning_rate": 7.999442610599094e-06, "loss": -0.087, "num_tokens": 32082616.0, "reward": 0.2938859164714813, "reward_std": 0.8871453404426575, "rewards/rollout_reward_func/mean": 0.2938859164714813, "rewards/rollout_reward_func/std": 0.8871453404426575, "sampling/importance_sampling_ratio/max": 1.1512115001678467, "sampling/importance_sampling_ratio/mean": 0.6249211430549622, "sampling/importance_sampling_ratio/min": 9.446462501117026e-10, "sampling/sampling_logp_difference/max": 2.5512490272521973, "sampling/sampling_logp_difference/mean": 0.3998541235923767, "step": 1263, "step_time": 29.543305723986123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0828962940722704, "epoch": 0.01264, "grad_norm": 0.13532955944538116, "kl": 0.5068298615515232, "learning_rate": 7.999441701716877e-06, "loss": -0.0877, "step": 1264, "step_time": 13.83235086305649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.869565486907959, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2181537598371506, "epoch": 0.01265, "frac_reward_zero_std": 0.0, "grad_norm": 0.10720459371805191, "kl": 0.3383202701807022, "learning_rate": 7.999440792094319e-06, "loss": -0.0903, "num_tokens": 32142686.0, "reward": 0.380035936832428, "reward_std": 0.928961992263794, "rewards/rollout_reward_func/mean": 0.380035936832428, "rewards/rollout_reward_func/std": 0.928961992263794, "sampling/importance_sampling_ratio/max": 1.144002079963684, "sampling/importance_sampling_ratio/mean": 0.5902698040008545, "sampling/importance_sampling_ratio/min": 4.749880488930103e-10, "sampling/sampling_logp_difference/max": 2.193359851837158, "sampling/sampling_logp_difference/mean": 0.40974974632263184, "step": 1265, "step_time": 28.78073847500491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.212021067738533, "epoch": 0.01266, "grad_norm": 0.10619927197694778, "kl": 0.3386642150580883, "learning_rate": 7.999439881731418e-06, "loss": -0.0905, "step": 1266, "step_time": 14.057777544978308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3065098822116852, "epoch": 0.01267, "frac_reward_zero_std": 0.0, "grad_norm": 0.0641975998878479, "kl": 0.3486221553757787, "learning_rate": 7.999438970628177e-06, "loss": -0.0813, "num_tokens": 32199776.0, "reward": 0.6688190698623657, "reward_std": 0.8205643892288208, "rewards/rollout_reward_func/mean": 0.6688190698623657, "rewards/rollout_reward_func/std": 0.8205643892288208, "sampling/importance_sampling_ratio/max": 1.1438915729522705, "sampling/importance_sampling_ratio/mean": 0.7998310327529907, "sampling/importance_sampling_ratio/min": 0.00016474384756293148, "sampling/sampling_logp_difference/max": 1.6928128004074097, "sampling/sampling_logp_difference/mean": 0.2561810314655304, "step": 1267, "step_time": 26.200096807006048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3060685209929943, "epoch": 0.01268, "grad_norm": 0.06651357561349869, "kl": 0.356151289306581, "learning_rate": 7.999438058784595e-06, "loss": -0.0813, "step": 1268, "step_time": 12.801263502042275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.0740742683410645, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4193603433668613, "epoch": 0.01269, "frac_reward_zero_std": 0.25, "grad_norm": 0.051803212612867355, "kl": 0.3703724257647991, "learning_rate": 7.999437146200673e-06, "loss": -0.0733, "num_tokens": 32250607.0, "reward": 1.0317986011505127, "reward_std": 0.6797657608985901, "rewards/rollout_reward_func/mean": 1.0317986011505127, "rewards/rollout_reward_func/std": 0.6797657012939453, "sampling/importance_sampling_ratio/max": 1.2172305583953857, "sampling/importance_sampling_ratio/mean": 0.8407789468765259, "sampling/importance_sampling_ratio/min": 6.390394879929318e-10, "sampling/sampling_logp_difference/max": 2.5111851692199707, "sampling/sampling_logp_difference/mean": 0.2922847270965576, "step": 1269, "step_time": 28.635092858952703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.416013508103788, "epoch": 0.0127, "grad_norm": 0.04988950490951538, "kl": 0.3845474701374769, "learning_rate": 7.99943623287641e-06, "loss": -0.0734, "step": 1270, "step_time": 13.74327394401189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.043572966940701, "epoch": 0.01271, "frac_reward_zero_std": 0.25, "grad_norm": 0.04135908931493759, "kl": 0.3886036239564419, "learning_rate": 7.999435318811804e-06, "loss": -0.0669, "num_tokens": 32298477.0, "reward": 0.9795235395431519, "reward_std": 0.7975299954414368, "rewards/rollout_reward_func/mean": 0.9795235395431519, "rewards/rollout_reward_func/std": 0.797529935836792, "sampling/importance_sampling_ratio/max": 1.0440675020217896, "sampling/importance_sampling_ratio/mean": 0.8331202268600464, "sampling/importance_sampling_ratio/min": 1.3307485460245516e-06, "sampling/sampling_logp_difference/max": 2.324720621109009, "sampling/sampling_logp_difference/mean": 0.2578190565109253, "step": 1271, "step_time": 28.192611632985063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.039041593670845, "epoch": 0.01272, "grad_norm": 0.036201536655426025, "kl": 0.3841166850179434, "learning_rate": 7.999434404006862e-06, "loss": -0.067, "step": 1272, "step_time": 14.979748004989233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9654583716765046, "epoch": 0.01273, "frac_reward_zero_std": 0.0, "grad_norm": 0.03310537710785866, "kl": 0.380545012652874, "learning_rate": 7.999433488461578e-06, "loss": -0.0807, "num_tokens": 32342499.0, "reward": 1.2241928577423096, "reward_std": 0.5359553694725037, "rewards/rollout_reward_func/mean": 1.2241928577423096, "rewards/rollout_reward_func/std": 0.5359553694725037, "sampling/importance_sampling_ratio/max": 1.1245815753936768, "sampling/importance_sampling_ratio/mean": 0.8813047409057617, "sampling/importance_sampling_ratio/min": 1.7732247961643743e-11, "sampling/sampling_logp_difference/max": 2.3562607765197754, "sampling/sampling_logp_difference/mean": 0.2924852967262268, "step": 1273, "step_time": 20.437366489961278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.9610408125445247, "epoch": 0.01274, "grad_norm": 0.03210471570491791, "kl": 0.4063938222825527, "learning_rate": 7.999432572175954e-06, "loss": -0.0808, "step": 1274, "step_time": 11.061266541015357 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4220159836113453, "epoch": 0.01275, "frac_reward_zero_std": 0.25, "grad_norm": 0.14619101583957672, "kl": 0.2472313866019249, "learning_rate": 7.999431655149993e-06, "loss": -0.0347, "num_tokens": 32388837.0, "reward": 0.8070189952850342, "reward_std": 0.8760681748390198, "rewards/rollout_reward_func/mean": 0.8070189952850342, "rewards/rollout_reward_func/std": 0.876068115234375, "sampling/importance_sampling_ratio/max": 1.0930535793304443, "sampling/importance_sampling_ratio/mean": 0.7504525780677795, "sampling/importance_sampling_ratio/min": 2.893311148000066e-06, "sampling/sampling_logp_difference/max": 2.1009035110473633, "sampling/sampling_logp_difference/mean": 0.2680560350418091, "step": 1275, "step_time": 23.926990834996104 }, { "clip_ratio/high_max": 0.044602273497730494, "clip_ratio/high_mean": 0.022301136748865247, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022301136748865247, "entropy": 1.4037243258208036, "epoch": 0.01276, "grad_norm": 0.0862957313656807, "kl": 0.24624152667820454, "learning_rate": 7.99943073738369e-06, "loss": -0.0354, "step": 1276, "step_time": 12.033092789934017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 4.304347991943359, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6467994078993797, "epoch": 0.01277, "frac_reward_zero_std": 0.0, "grad_norm": 0.0741247683763504, "kl": 0.4674170669168234, "learning_rate": 7.99942981887705e-06, "loss": -0.0781, "num_tokens": 32447229.0, "reward": 0.5391907095909119, "reward_std": 0.9579869508743286, "rewards/rollout_reward_func/mean": 0.5391907095909119, "rewards/rollout_reward_func/std": 0.9579869508743286, "sampling/importance_sampling_ratio/max": 1.07766854763031, "sampling/importance_sampling_ratio/mean": 0.6814926862716675, "sampling/importance_sampling_ratio/min": 7.098692731233314e-05, "sampling/sampling_logp_difference/max": 2.6439194679260254, "sampling/sampling_logp_difference/mean": 0.3158038258552551, "step": 1277, "step_time": 31.32366637996165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.641024399548769, "epoch": 0.01278, "grad_norm": 0.06819687783718109, "kl": 0.4575731363147497, "learning_rate": 7.99942889963007e-06, "loss": -0.078, "step": 1278, "step_time": 15.053228777978802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0109230382367969, "epoch": 0.01279, "frac_reward_zero_std": 0.25, "grad_norm": 0.1249396875500679, "kl": 0.9983568117022514, "learning_rate": 7.99942797964275e-06, "loss": -0.027, "num_tokens": 32499661.0, "reward": 0.9242802858352661, "reward_std": 0.7757692933082581, "rewards/rollout_reward_func/mean": 0.9242802858352661, "rewards/rollout_reward_func/std": 0.7757692337036133, "sampling/importance_sampling_ratio/max": 1.0936789512634277, "sampling/importance_sampling_ratio/mean": 0.875382661819458, "sampling/importance_sampling_ratio/min": 1.1369350773338116e-11, "sampling/sampling_logp_difference/max": 3.3066890239715576, "sampling/sampling_logp_difference/mean": 0.32199349999427795, "step": 1279, "step_time": 23.98353911601589 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.0153442886658013, "epoch": 0.0128, "grad_norm": 0.10172063857316971, "kl": 0.8542775940150023, "learning_rate": 7.999427058915096e-06, "loss": -0.0275, "step": 1280, "step_time": 12.738528733985731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.192307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.27138494187966, "epoch": 0.01281, "frac_reward_zero_std": 0.0, "grad_norm": 0.050062328577041626, "kl": 0.3125022202730179, "learning_rate": 7.9994261374471e-06, "loss": -0.0669, "num_tokens": 32554242.0, "reward": 0.7657596468925476, "reward_std": 0.7492973208427429, "rewards/rollout_reward_func/mean": 0.7657596468925476, "rewards/rollout_reward_func/std": 0.7492972612380981, "sampling/importance_sampling_ratio/max": 1.1394461393356323, "sampling/importance_sampling_ratio/mean": 0.8095406889915466, "sampling/importance_sampling_ratio/min": 2.3887476885420256e-08, "sampling/sampling_logp_difference/max": 2.3231029510498047, "sampling/sampling_logp_difference/mean": 0.34296637773513794, "step": 1281, "step_time": 30.18852193004568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2787944478914142, "epoch": 0.01282, "grad_norm": 0.05062057822942734, "kl": 0.3018018249422312, "learning_rate": 7.99942521523877e-06, "loss": -0.0672, "step": 1282, "step_time": 15.037982233043294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.689655303955078, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4988565873354673, "epoch": 0.01283, "frac_reward_zero_std": 0.25, "grad_norm": 0.1671205759048462, "kl": 0.3365529952570796, "learning_rate": 7.999424292290098e-06, "loss": -0.0604, "num_tokens": 32596359.0, "reward": -0.007052123546600342, "reward_std": 0.5715733170509338, "rewards/rollout_reward_func/mean": -0.007052123546600342, "rewards/rollout_reward_func/std": 0.5715733170509338, "sampling/importance_sampling_ratio/max": 1.128292441368103, "sampling/importance_sampling_ratio/mean": 0.771719217300415, "sampling/importance_sampling_ratio/min": 3.9287347135541495e-06, "sampling/sampling_logp_difference/max": 2.560964822769165, "sampling/sampling_logp_difference/mean": 0.30835703015327454, "step": 1283, "step_time": 22.870799017051468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5024850592017174, "epoch": 0.01284, "grad_norm": 0.1640441119670868, "kl": 0.3359303828328848, "learning_rate": 7.999423368601091e-06, "loss": -0.0607, "step": 1284, "step_time": 12.2168638369767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.888888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6908640153706074, "epoch": 0.01285, "frac_reward_zero_std": 0.0, "grad_norm": 0.11169964075088501, "kl": 0.8573457766324282, "learning_rate": 7.999422444171747e-06, "loss": -0.0871, "num_tokens": 32649329.0, "reward": 0.696423351764679, "reward_std": 0.9243254661560059, "rewards/rollout_reward_func/mean": 0.696423351764679, "rewards/rollout_reward_func/std": 0.9243254065513611, "sampling/importance_sampling_ratio/max": 1.2371808290481567, "sampling/importance_sampling_ratio/mean": 0.7507187724113464, "sampling/importance_sampling_ratio/min": 1.308871446781268e-07, "sampling/sampling_logp_difference/max": 2.2531161308288574, "sampling/sampling_logp_difference/mean": 0.3156334161758423, "step": 1285, "step_time": 26.437459912034683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6906710863113403, "epoch": 0.01286, "grad_norm": 0.10868008434772491, "kl": 0.8211007732897997, "learning_rate": 7.999421519002065e-06, "loss": -0.0872, "step": 1286, "step_time": 12.559999148972565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 4.736842155456543, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6220060884952545, "epoch": 0.01287, "frac_reward_zero_std": 0.0, "grad_norm": 0.05996469035744667, "kl": 0.36289098719134927, "learning_rate": 7.999420593092048e-06, "loss": -0.063, "num_tokens": 32703353.0, "reward": 0.06042696535587311, "reward_std": 0.8967229127883911, "rewards/rollout_reward_func/mean": 0.06042696535587311, "rewards/rollout_reward_func/std": 0.8967229127883911, "sampling/importance_sampling_ratio/max": 1.1212352514266968, "sampling/importance_sampling_ratio/mean": 0.4883217215538025, "sampling/importance_sampling_ratio/min": 1.3668010723222324e-08, "sampling/sampling_logp_difference/max": 2.490062713623047, "sampling/sampling_logp_difference/mean": 0.43957966566085815, "step": 1287, "step_time": 32.02311287497287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.620723471045494, "epoch": 0.01288, "grad_norm": 0.05791601538658142, "kl": 0.36762740835547447, "learning_rate": 7.999419666441693e-06, "loss": -0.063, "step": 1288, "step_time": 13.35386279199156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 5.0740742683410645, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6892509823665023, "epoch": 0.01289, "frac_reward_zero_std": 0.25, "grad_norm": 0.044993218034505844, "kl": 0.29081212170422077, "learning_rate": 7.999418739051001e-06, "loss": -0.0753, "num_tokens": 32761659.0, "reward": 0.5770307779312134, "reward_std": 0.7597915530204773, "rewards/rollout_reward_func/mean": 0.5770307779312134, "rewards/rollout_reward_func/std": 0.7597914934158325, "sampling/importance_sampling_ratio/max": 1.1532847881317139, "sampling/importance_sampling_ratio/mean": 0.7363117933273315, "sampling/importance_sampling_ratio/min": 5.211925113712823e-08, "sampling/sampling_logp_difference/max": 2.029681921005249, "sampling/sampling_logp_difference/mean": 0.3252028226852417, "step": 1289, "step_time": 29.13861073803855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6819781679660082, "epoch": 0.0129, "grad_norm": 0.04245605319738388, "kl": 0.28577502630650997, "learning_rate": 7.999417810919975e-06, "loss": -0.0753, "step": 1290, "step_time": 14.335397393995663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.462601199746132, "epoch": 0.01291, "frac_reward_zero_std": 0.0, "grad_norm": 0.10334353148937225, "kl": 0.3761893380433321, "learning_rate": 7.999416882048612e-06, "loss": -0.0752, "num_tokens": 32820659.0, "reward": 0.11240667849779129, "reward_std": 0.5101103782653809, "rewards/rollout_reward_func/mean": 0.11240667849779129, "rewards/rollout_reward_func/std": 0.5101103782653809, "sampling/importance_sampling_ratio/max": 1.1519490480422974, "sampling/importance_sampling_ratio/mean": 0.7152687311172485, "sampling/importance_sampling_ratio/min": 3.2035888580139726e-05, "sampling/sampling_logp_difference/max": 2.051187515258789, "sampling/sampling_logp_difference/mean": 0.3060470223426819, "step": 1291, "step_time": 30.307826503063552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4521410018205643, "epoch": 0.01292, "grad_norm": 0.09405336529016495, "kl": 0.3728594621643424, "learning_rate": 7.999415952436915e-06, "loss": -0.0756, "step": 1292, "step_time": 16.048733581992565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4154688566923141, "epoch": 0.01293, "frac_reward_zero_std": 0.0, "grad_norm": 0.10008393973112106, "kl": 0.2295845616608858, "learning_rate": 7.99941502208488e-06, "loss": -0.0557, "num_tokens": 32874543.0, "reward": -0.027369871735572815, "reward_std": 0.7251584529876709, "rewards/rollout_reward_func/mean": -0.027369871735572815, "rewards/rollout_reward_func/std": 0.7251583933830261, "sampling/importance_sampling_ratio/max": 1.1880179643630981, "sampling/importance_sampling_ratio/mean": 0.7213588953018188, "sampling/importance_sampling_ratio/min": 2.406641783636587e-07, "sampling/sampling_logp_difference/max": 1.9404487609863281, "sampling/sampling_logp_difference/mean": 0.3191789388656616, "step": 1293, "step_time": 28.438561529968865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4053100645542145, "epoch": 0.01294, "grad_norm": 0.09876804798841476, "kl": 0.22945496626198292, "learning_rate": 7.999414090992513e-06, "loss": -0.0559, "step": 1294, "step_time": 13.491058269981295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3722548536024988, "epoch": 0.01295, "frac_reward_zero_std": 0.25, "grad_norm": 0.03936873748898506, "kl": 0.35281043499708176, "learning_rate": 7.999413159159809e-06, "loss": -0.0517, "num_tokens": 32925531.0, "reward": 0.1972813904285431, "reward_std": 0.7206364870071411, "rewards/rollout_reward_func/mean": 0.1972813904285431, "rewards/rollout_reward_func/std": 0.7206364870071411, "sampling/importance_sampling_ratio/max": 1.1339585781097412, "sampling/importance_sampling_ratio/mean": 0.7222343683242798, "sampling/importance_sampling_ratio/min": 0.0002304328081663698, "sampling/sampling_logp_difference/max": 1.731791377067566, "sampling/sampling_logp_difference/mean": 0.25033730268478394, "step": 1295, "step_time": 25.62704585699248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.3749736780300736, "epoch": 0.01296, "grad_norm": 0.03804553672671318, "kl": 0.3402053341269493, "learning_rate": 7.999412226586771e-06, "loss": -0.0518, "step": 1296, "step_time": 12.769701702025486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.782608985900879, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7869009040296078, "epoch": 0.01297, "frac_reward_zero_std": 0.0, "grad_norm": 0.1927720457315445, "kl": 1.8232288723811507, "learning_rate": 7.999411293273398e-06, "loss": -0.0805, "num_tokens": 32980955.0, "reward": 0.39732030034065247, "reward_std": 0.8575975298881531, "rewards/rollout_reward_func/mean": 0.39732030034065247, "rewards/rollout_reward_func/std": 0.8575975298881531, "sampling/importance_sampling_ratio/max": 1.0732868909835815, "sampling/importance_sampling_ratio/mean": 0.537794291973114, "sampling/importance_sampling_ratio/min": 2.2216442019384885e-09, "sampling/sampling_logp_difference/max": 2.671164035797119, "sampling/sampling_logp_difference/mean": 0.5451306700706482, "step": 1297, "step_time": 29.820163592987228 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 2.792051427066326, "epoch": 0.01298, "grad_norm": 0.1746721714735031, "kl": 1.6930748298764229, "learning_rate": 7.999410359219692e-06, "loss": -0.0814, "step": 1298, "step_time": 13.779181866004365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9334735050797462, "epoch": 0.01299, "frac_reward_zero_std": 0.0, "grad_norm": 0.04219842702150345, "kl": 0.5059714894741774, "learning_rate": 7.999409424425651e-06, "loss": -0.087, "num_tokens": 33041156.0, "reward": 0.43237176537513733, "reward_std": 0.8937899470329285, "rewards/rollout_reward_func/mean": 0.43237176537513733, "rewards/rollout_reward_func/std": 0.8937899470329285, "sampling/importance_sampling_ratio/max": 1.2804745435714722, "sampling/importance_sampling_ratio/mean": 0.7069684267044067, "sampling/importance_sampling_ratio/min": 8.563674924744191e-08, "sampling/sampling_logp_difference/max": 2.185194969177246, "sampling/sampling_logp_difference/mean": 0.4048413336277008, "step": 1299, "step_time": 28.322112538007786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9428876340389252, "epoch": 0.013, "grad_norm": 0.04645809903740883, "kl": 0.48920871317386627, "learning_rate": 7.999408488891278e-06, "loss": -0.0869, "step": 1300, "step_time": 14.680227675969945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.90625, "completions/mean_terminated_length": 4.548387050628662, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9967289185151458, "epoch": 0.01301, "frac_reward_zero_std": 0.0, "grad_norm": 0.09930328279733658, "kl": 0.38005103543400764, "learning_rate": 7.99940755261657e-06, "loss": -0.0636, "num_tokens": 33094802.0, "reward": 0.9262996912002563, "reward_std": 0.6824420094490051, "rewards/rollout_reward_func/mean": 0.9262996912002563, "rewards/rollout_reward_func/std": 0.6824420690536499, "sampling/importance_sampling_ratio/max": 1.1218105554580688, "sampling/importance_sampling_ratio/mean": 0.8237243890762329, "sampling/importance_sampling_ratio/min": 5.604411126114428e-05, "sampling/sampling_logp_difference/max": 2.021263837814331, "sampling/sampling_logp_difference/mean": 0.23493097722530365, "step": 1301, "step_time": 26.307939148973674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0054091261699796, "epoch": 0.01302, "grad_norm": 0.09369000047445297, "kl": 0.36962447315454483, "learning_rate": 7.99940661560153e-06, "loss": -0.0637, "step": 1302, "step_time": 13.858354613970732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.133333683013916, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1336501650512218, "epoch": 0.01303, "frac_reward_zero_std": 0.0, "grad_norm": 0.13657020032405853, "kl": 0.6639980375766754, "learning_rate": 7.999405677846155e-06, "loss": -0.0691, "num_tokens": 33148979.0, "reward": 0.8204860091209412, "reward_std": 0.724336564540863, "rewards/rollout_reward_func/mean": 0.8204860091209412, "rewards/rollout_reward_func/std": 0.7243364453315735, "sampling/importance_sampling_ratio/max": 1.1529853343963623, "sampling/importance_sampling_ratio/mean": 0.7846258282661438, "sampling/importance_sampling_ratio/min": 4.432747857663344e-07, "sampling/sampling_logp_difference/max": 2.255703926086426, "sampling/sampling_logp_difference/mean": 0.24252092838287354, "step": 1303, "step_time": 23.678180270042503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1536552524194121, "epoch": 0.01304, "grad_norm": 0.13489815592765808, "kl": 0.6522859707474709, "learning_rate": 7.99940473935045e-06, "loss": -0.0698, "step": 1304, "step_time": 12.696948637021706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.965517044067383, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6077561574056745, "epoch": 0.01305, "frac_reward_zero_std": 0.0, "grad_norm": 0.04415476322174072, "kl": 0.3598881885409355, "learning_rate": 7.99940380011441e-06, "loss": -0.067, "num_tokens": 33204613.0, "reward": 0.6359463334083557, "reward_std": 0.8255017399787903, "rewards/rollout_reward_func/mean": 0.6359463334083557, "rewards/rollout_reward_func/std": 0.8255017399787903, "sampling/importance_sampling_ratio/max": 1.2694209814071655, "sampling/importance_sampling_ratio/mean": 0.7244728803634644, "sampling/importance_sampling_ratio/min": 3.4581785257614683e-07, "sampling/sampling_logp_difference/max": 2.179478168487549, "sampling/sampling_logp_difference/mean": 0.33439379930496216, "step": 1305, "step_time": 28.28015326001332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.62203876581043, "epoch": 0.01306, "grad_norm": 0.042459629476070404, "kl": 0.3323397906497121, "learning_rate": 7.999402860138038e-06, "loss": -0.0671, "step": 1306, "step_time": 15.418970138009172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.103448390960693, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9581192843616009, "epoch": 0.01307, "frac_reward_zero_std": 0.0, "grad_norm": 0.0664515346288681, "kl": 0.330798476934433, "learning_rate": 7.999401919421334e-06, "loss": -0.0769, "num_tokens": 33262042.0, "reward": 0.9909751415252686, "reward_std": 0.6672890186309814, "rewards/rollout_reward_func/mean": 0.9909751415252686, "rewards/rollout_reward_func/std": 0.6672890186309814, "sampling/importance_sampling_ratio/max": 1.1969577074050903, "sampling/importance_sampling_ratio/mean": 0.8436991572380066, "sampling/importance_sampling_ratio/min": 5.35237347776274e-07, "sampling/sampling_logp_difference/max": 1.8448375463485718, "sampling/sampling_logp_difference/mean": 0.22484612464904785, "step": 1307, "step_time": 26.179499300022144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9702439960092306, "epoch": 0.01308, "grad_norm": 0.05917583778500557, "kl": 0.3290192876011133, "learning_rate": 7.999400977964298e-06, "loss": -0.0769, "step": 1308, "step_time": 13.678233990940498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3725075535476208, "epoch": 0.01309, "frac_reward_zero_std": 0.25, "grad_norm": 0.05785517394542694, "kl": 0.2796243913471699, "learning_rate": 7.999400035766932e-06, "loss": -0.045, "num_tokens": 33317421.0, "reward": 0.8160649538040161, "reward_std": 0.7921950817108154, "rewards/rollout_reward_func/mean": 0.8160649538040161, "rewards/rollout_reward_func/std": 0.7921951413154602, "sampling/importance_sampling_ratio/max": 1.4244439601898193, "sampling/importance_sampling_ratio/mean": 0.7934210300445557, "sampling/importance_sampling_ratio/min": 1.1769534680183824e-08, "sampling/sampling_logp_difference/max": 2.115093231201172, "sampling/sampling_logp_difference/mean": 0.3397694230079651, "step": 1309, "step_time": 28.369529680989217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3917011059820652, "epoch": 0.0131, "grad_norm": 0.05983823537826538, "kl": 0.2792197670787573, "learning_rate": 7.999399092829233e-06, "loss": -0.0449, "step": 1310, "step_time": 13.931509570975322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7815140318125486, "epoch": 0.01311, "frac_reward_zero_std": 0.25, "grad_norm": 0.04868916794657707, "kl": 0.39787755347788334, "learning_rate": 7.999398149151204e-06, "loss": -0.035, "num_tokens": 33368581.0, "reward": 1.0111620426177979, "reward_std": 0.6921525001525879, "rewards/rollout_reward_func/mean": 1.0111620426177979, "rewards/rollout_reward_func/std": 0.6921525001525879, "sampling/importance_sampling_ratio/max": 1.1586703062057495, "sampling/importance_sampling_ratio/mean": 0.9476338624954224, "sampling/importance_sampling_ratio/min": 2.0567318870234885e-07, "sampling/sampling_logp_difference/max": 2.3361880779266357, "sampling/sampling_logp_difference/mean": 0.18815356492996216, "step": 1311, "step_time": 22.559147273976123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7864070115610957, "epoch": 0.01312, "grad_norm": 0.05194881930947304, "kl": 0.4175106342881918, "learning_rate": 7.999397204732844e-06, "loss": -0.0348, "step": 1312, "step_time": 11.612489021004876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.133333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8223643349483609, "epoch": 0.01313, "frac_reward_zero_std": 0.25, "grad_norm": 0.052859220653772354, "kl": 0.31408772617578506, "learning_rate": 7.999396259574152e-06, "loss": -0.0456, "num_tokens": 33411613.0, "reward": 1.036081075668335, "reward_std": 0.7425597906112671, "rewards/rollout_reward_func/mean": 1.036081075668335, "rewards/rollout_reward_func/std": 0.7425597906112671, "sampling/importance_sampling_ratio/max": 1.108534812927246, "sampling/importance_sampling_ratio/mean": 0.8954062461853027, "sampling/importance_sampling_ratio/min": 1.4199762290445506e-06, "sampling/sampling_logp_difference/max": 1.991320013999939, "sampling/sampling_logp_difference/mean": 0.17507395148277283, "step": 1313, "step_time": 22.09469936595997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8300333200022578, "epoch": 0.01314, "grad_norm": 0.09052306413650513, "kl": 0.31178586930036545, "learning_rate": 7.99939531367513e-06, "loss": -0.0459, "step": 1314, "step_time": 12.884420448011952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6639089211821556, "epoch": 0.01315, "frac_reward_zero_std": 0.75, "grad_norm": 0.10941191762685776, "kl": 0.2454399075359106, "learning_rate": 7.999394367035778e-06, "loss": 0.0001, "num_tokens": 33451528.0, "reward": 1.1181769371032715, "reward_std": 0.6723359227180481, "rewards/rollout_reward_func/mean": 1.1181769371032715, "rewards/rollout_reward_func/std": 0.6723359227180481, "sampling/importance_sampling_ratio/max": 1.0480469465255737, "sampling/importance_sampling_ratio/mean": 0.9044910669326782, "sampling/importance_sampling_ratio/min": 8.81879191894086e-09, "sampling/sampling_logp_difference/max": 2.761660575866699, "sampling/sampling_logp_difference/mean": 0.16350612044334412, "step": 1315, "step_time": 21.27511526097078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6765145659446716, "epoch": 0.01316, "grad_norm": 0.10720815509557724, "kl": 0.24707656726241112, "learning_rate": 7.999393419656096e-06, "loss": -0.0008, "step": 1316, "step_time": 12.293618545023492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7355028986930847, "epoch": 0.01317, "frac_reward_zero_std": 0.0, "grad_norm": 0.045959942042827606, "kl": 0.2378231007605791, "learning_rate": 7.999392471536084e-06, "loss": -0.0892, "num_tokens": 33501333.0, "reward": 0.732365071773529, "reward_std": 0.8776340484619141, "rewards/rollout_reward_func/mean": 0.732365071773529, "rewards/rollout_reward_func/std": 0.8776340484619141, "sampling/importance_sampling_ratio/max": 1.0688191652297974, "sampling/importance_sampling_ratio/mean": 0.7405785918235779, "sampling/importance_sampling_ratio/min": 5.4429774536401965e-06, "sampling/sampling_logp_difference/max": 1.9165419340133667, "sampling/sampling_logp_difference/mean": 0.3463534712791443, "step": 1317, "step_time": 26.147046578960726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7458273274824023, "epoch": 0.01318, "grad_norm": 0.04613911360502243, "kl": 0.23830080404877663, "learning_rate": 7.999391522675743e-06, "loss": -0.0893, "step": 1318, "step_time": 12.777051342010964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.633333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9173807483166456, "epoch": 0.01319, "frac_reward_zero_std": 0.25, "grad_norm": 0.06938741356134415, "kl": 0.32336399890482426, "learning_rate": 7.999390573075073e-06, "loss": -0.0202, "num_tokens": 33549236.0, "reward": 0.6890263557434082, "reward_std": 0.8512188196182251, "rewards/rollout_reward_func/mean": 0.6890263557434082, "rewards/rollout_reward_func/std": 0.8512188792228699, "sampling/importance_sampling_ratio/max": 1.2977800369262695, "sampling/importance_sampling_ratio/mean": 0.8402211666107178, "sampling/importance_sampling_ratio/min": 1.7785667694170115e-07, "sampling/sampling_logp_difference/max": 1.8386563062667847, "sampling/sampling_logp_difference/mean": 0.1984720081090927, "step": 1319, "step_time": 28.64132921898272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9247512016445398, "epoch": 0.0132, "grad_norm": 0.07674221694469452, "kl": 0.3221612721681595, "learning_rate": 7.999389622734072e-06, "loss": -0.0205, "step": 1320, "step_time": 14.141099488013424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3389350585639477, "epoch": 0.01321, "frac_reward_zero_std": 0.25, "grad_norm": 0.09586886316537857, "kl": 0.3811286613345146, "learning_rate": 7.999388671652745e-06, "loss": -0.058, "num_tokens": 33605110.0, "reward": 0.9717437624931335, "reward_std": 0.5924195051193237, "rewards/rollout_reward_func/mean": 0.9717437624931335, "rewards/rollout_reward_func/std": 0.5924195051193237, "sampling/importance_sampling_ratio/max": 1.050016164779663, "sampling/importance_sampling_ratio/mean": 0.7568252086639404, "sampling/importance_sampling_ratio/min": 7.997233097967182e-08, "sampling/sampling_logp_difference/max": 1.9209954738616943, "sampling/sampling_logp_difference/mean": 0.2646157741546631, "step": 1321, "step_time": 30.80346070905216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3420143499970436, "epoch": 0.01322, "grad_norm": 0.10013101994991302, "kl": 0.3862393945455551, "learning_rate": 7.999387719831088e-06, "loss": -0.0577, "step": 1322, "step_time": 16.543262118997518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 3.9599997997283936, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3938248977065086, "epoch": 0.01323, "frac_reward_zero_std": 0.25, "grad_norm": 0.0326581709086895, "kl": 0.2982332743704319, "learning_rate": 7.999386767269103e-06, "loss": -0.0663, "num_tokens": 33664906.0, "reward": 0.626079261302948, "reward_std": 0.9617969989776611, "rewards/rollout_reward_func/mean": 0.626079261302948, "rewards/rollout_reward_func/std": 0.9617969989776611, "sampling/importance_sampling_ratio/max": 1.0993614196777344, "sampling/importance_sampling_ratio/mean": 0.7758831977844238, "sampling/importance_sampling_ratio/min": 2.090553789457772e-05, "sampling/sampling_logp_difference/max": 1.6813212633132935, "sampling/sampling_logp_difference/mean": 0.28417056798934937, "step": 1323, "step_time": 31.105424184032017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3913776837289333, "epoch": 0.01324, "grad_norm": 0.032868392765522, "kl": 0.2982585374265909, "learning_rate": 7.999385813966789e-06, "loss": -0.0664, "step": 1324, "step_time": 15.612216923007509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.65625, "completions/mean_terminated_length": 4.178571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2935236543416977, "epoch": 0.01325, "frac_reward_zero_std": 0.0, "grad_norm": 0.14469724893569946, "kl": 0.9564192909747362, "learning_rate": 7.999384859924148e-06, "loss": -0.0737, "num_tokens": 33719984.0, "reward": 0.8087862730026245, "reward_std": 0.8233464956283569, "rewards/rollout_reward_func/mean": 0.8087862730026245, "rewards/rollout_reward_func/std": 0.8233464956283569, "sampling/importance_sampling_ratio/max": 1.1371829509735107, "sampling/importance_sampling_ratio/mean": 0.7808332443237305, "sampling/importance_sampling_ratio/min": 2.3934165938044316e-07, "sampling/sampling_logp_difference/max": 2.711132526397705, "sampling/sampling_logp_difference/mean": 0.3213544487953186, "step": 1325, "step_time": 25.045936685957713 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2900584805756807, "epoch": 0.01326, "grad_norm": 0.10423671454191208, "kl": 0.9740430880337954, "learning_rate": 7.99938390514118e-06, "loss": -0.074, "step": 1326, "step_time": 13.120074262027629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.222222328186035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5217710873112082, "epoch": 0.01327, "frac_reward_zero_std": 0.5, "grad_norm": 0.03252299129962921, "kl": 0.28551348112523556, "learning_rate": 7.999382949617882e-06, "loss": -0.0459, "num_tokens": 33766323.0, "reward": 0.609965443611145, "reward_std": 0.9154086709022522, "rewards/rollout_reward_func/mean": 0.609965443611145, "rewards/rollout_reward_func/std": 0.9154086112976074, "sampling/importance_sampling_ratio/max": 1.0921390056610107, "sampling/importance_sampling_ratio/mean": 0.7679243683815002, "sampling/importance_sampling_ratio/min": 7.483615860337522e-08, "sampling/sampling_logp_difference/max": 2.729738235473633, "sampling/sampling_logp_difference/mean": 0.3595602512359619, "step": 1327, "step_time": 25.289229828951648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5153812505304813, "epoch": 0.01328, "grad_norm": 0.031921807676553726, "kl": 0.2868460565805435, "learning_rate": 7.99938199335426e-06, "loss": -0.046, "step": 1328, "step_time": 13.42088701998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.90625, "completions/mean_terminated_length": 4.227272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.145745635032654, "epoch": 0.01329, "frac_reward_zero_std": 0.0, "grad_norm": 0.12126154452562332, "kl": 0.2898552846163511, "learning_rate": 7.999381036350309e-06, "loss": -0.0904, "num_tokens": 33834326.0, "reward": 0.37977010011672974, "reward_std": 0.8666717410087585, "rewards/rollout_reward_func/mean": 0.37977010011672974, "rewards/rollout_reward_func/std": 0.8666716814041138, "sampling/importance_sampling_ratio/max": 1.2298674583435059, "sampling/importance_sampling_ratio/mean": 0.6440245509147644, "sampling/importance_sampling_ratio/min": 4.0276360024904534e-10, "sampling/sampling_logp_difference/max": 2.6746373176574707, "sampling/sampling_logp_difference/mean": 0.4332870543003082, "step": 1329, "step_time": 31.34400717704557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.134868338704109, "epoch": 0.0133, "grad_norm": 0.14821456372737885, "kl": 0.295529767870903, "learning_rate": 7.999380078606032e-06, "loss": -0.0907, "step": 1330, "step_time": 14.823858235089574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.6875, "completions/mean_terminated_length": 4.322580337524414, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9198292335495353, "epoch": 0.01331, "frac_reward_zero_std": 0.0, "grad_norm": 0.16946963965892792, "kl": 0.7391394712030888, "learning_rate": 7.999379120121428e-06, "loss": -0.0619, "num_tokens": 33873839.0, "reward": 1.3002433776855469, "reward_std": 0.486829549074173, "rewards/rollout_reward_func/mean": 1.3002433776855469, "rewards/rollout_reward_func/std": 0.486829549074173, "sampling/importance_sampling_ratio/max": 1.136744499206543, "sampling/importance_sampling_ratio/mean": 0.8902738094329834, "sampling/importance_sampling_ratio/min": 1.5110565243503515e-08, "sampling/sampling_logp_difference/max": 2.2236852645874023, "sampling/sampling_logp_difference/mean": 0.2289884239435196, "step": 1331, "step_time": 17.733446927013574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9170281356200576, "epoch": 0.01332, "grad_norm": 0.15331290662288666, "kl": 0.7002407275140285, "learning_rate": 7.999378160896498e-06, "loss": -0.062, "step": 1332, "step_time": 9.77214529295452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.28125, "completions/mean_terminated_length": 4.1724138259887695, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9998172949999571, "epoch": 0.01333, "frac_reward_zero_std": 0.5, "grad_norm": 0.09112157672643661, "kl": 0.2642984502017498, "learning_rate": 7.999377200931244e-06, "loss": -0.0469, "num_tokens": 33926317.0, "reward": 0.5378268957138062, "reward_std": 0.7894269824028015, "rewards/rollout_reward_func/mean": 0.5378268957138062, "rewards/rollout_reward_func/std": 0.7894269227981567, "sampling/importance_sampling_ratio/max": 1.5043901205062866, "sampling/importance_sampling_ratio/mean": 0.8816560506820679, "sampling/importance_sampling_ratio/min": 3.76981079170946e-05, "sampling/sampling_logp_difference/max": 1.785982608795166, "sampling/sampling_logp_difference/mean": 0.21056777238845825, "step": 1333, "step_time": 27.797620989993447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9938315749168396, "epoch": 0.01334, "grad_norm": 0.09072951227426529, "kl": 0.2625075113028288, "learning_rate": 7.999376240225662e-06, "loss": -0.047, "step": 1334, "step_time": 15.35528473398881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.965517044067383, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.269527968019247, "epoch": 0.01335, "frac_reward_zero_std": 0.0, "grad_norm": 0.040904831141233444, "kl": 0.498602032661438, "learning_rate": 7.999375278779754e-06, "loss": -0.0641, "num_tokens": 33982505.0, "reward": 0.7910664081573486, "reward_std": 0.9037870168685913, "rewards/rollout_reward_func/mean": 0.7910664081573486, "rewards/rollout_reward_func/std": 0.9037869572639465, "sampling/importance_sampling_ratio/max": 1.127494215965271, "sampling/importance_sampling_ratio/mean": 0.7643019556999207, "sampling/importance_sampling_ratio/min": 0.0002923240535892546, "sampling/sampling_logp_difference/max": 2.140850067138672, "sampling/sampling_logp_difference/mean": 0.23837465047836304, "step": 1335, "step_time": 30.43810033495538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2653607875108719, "epoch": 0.01336, "grad_norm": 0.04038458690047264, "kl": 0.5009094029664993, "learning_rate": 7.999374316593522e-06, "loss": -0.0641, "step": 1336, "step_time": 15.601307307981187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.279999732971191, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2624786645174026, "epoch": 0.01337, "frac_reward_zero_std": 0.0, "grad_norm": 0.09875771403312683, "kl": 0.27643428184092045, "learning_rate": 7.999373353666963e-06, "loss": -0.0838, "num_tokens": 34048309.0, "reward": 0.29517272114753723, "reward_std": 0.815850019454956, "rewards/rollout_reward_func/mean": 0.29517272114753723, "rewards/rollout_reward_func/std": 0.815850019454956, "sampling/importance_sampling_ratio/max": 1.0667340755462646, "sampling/importance_sampling_ratio/mean": 0.5185269713401794, "sampling/importance_sampling_ratio/min": 1.450544431236267e-07, "sampling/sampling_logp_difference/max": 2.0512728691101074, "sampling/sampling_logp_difference/mean": 0.45129573345184326, "step": 1337, "step_time": 31.319706790993223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.24872986972332, "epoch": 0.01338, "grad_norm": 0.09578247368335724, "kl": 0.2753239031881094, "learning_rate": 7.999372390000081e-06, "loss": -0.0842, "step": 1338, "step_time": 14.702844980958616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.142857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2525327280163765, "epoch": 0.01339, "frac_reward_zero_std": 0.25, "grad_norm": 0.09471826255321503, "kl": 0.26835573092103004, "learning_rate": 7.999371425592876e-06, "loss": -0.0336, "num_tokens": 34100443.0, "reward": 0.5252145528793335, "reward_std": 0.755792498588562, "rewards/rollout_reward_func/mean": 0.5252145528793335, "rewards/rollout_reward_func/std": 0.7557924389839172, "sampling/importance_sampling_ratio/max": 1.0684558153152466, "sampling/importance_sampling_ratio/mean": 0.7909452319145203, "sampling/importance_sampling_ratio/min": 2.5870616809697822e-05, "sampling/sampling_logp_difference/max": 1.7188043594360352, "sampling/sampling_logp_difference/mean": 0.1900283396244049, "step": 1339, "step_time": 26.608203268027864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2328324122354388, "epoch": 0.0134, "grad_norm": 0.08359236270189285, "kl": 0.26262024603784084, "learning_rate": 7.999370460445345e-06, "loss": -0.034, "step": 1340, "step_time": 13.678609673981555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4973264327272773, "epoch": 0.01341, "frac_reward_zero_std": 0.25, "grad_norm": 0.08091923594474792, "kl": 0.3855843562632799, "learning_rate": 7.99936949455749e-06, "loss": -0.0292, "num_tokens": 34151700.0, "reward": 0.6699376106262207, "reward_std": 0.8102649450302124, "rewards/rollout_reward_func/mean": 0.6699376106262207, "rewards/rollout_reward_func/std": 0.8102649450302124, "sampling/importance_sampling_ratio/max": 1.0852316617965698, "sampling/importance_sampling_ratio/mean": 0.9030267596244812, "sampling/importance_sampling_ratio/min": 0.033904436975717545, "sampling/sampling_logp_difference/max": 2.070598602294922, "sampling/sampling_logp_difference/mean": 0.09010909497737885, "step": 1341, "step_time": 21.650530084007187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4886859077960253, "epoch": 0.01342, "grad_norm": 0.08435078710317612, "kl": 0.38966871425509453, "learning_rate": 7.99936852792931e-06, "loss": -0.0291, "step": 1342, "step_time": 12.046727023029234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.03125, "completions/mean_terminated_length": 4.300000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2249123752117157, "epoch": 0.01343, "frac_reward_zero_std": 0.0, "grad_norm": 0.0675322636961937, "kl": 0.47197586856782436, "learning_rate": 7.999367560560807e-06, "loss": -0.0715, "num_tokens": 34198202.0, "reward": 0.3516566753387451, "reward_std": 0.8116670250892639, "rewards/rollout_reward_func/mean": 0.3516566753387451, "rewards/rollout_reward_func/std": 0.8116669654846191, "sampling/importance_sampling_ratio/max": 1.2567353248596191, "sampling/importance_sampling_ratio/mean": 0.7996305227279663, "sampling/importance_sampling_ratio/min": 0.0004968672292307019, "sampling/sampling_logp_difference/max": 2.265407085418701, "sampling/sampling_logp_difference/mean": 0.2503744959831238, "step": 1343, "step_time": 25.728828699939186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2301451079547405, "epoch": 0.01344, "grad_norm": 0.06571515649557114, "kl": 0.47129728831350803, "learning_rate": 7.999366592451981e-06, "loss": -0.0717, "step": 1344, "step_time": 13.845567848999053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3078789431601763, "epoch": 0.01345, "frac_reward_zero_std": 0.0, "grad_norm": 0.052711453288793564, "kl": 0.40098158456385136, "learning_rate": 7.999365623602833e-06, "loss": -0.078, "num_tokens": 34255443.0, "reward": 0.39622288942337036, "reward_std": 0.7469474673271179, "rewards/rollout_reward_func/mean": 0.39622288942337036, "rewards/rollout_reward_func/std": 0.7469474077224731, "sampling/importance_sampling_ratio/max": 1.1279932260513306, "sampling/importance_sampling_ratio/mean": 0.8005266189575195, "sampling/importance_sampling_ratio/min": 8.346415256710316e-07, "sampling/sampling_logp_difference/max": 2.114030599594116, "sampling/sampling_logp_difference/mean": 0.26288679242134094, "step": 1345, "step_time": 27.165672478004126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.306811947375536, "epoch": 0.01346, "grad_norm": 0.042802680283784866, "kl": 0.39927135314792395, "learning_rate": 7.99936465401336e-06, "loss": -0.0781, "step": 1346, "step_time": 13.856341039005201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5568066947162151, "epoch": 0.01347, "frac_reward_zero_std": 0.25, "grad_norm": 0.07290297001600266, "kl": 0.3175596874207258, "learning_rate": 7.999363683683565e-06, "loss": -0.0422, "num_tokens": 34308030.0, "reward": 0.7446056008338928, "reward_std": 0.8979746699333191, "rewards/rollout_reward_func/mean": 0.7446056008338928, "rewards/rollout_reward_func/std": 0.8979747295379639, "sampling/importance_sampling_ratio/max": 1.2059181928634644, "sampling/importance_sampling_ratio/mean": 0.8208593130111694, "sampling/importance_sampling_ratio/min": 2.6533366614955867e-09, "sampling/sampling_logp_difference/max": 2.3303840160369873, "sampling/sampling_logp_difference/mean": 0.3652491569519043, "step": 1347, "step_time": 26.237626824964536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5566018433310091, "epoch": 0.01348, "grad_norm": 0.0757797360420227, "kl": 0.32279559783637524, "learning_rate": 7.999362712613448e-06, "loss": -0.0421, "step": 1348, "step_time": 13.678100220946362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.962963104248047, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.405680164694786, "epoch": 0.01349, "frac_reward_zero_std": 0.0, "grad_norm": 0.09474248439073563, "kl": 0.2668214663863182, "learning_rate": 7.99936174080301e-06, "loss": -0.0729, "num_tokens": 34365637.0, "reward": 0.46200671792030334, "reward_std": 0.8206571936607361, "rewards/rollout_reward_func/mean": 0.46200671792030334, "rewards/rollout_reward_func/std": 0.8206571340560913, "sampling/importance_sampling_ratio/max": 1.2496854066848755, "sampling/importance_sampling_ratio/mean": 0.7349423170089722, "sampling/importance_sampling_ratio/min": 5.793778541374195e-07, "sampling/sampling_logp_difference/max": 2.1780600547790527, "sampling/sampling_logp_difference/mean": 0.2783508598804474, "step": 1349, "step_time": 26.074117800948443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4177952148020267, "epoch": 0.0135, "grad_norm": 0.08780387789011002, "kl": 0.2652032673358917, "learning_rate": 7.999360768252248e-06, "loss": -0.0735, "step": 1350, "step_time": 13.276760164037114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005842391401529312, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005842391401529312, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8280859207734466, "epoch": 0.01351, "frac_reward_zero_std": 0.0, "grad_norm": 0.04754655808210373, "kl": 0.3763980567455292, "learning_rate": 7.999359794961164e-06, "loss": -0.0865, "num_tokens": 34423094.0, "reward": 0.25873154401779175, "reward_std": 0.863300085067749, "rewards/rollout_reward_func/mean": 0.25873154401779175, "rewards/rollout_reward_func/std": 0.863300085067749, "sampling/importance_sampling_ratio/max": 1.1385352611541748, "sampling/importance_sampling_ratio/mean": 0.5941179990768433, "sampling/importance_sampling_ratio/min": 6.996947922743857e-05, "sampling/sampling_logp_difference/max": 2.305311679840088, "sampling/sampling_logp_difference/mean": 0.3330577611923218, "step": 1351, "step_time": 27.716200465045404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.8316609906032681, "epoch": 0.01352, "grad_norm": 0.06142232194542885, "kl": 0.3717503324151039, "learning_rate": 7.99935882092976e-06, "loss": -0.0866, "step": 1352, "step_time": 14.099568810983328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.4375, "completions/mean_terminated_length": 4.064516067504883, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6581575525924563, "epoch": 0.01353, "frac_reward_zero_std": 0.5, "grad_norm": 0.08225518465042114, "kl": 0.30113016068935394, "learning_rate": 7.999357846158034e-06, "loss": -0.0252, "num_tokens": 34473419.0, "reward": 0.8377236127853394, "reward_std": 0.6851721405982971, "rewards/rollout_reward_func/mean": 0.8377236127853394, "rewards/rollout_reward_func/std": 0.6851721405982971, "sampling/importance_sampling_ratio/max": 1.6245139837265015, "sampling/importance_sampling_ratio/mean": 0.9877866506576538, "sampling/importance_sampling_ratio/min": 1.466726553189801e-05, "sampling/sampling_logp_difference/max": 1.7780917882919312, "sampling/sampling_logp_difference/mean": 0.13287952542304993, "step": 1353, "step_time": 29.921463965962175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.665152421221137, "epoch": 0.01354, "grad_norm": 0.08340542763471603, "kl": 0.30223094299435616, "learning_rate": 7.999356870645988e-06, "loss": -0.0251, "step": 1354, "step_time": 15.8696216539538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.09375, "completions/mean_terminated_length": 4.09375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3429355276748538, "epoch": 0.01355, "frac_reward_zero_std": 0.0, "grad_norm": 0.03906740993261337, "kl": 0.6993992310017347, "learning_rate": 7.99935589439362e-06, "loss": -0.0345, "num_tokens": 34520219.0, "reward": 0.8734912872314453, "reward_std": 0.5781834721565247, "rewards/rollout_reward_func/mean": 0.8734912872314453, "rewards/rollout_reward_func/std": 0.5781834125518799, "sampling/importance_sampling_ratio/max": 1.1026471853256226, "sampling/importance_sampling_ratio/mean": 0.9756478071212769, "sampling/importance_sampling_ratio/min": 0.037722907960414886, "sampling/sampling_logp_difference/max": 1.6596567630767822, "sampling/sampling_logp_difference/mean": 0.058587826788425446, "step": 1355, "step_time": 21.332793999026762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3429697174578905, "epoch": 0.01356, "grad_norm": 0.03702566400170326, "kl": 0.7013443615287542, "learning_rate": 7.999354917400932e-06, "loss": -0.0346, "step": 1356, "step_time": 12.102547218004474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.807692527770996, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.876377433538437, "epoch": 0.01357, "frac_reward_zero_std": 0.25, "grad_norm": 0.26287809014320374, "kl": 0.3449670560657978, "learning_rate": 7.999353939667924e-06, "loss": -0.0376, "num_tokens": 34573523.0, "reward": 0.40488508343696594, "reward_std": 0.954382061958313, "rewards/rollout_reward_func/mean": 0.40488508343696594, "rewards/rollout_reward_func/std": 0.9543821215629578, "sampling/importance_sampling_ratio/max": 1.277279257774353, "sampling/importance_sampling_ratio/mean": 0.6584165096282959, "sampling/importance_sampling_ratio/min": 2.3228032659972087e-05, "sampling/sampling_logp_difference/max": 1.8925328254699707, "sampling/sampling_logp_difference/mean": 0.32012367248535156, "step": 1357, "step_time": 29.549124387005577 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 1.881250336766243, "epoch": 0.01358, "grad_norm": 0.19949083030223846, "kl": 0.3350753244012594, "learning_rate": 7.999352961194598e-06, "loss": -0.0391, "step": 1358, "step_time": 14.954831919952994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 4.363636493682861, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.632174927741289, "epoch": 0.01359, "frac_reward_zero_std": 0.0, "grad_norm": 0.18779787421226501, "kl": 0.7186917997896671, "learning_rate": 7.99935198198095e-06, "loss": -0.0664, "num_tokens": 34617635.0, "reward": 0.6256412863731384, "reward_std": 0.9038971662521362, "rewards/rollout_reward_func/mean": 0.6256412863731384, "rewards/rollout_reward_func/std": 0.9038971662521362, "sampling/importance_sampling_ratio/max": 1.399121642112732, "sampling/importance_sampling_ratio/mean": 0.6045835018157959, "sampling/importance_sampling_ratio/min": 2.1424828577920607e-10, "sampling/sampling_logp_difference/max": 2.3866171836853027, "sampling/sampling_logp_difference/mean": 0.5604696869850159, "step": 1359, "step_time": 27.826975569973 }, { "clip_ratio/high_max": 0.01875000074505806, "clip_ratio/high_mean": 0.00937500037252903, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00937500037252903, "entropy": 2.62396951764822, "epoch": 0.0136, "grad_norm": 0.059212736785411835, "kl": 0.7431049756705761, "learning_rate": 7.999351002026981e-06, "loss": -0.0667, "step": 1360, "step_time": 14.301053640054306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0953008905053139, "epoch": 0.01361, "frac_reward_zero_std": 0.25, "grad_norm": 0.19990640878677368, "kl": 0.28337832540273666, "learning_rate": 7.999350021332696e-06, "loss": -0.0446, "num_tokens": 34676942.0, "reward": 0.3849119544029236, "reward_std": 0.7679306864738464, "rewards/rollout_reward_func/mean": 0.3849119544029236, "rewards/rollout_reward_func/std": 0.7679305672645569, "sampling/importance_sampling_ratio/max": 1.1499537229537964, "sampling/importance_sampling_ratio/mean": 0.8295060396194458, "sampling/importance_sampling_ratio/min": 0.0007428984390571713, "sampling/sampling_logp_difference/max": 1.7102575302124023, "sampling/sampling_logp_difference/mean": 0.19103756546974182, "step": 1361, "step_time": 28.631485047953902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1040762774646282, "epoch": 0.01362, "grad_norm": 0.1734914779663086, "kl": 0.2808042438700795, "learning_rate": 7.99934903989809e-06, "loss": -0.0452, "step": 1362, "step_time": 14.572878901934018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.0714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0039468351751566, "epoch": 0.01363, "frac_reward_zero_std": 0.0, "grad_norm": 0.09816308319568634, "kl": 0.35116790514439344, "learning_rate": 7.999348057723165e-06, "loss": -0.075, "num_tokens": 34739465.0, "reward": 0.9839681386947632, "reward_std": 0.7364173531532288, "rewards/rollout_reward_func/mean": 0.9839681386947632, "rewards/rollout_reward_func/std": 0.7364173531532288, "sampling/importance_sampling_ratio/max": 1.102064609527588, "sampling/importance_sampling_ratio/mean": 0.8224169015884399, "sampling/importance_sampling_ratio/min": 0.00011314508446957916, "sampling/sampling_logp_difference/max": 1.866309642791748, "sampling/sampling_logp_difference/mean": 0.24328859150409698, "step": 1363, "step_time": 30.709084595029708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0163369979709387, "epoch": 0.01364, "grad_norm": 0.09567063301801682, "kl": 0.3458610698580742, "learning_rate": 7.999347074807922e-06, "loss": -0.0749, "step": 1364, "step_time": 16.086554794950644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8440108112990856, "epoch": 0.01365, "frac_reward_zero_std": 0.25, "grad_norm": 0.03811172768473625, "kl": 0.3335914444178343, "learning_rate": 7.99934609115236e-06, "loss": -0.0649, "num_tokens": 34800392.0, "reward": 0.5993302464485168, "reward_std": 0.7402697801589966, "rewards/rollout_reward_func/mean": 0.5993302464485168, "rewards/rollout_reward_func/std": 0.7402697801589966, "sampling/importance_sampling_ratio/max": 1.161706566810608, "sampling/importance_sampling_ratio/mean": 0.7110016942024231, "sampling/importance_sampling_ratio/min": 2.031917301081876e-09, "sampling/sampling_logp_difference/max": 2.9492545127868652, "sampling/sampling_logp_difference/mean": 0.4509164094924927, "step": 1365, "step_time": 33.93155921905418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8528565634042025, "epoch": 0.01366, "grad_norm": 0.040838148444890976, "kl": 0.32932849787175655, "learning_rate": 7.99934510675648e-06, "loss": -0.0649, "step": 1366, "step_time": 16.410518868040526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.516129016876221, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7337399078533053, "epoch": 0.01367, "frac_reward_zero_std": 0.25, "grad_norm": 0.06513308733701706, "kl": 0.3263364043086767, "learning_rate": 7.999344121620284e-06, "loss": -0.0333, "num_tokens": 34847937.0, "reward": 1.112210988998413, "reward_std": 0.7369892001152039, "rewards/rollout_reward_func/mean": 1.112210988998413, "rewards/rollout_reward_func/std": 0.7369891405105591, "sampling/importance_sampling_ratio/max": 1.1612106561660767, "sampling/importance_sampling_ratio/mean": 0.9429747462272644, "sampling/importance_sampling_ratio/min": 7.632800407009199e-05, "sampling/sampling_logp_difference/max": 1.4623627662658691, "sampling/sampling_logp_difference/mean": 0.13913056254386902, "step": 1367, "step_time": 22.858617182006128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7419811263680458, "epoch": 0.01368, "grad_norm": 0.06862494349479675, "kl": 0.317373501136899, "learning_rate": 7.99934313574377e-06, "loss": -0.0334, "step": 1368, "step_time": 12.024345341022126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.793103218078613, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5457997769117355, "epoch": 0.01369, "frac_reward_zero_std": 0.0, "grad_norm": 0.027520842850208282, "kl": 0.560224000364542, "learning_rate": 7.999342149126936e-06, "loss": -0.0798, "num_tokens": 34897843.0, "reward": 0.6473278403282166, "reward_std": 0.8131614923477173, "rewards/rollout_reward_func/mean": 0.6473278403282166, "rewards/rollout_reward_func/std": 0.8131614923477173, "sampling/importance_sampling_ratio/max": 1.1470723152160645, "sampling/importance_sampling_ratio/mean": 0.6911399364471436, "sampling/importance_sampling_ratio/min": 1.5326810171245597e-05, "sampling/sampling_logp_difference/max": 1.859775185585022, "sampling/sampling_logp_difference/mean": 0.32477623224258423, "step": 1369, "step_time": 25.727700364979682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5537260565906763, "epoch": 0.0137, "grad_norm": 0.029064906761050224, "kl": 0.5543944537639618, "learning_rate": 7.999341161769788e-06, "loss": -0.0799, "step": 1370, "step_time": 12.950548837980023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.464285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.52324890345335, "epoch": 0.01371, "frac_reward_zero_std": 0.0, "grad_norm": 0.0599401481449604, "kl": 0.5211282148957253, "learning_rate": 7.999340173672321e-06, "loss": -0.0816, "num_tokens": 34953506.0, "reward": 0.9768867492675781, "reward_std": 0.733301043510437, "rewards/rollout_reward_func/mean": 0.9768867492675781, "rewards/rollout_reward_func/std": 0.7333009243011475, "sampling/importance_sampling_ratio/max": 1.1604866981506348, "sampling/importance_sampling_ratio/mean": 0.7216206192970276, "sampling/importance_sampling_ratio/min": 8.108155924446692e-08, "sampling/sampling_logp_difference/max": 2.0753705501556396, "sampling/sampling_logp_difference/mean": 0.30800020694732666, "step": 1371, "step_time": 24.2378361330193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.54099478200078, "epoch": 0.01372, "grad_norm": 0.06012701615691185, "kl": 0.5116746537387371, "learning_rate": 7.999339184834539e-06, "loss": -0.0816, "step": 1372, "step_time": 12.226050413999474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.440000057220459, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2291674949228764, "epoch": 0.01373, "frac_reward_zero_std": 0.0, "grad_norm": 0.10221324861049652, "kl": 0.3590168720111251, "learning_rate": 7.99933819525644e-06, "loss": -0.0791, "num_tokens": 35005897.0, "reward": 0.38039928674697876, "reward_std": 0.8644630908966064, "rewards/rollout_reward_func/mean": 0.38039928674697876, "rewards/rollout_reward_func/std": 0.8644630312919617, "sampling/importance_sampling_ratio/max": 1.1467446088790894, "sampling/importance_sampling_ratio/mean": 0.651180624961853, "sampling/importance_sampling_ratio/min": 1.6196390788536519e-06, "sampling/sampling_logp_difference/max": 2.1137096881866455, "sampling/sampling_logp_difference/mean": 0.4231947958469391, "step": 1373, "step_time": 27.72822697099764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.225400425493717, "epoch": 0.01374, "grad_norm": 0.10505405813455582, "kl": 0.34726487286388874, "learning_rate": 7.999337204938023e-06, "loss": -0.0796, "step": 1374, "step_time": 13.729140871000709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.962963104248047, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8045903258025646, "epoch": 0.01375, "frac_reward_zero_std": 0.25, "grad_norm": 0.09468740224838257, "kl": 0.29686637967824936, "learning_rate": 7.999336213879294e-06, "loss": -0.0619, "num_tokens": 35054350.0, "reward": 0.6695054769515991, "reward_std": 0.8494643568992615, "rewards/rollout_reward_func/mean": 0.6695054769515991, "rewards/rollout_reward_func/std": 0.8494642972946167, "sampling/importance_sampling_ratio/max": 1.0972682237625122, "sampling/importance_sampling_ratio/mean": 0.7231082916259766, "sampling/importance_sampling_ratio/min": 2.0913383025344956e-08, "sampling/sampling_logp_difference/max": 2.745051383972168, "sampling/sampling_logp_difference/mean": 0.4096793532371521, "step": 1375, "step_time": 23.00538718898315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7960345540195704, "epoch": 0.01376, "grad_norm": 0.08384378999471664, "kl": 0.3004696574062109, "learning_rate": 7.999335222080245e-06, "loss": -0.0622, "step": 1376, "step_time": 11.163775485008955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5761376153677702, "epoch": 0.01377, "frac_reward_zero_std": 0.0, "grad_norm": 0.09195708483457565, "kl": 0.26050349697470665, "learning_rate": 7.999334229540883e-06, "loss": -0.07, "num_tokens": 35104495.0, "reward": 0.45398086309432983, "reward_std": 0.9335454106330872, "rewards/rollout_reward_func/mean": 0.45398086309432983, "rewards/rollout_reward_func/std": 0.9335453510284424, "sampling/importance_sampling_ratio/max": 1.10380220413208, "sampling/importance_sampling_ratio/mean": 0.7646336555480957, "sampling/importance_sampling_ratio/min": 1.555140300979474e-07, "sampling/sampling_logp_difference/max": 2.1800591945648193, "sampling/sampling_logp_difference/mean": 0.33013707399368286, "step": 1377, "step_time": 26.96946714396472 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008399209938943386, "entropy": 1.5473023187369108, "epoch": 0.01378, "grad_norm": 0.08028696477413177, "kl": 0.2660878337919712, "learning_rate": 7.999333236261206e-06, "loss": -0.0705, "step": 1378, "step_time": 13.427102332032518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.799999713897705, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7936011590063572, "epoch": 0.01379, "frac_reward_zero_std": 0.0, "grad_norm": 0.020334266126155853, "kl": 0.3716081287711859, "learning_rate": 7.999332242241213e-06, "loss": -0.096, "num_tokens": 35175938.0, "reward": 0.5185717344284058, "reward_std": 0.860536515712738, "rewards/rollout_reward_func/mean": 0.5185717344284058, "rewards/rollout_reward_func/std": 0.8605364561080933, "sampling/importance_sampling_ratio/max": 1.079215407371521, "sampling/importance_sampling_ratio/mean": 0.6284984350204468, "sampling/importance_sampling_ratio/min": 0.00014220592856872827, "sampling/sampling_logp_difference/max": 1.891507863998413, "sampling/sampling_logp_difference/mean": 0.3007776141166687, "step": 1379, "step_time": 34.77011570497416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7890705168247223, "epoch": 0.0138, "grad_norm": 0.018600227311253548, "kl": 0.3749254709109664, "learning_rate": 7.999331247480907e-06, "loss": -0.096, "step": 1380, "step_time": 17.0806719379907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3656926127150655, "epoch": 0.01381, "frac_reward_zero_std": 0.0, "grad_norm": 0.0485294945538044, "kl": 0.44103845208883286, "learning_rate": 7.999330251980286e-06, "loss": -0.0806, "num_tokens": 35222575.0, "reward": 0.3667454719543457, "reward_std": 0.7910425066947937, "rewards/rollout_reward_func/mean": 0.3667454719543457, "rewards/rollout_reward_func/std": 0.7910425066947937, "sampling/importance_sampling_ratio/max": 1.0829747915267944, "sampling/importance_sampling_ratio/mean": 0.7371652126312256, "sampling/importance_sampling_ratio/min": 9.708019933896139e-05, "sampling/sampling_logp_difference/max": 2.0426125526428223, "sampling/sampling_logp_difference/mean": 0.28675439953804016, "step": 1381, "step_time": 20.4859541170008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3607004582881927, "epoch": 0.01382, "grad_norm": 0.04705165699124336, "kl": 0.4472053274512291, "learning_rate": 7.999329255739351e-06, "loss": -0.0808, "step": 1382, "step_time": 11.658438622049289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.133333683013916, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7737659299746156, "epoch": 0.01383, "frac_reward_zero_std": 0.0, "grad_norm": 0.10046830028295517, "kl": 0.35220332257449627, "learning_rate": 7.999328258758103e-06, "loss": -0.0398, "num_tokens": 35280781.0, "reward": 0.538854718208313, "reward_std": 0.7525845766067505, "rewards/rollout_reward_func/mean": 0.538854718208313, "rewards/rollout_reward_func/std": 0.7525846362113953, "sampling/importance_sampling_ratio/max": 1.2962130308151245, "sampling/importance_sampling_ratio/mean": 0.9055094718933105, "sampling/importance_sampling_ratio/min": 0.004035828169435263, "sampling/sampling_logp_difference/max": 1.7493417263031006, "sampling/sampling_logp_difference/mean": 0.13756176829338074, "step": 1383, "step_time": 26.66838428704068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7617215258069336, "epoch": 0.01384, "grad_norm": 0.09657532721757889, "kl": 0.3566909171640873, "learning_rate": 7.999327261036539e-06, "loss": -0.0401, "step": 1384, "step_time": 14.411309527989943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.96875, "completions/mean_terminated_length": 4.233333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7273833542130888, "epoch": 0.01385, "frac_reward_zero_std": 0.25, "grad_norm": 0.09190559387207031, "kl": 0.43163200095295906, "learning_rate": 7.999326262574661e-06, "loss": -0.0435, "num_tokens": 35334999.0, "reward": 0.8142796158790588, "reward_std": 0.7594788074493408, "rewards/rollout_reward_func/mean": 0.8142796158790588, "rewards/rollout_reward_func/std": 0.7594786882400513, "sampling/importance_sampling_ratio/max": 1.0584264993667603, "sampling/importance_sampling_ratio/mean": 0.86292564868927, "sampling/importance_sampling_ratio/min": 1.0344985639676452e-05, "sampling/sampling_logp_difference/max": 1.9127198457717896, "sampling/sampling_logp_difference/mean": 0.16970650851726532, "step": 1385, "step_time": 26.072019720071694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7128428432624787, "epoch": 0.01386, "grad_norm": 0.08263710141181946, "kl": 0.43196864053606987, "learning_rate": 7.999325263372472e-06, "loss": -0.0432, "step": 1386, "step_time": 13.740151464007795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4457139996811748, "epoch": 0.01387, "frac_reward_zero_std": 0.0, "grad_norm": 0.013805896043777466, "kl": 0.3078836742788553, "learning_rate": 7.999324263429971e-06, "loss": -0.0707, "num_tokens": 35396819.0, "reward": 0.7875891923904419, "reward_std": 0.790448009967804, "rewards/rollout_reward_func/mean": 0.7875891923904419, "rewards/rollout_reward_func/std": 0.790448009967804, "sampling/importance_sampling_ratio/max": 1.2815310955047607, "sampling/importance_sampling_ratio/mean": 0.7878780364990234, "sampling/importance_sampling_ratio/min": 3.1601646242052084e-06, "sampling/sampling_logp_difference/max": 1.9268162250518799, "sampling/sampling_logp_difference/mean": 0.2563178539276123, "step": 1387, "step_time": 30.249954637954943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4427978610619903, "epoch": 0.01388, "grad_norm": 0.013466846197843552, "kl": 0.30230986326932907, "learning_rate": 7.999323262747155e-06, "loss": -0.0707, "step": 1388, "step_time": 13.687333960930118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0826070997864008, "epoch": 0.01389, "frac_reward_zero_std": 0.25, "grad_norm": 0.09260120987892151, "kl": 0.526747640222311, "learning_rate": 7.999322261324026e-06, "loss": -0.0722, "num_tokens": 35442512.0, "reward": 1.2492878437042236, "reward_std": 0.4376474916934967, "rewards/rollout_reward_func/mean": 1.2492878437042236, "rewards/rollout_reward_func/std": 0.4376474618911743, "sampling/importance_sampling_ratio/max": 1.1302419900894165, "sampling/importance_sampling_ratio/mean": 0.8351147174835205, "sampling/importance_sampling_ratio/min": 1.2302446561474056e-10, "sampling/sampling_logp_difference/max": 2.460357666015625, "sampling/sampling_logp_difference/mean": 0.3471858501434326, "step": 1389, "step_time": 18.72930404896033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.079925718717277, "epoch": 0.0139, "grad_norm": 0.07402119040489197, "kl": 0.5228621736168861, "learning_rate": 7.999321259160586e-06, "loss": -0.0725, "step": 1390, "step_time": 11.002779495960567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2957439091987908, "epoch": 0.01391, "frac_reward_zero_std": 0.25, "grad_norm": 0.019141392782330513, "kl": 0.3742883480153978, "learning_rate": 7.999320256256836e-06, "loss": -0.0501, "num_tokens": 35496337.0, "reward": 0.14391428232192993, "reward_std": 0.7703215479850769, "rewards/rollout_reward_func/mean": 0.14391428232192993, "rewards/rollout_reward_func/std": 0.7703215479850769, "sampling/importance_sampling_ratio/max": 1.0711182355880737, "sampling/importance_sampling_ratio/mean": 0.8058792352676392, "sampling/importance_sampling_ratio/min": 2.206377757829614e-06, "sampling/sampling_logp_difference/max": 1.9322056770324707, "sampling/sampling_logp_difference/mean": 0.2718810737133026, "step": 1391, "step_time": 26.5108745819889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.295164375565946, "epoch": 0.01392, "grad_norm": 0.019099578261375427, "kl": 0.3710443479940295, "learning_rate": 7.999319252612772e-06, "loss": -0.0501, "step": 1392, "step_time": 13.735422226978699 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 5.633333683013916, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7832934353500605, "epoch": 0.01393, "frac_reward_zero_std": 0.0, "grad_norm": 0.02126901037991047, "kl": 0.4172071013599634, "learning_rate": 7.999318248228396e-06, "loss": -0.0837, "num_tokens": 35560775.0, "reward": 0.4422709345817566, "reward_std": 0.7464762926101685, "rewards/rollout_reward_func/mean": 0.4422709345817566, "rewards/rollout_reward_func/std": 0.7464763522148132, "sampling/importance_sampling_ratio/max": 1.0502108335494995, "sampling/importance_sampling_ratio/mean": 0.6092166304588318, "sampling/importance_sampling_ratio/min": 5.149991011421662e-06, "sampling/sampling_logp_difference/max": 2.3421692848205566, "sampling/sampling_logp_difference/mean": 0.3681500554084778, "step": 1393, "step_time": 29.9169483819569 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016903409268707037, "entropy": 1.7791381794959307, "epoch": 0.01394, "grad_norm": 0.04083037003874779, "kl": 0.4089391063898802, "learning_rate": 7.99931724310371e-06, "loss": -0.0836, "step": 1394, "step_time": 16.530537276004907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.40625, "completions/mean_terminated_length": 4.40625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5216386313550174, "epoch": 0.01395, "frac_reward_zero_std": 0.25, "grad_norm": 0.11927171051502228, "kl": 0.3232865445315838, "learning_rate": 7.999316237238714e-06, "loss": -0.0301, "num_tokens": 35610573.0, "reward": 1.2343683242797852, "reward_std": 0.43440282344818115, "rewards/rollout_reward_func/mean": 1.2343683242797852, "rewards/rollout_reward_func/std": 0.43440282344818115, "sampling/importance_sampling_ratio/max": 1.3075144290924072, "sampling/importance_sampling_ratio/mean": 0.9700829982757568, "sampling/importance_sampling_ratio/min": 2.0582024262694176e-06, "sampling/sampling_logp_difference/max": 2.2197604179382324, "sampling/sampling_logp_difference/mean": 0.12964387238025665, "step": 1395, "step_time": 21.919821987015894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5116614773869514, "epoch": 0.01396, "grad_norm": 0.10478872805833817, "kl": 0.3287041410803795, "learning_rate": 7.999315230633406e-06, "loss": -0.0304, "step": 1396, "step_time": 12.25813829805702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.41063603200018406, "epoch": 0.01397, "frac_reward_zero_std": 0.25, "grad_norm": 0.08908209204673767, "kl": 0.4142598509788513, "learning_rate": 7.999314223287788e-06, "loss": -0.0345, "num_tokens": 35663115.0, "reward": 0.5326617360115051, "reward_std": 0.51310795545578, "rewards/rollout_reward_func/mean": 0.5326617360115051, "rewards/rollout_reward_func/std": 0.5131080150604248, "sampling/importance_sampling_ratio/max": 1.195500135421753, "sampling/importance_sampling_ratio/mean": 0.9555395841598511, "sampling/importance_sampling_ratio/min": 0.0037860493175685406, "sampling/sampling_logp_difference/max": 1.6612883806228638, "sampling/sampling_logp_difference/mean": 0.0768526941537857, "step": 1397, "step_time": 21.976110216026427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40270035481080413, "epoch": 0.01398, "grad_norm": 0.07725430279970169, "kl": 0.4151536598801613, "learning_rate": 7.99931321520186e-06, "loss": -0.0348, "step": 1398, "step_time": 11.236306353006512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.8275861740112305, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.221391617320478, "epoch": 0.01399, "frac_reward_zero_std": 0.25, "grad_norm": 0.02731487527489662, "kl": 0.39241168461740017, "learning_rate": 7.999312206375621e-06, "loss": -0.0564, "num_tokens": 35726799.0, "reward": 0.3547411561012268, "reward_std": 0.6173102259635925, "rewards/rollout_reward_func/mean": 0.3547411561012268, "rewards/rollout_reward_func/std": 0.6173102259635925, "sampling/importance_sampling_ratio/max": 1.2124131917953491, "sampling/importance_sampling_ratio/mean": 0.81743323802948, "sampling/importance_sampling_ratio/min": 1.3026166016061325e-05, "sampling/sampling_logp_difference/max": 1.9119707345962524, "sampling/sampling_logp_difference/mean": 0.2832220196723938, "step": 1399, "step_time": 31.567691041971557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2175419596023858, "epoch": 0.014, "grad_norm": 0.027829347178339958, "kl": 0.39917914383113384, "learning_rate": 7.999311196809072e-06, "loss": -0.0564, "step": 1400, "step_time": 16.059956464974675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.47573342686519027, "epoch": 0.01401, "frac_reward_zero_std": 0.5, "grad_norm": 0.03874115273356438, "kl": 0.32431914657354355, "learning_rate": 7.999310186502215e-06, "loss": -0.0295, "num_tokens": 35765673.0, "reward": 1.3475522994995117, "reward_std": 0.4106653034687042, "rewards/rollout_reward_func/mean": 1.3475522994995117, "rewards/rollout_reward_func/std": 0.41066527366638184, "sampling/importance_sampling_ratio/max": 1.071578860282898, "sampling/importance_sampling_ratio/mean": 0.9558337926864624, "sampling/importance_sampling_ratio/min": 2.6398372483527055e-06, "sampling/sampling_logp_difference/max": 2.06243634223938, "sampling/sampling_logp_difference/mean": 0.15891429781913757, "step": 1401, "step_time": 23.140063000988448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 0.47619440220296383, "epoch": 0.01402, "grad_norm": 0.04484934359788895, "kl": 0.3378113992512226, "learning_rate": 7.999309175455048e-06, "loss": -0.0294, "step": 1402, "step_time": 13.619105148041854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.03125, "completions/mean_terminated_length": 4.300000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8056326285004616, "epoch": 0.01403, "frac_reward_zero_std": 0.5, "grad_norm": 0.060542572289705276, "kl": 0.32003965973854065, "learning_rate": 7.999308163667572e-06, "loss": -0.0305, "num_tokens": 35814048.0, "reward": 0.9680665731430054, "reward_std": 0.6583076119422913, "rewards/rollout_reward_func/mean": 0.9680665731430054, "rewards/rollout_reward_func/std": 0.6583075523376465, "sampling/importance_sampling_ratio/max": 1.3823403120040894, "sampling/importance_sampling_ratio/mean": 0.8368013501167297, "sampling/importance_sampling_ratio/min": 0.0002878213708754629, "sampling/sampling_logp_difference/max": 1.6087076663970947, "sampling/sampling_logp_difference/mean": 0.17387479543685913, "step": 1403, "step_time": 25.18220414602547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8053773916326463, "epoch": 0.01404, "grad_norm": 0.05718773975968361, "kl": 0.3163163289427757, "learning_rate": 7.999307151139788e-06, "loss": -0.0305, "step": 1404, "step_time": 14.290870717959478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2689425740391016, "epoch": 0.01405, "frac_reward_zero_std": 0.0, "grad_norm": 0.1665250062942505, "kl": 1.0667317025363445, "learning_rate": 7.999306137871695e-06, "loss": -0.0597, "num_tokens": 35865719.0, "reward": 0.8689560890197754, "reward_std": 0.6647321581840515, "rewards/rollout_reward_func/mean": 0.8689560890197754, "rewards/rollout_reward_func/std": 0.6647321581840515, "sampling/importance_sampling_ratio/max": 1.0839017629623413, "sampling/importance_sampling_ratio/mean": 0.808835506439209, "sampling/importance_sampling_ratio/min": 1.3630417683430096e-08, "sampling/sampling_logp_difference/max": 2.5406887531280518, "sampling/sampling_logp_difference/mean": 0.2900828719139099, "step": 1405, "step_time": 26.071939379966352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.2912393403239548, "epoch": 0.01406, "grad_norm": 0.11280004680156708, "kl": 0.9892446398735046, "learning_rate": 7.999305123863293e-06, "loss": -0.0602, "step": 1406, "step_time": 14.112890937016346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 5.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.518584132194519, "epoch": 0.01407, "frac_reward_zero_std": 0.25, "grad_norm": 0.03549351915717125, "kl": 0.34806588292121887, "learning_rate": 7.999304109114585e-06, "loss": -0.0381, "num_tokens": 35921457.0, "reward": 0.3644520044326782, "reward_std": 0.7833212018013, "rewards/rollout_reward_func/mean": 0.3644520044326782, "rewards/rollout_reward_func/std": 0.7833212018013, "sampling/importance_sampling_ratio/max": 1.2402691841125488, "sampling/importance_sampling_ratio/mean": 0.7454935908317566, "sampling/importance_sampling_ratio/min": 2.9833148801117204e-05, "sampling/sampling_logp_difference/max": 2.671518564224243, "sampling/sampling_logp_difference/mean": 0.3188873827457428, "step": 1407, "step_time": 26.94360813999083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "entropy": 1.5542049799114466, "epoch": 0.01408, "grad_norm": 0.03912648186087608, "kl": 0.33470089361071587, "learning_rate": 7.999303093625568e-06, "loss": -0.0382, "step": 1408, "step_time": 13.662304739031242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.655172348022461, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2340928330086172, "epoch": 0.01409, "frac_reward_zero_std": 0.0, "grad_norm": 0.03183445706963539, "kl": 0.35654917918145657, "learning_rate": 7.999302077396243e-06, "loss": -0.0752, "num_tokens": 35973379.0, "reward": 1.0257625579833984, "reward_std": 0.6029021739959717, "rewards/rollout_reward_func/mean": 1.0257625579833984, "rewards/rollout_reward_func/std": 0.6029021739959717, "sampling/importance_sampling_ratio/max": 1.2336188554763794, "sampling/importance_sampling_ratio/mean": 0.8448655605316162, "sampling/importance_sampling_ratio/min": 8.957206532045348e-09, "sampling/sampling_logp_difference/max": 2.695772647857666, "sampling/sampling_logp_difference/mean": 0.3396318554878235, "step": 1409, "step_time": 26.890538808947895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2418456841260195, "epoch": 0.0141, "grad_norm": 0.032299723476171494, "kl": 0.34883230924606323, "learning_rate": 7.999301060426611e-06, "loss": -0.0754, "step": 1410, "step_time": 13.74809586603078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.21875, "completions/mean_terminated_length": 4.21875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.28163318894803524, "epoch": 0.01411, "frac_reward_zero_std": 0.25, "grad_norm": 0.022264858707785606, "kl": 0.30828049033880234, "learning_rate": 7.999300042716673e-06, "loss": -0.0167, "num_tokens": 36022199.0, "reward": 1.2026551961898804, "reward_std": 0.3949196934700012, "rewards/rollout_reward_func/mean": 1.2026551961898804, "rewards/rollout_reward_func/std": 0.39491966366767883, "sampling/importance_sampling_ratio/max": 1.157022476196289, "sampling/importance_sampling_ratio/mean": 0.9729282855987549, "sampling/importance_sampling_ratio/min": 0.015228080563247204, "sampling/sampling_logp_difference/max": 1.3564099073410034, "sampling/sampling_logp_difference/mean": 0.056764233857393265, "step": 1411, "step_time": 22.011057020979933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2873782431706786, "epoch": 0.01412, "grad_norm": 0.02500118687748909, "kl": 0.3098116312175989, "learning_rate": 7.999299024266429e-06, "loss": -0.0166, "step": 1412, "step_time": 12.146240673988359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.862069129943848, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.209274996072054, "epoch": 0.01413, "frac_reward_zero_std": 0.0, "grad_norm": 0.03922645375132561, "kl": 0.44673795625567436, "learning_rate": 7.999298005075876e-06, "loss": -0.0658, "num_tokens": 36070009.0, "reward": 1.160778522491455, "reward_std": 0.6842161417007446, "rewards/rollout_reward_func/mean": 1.160778522491455, "rewards/rollout_reward_func/std": 0.6842161417007446, "sampling/importance_sampling_ratio/max": 1.0910714864730835, "sampling/importance_sampling_ratio/mean": 0.8208309412002563, "sampling/importance_sampling_ratio/min": 2.39625336462268e-07, "sampling/sampling_logp_difference/max": 2.359731674194336, "sampling/sampling_logp_difference/mean": 0.2653631567955017, "step": 1413, "step_time": 26.825428503012517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2166306134313345, "epoch": 0.01414, "grad_norm": 0.03908860310912132, "kl": 0.44519426114857197, "learning_rate": 7.999296985145019e-06, "loss": -0.0658, "step": 1414, "step_time": 14.833753910032101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8545445129275322, "epoch": 0.01415, "frac_reward_zero_std": 0.0, "grad_norm": 0.15150092542171478, "kl": 0.32601819932460785, "learning_rate": 7.999295964473855e-06, "loss": -0.0491, "num_tokens": 36117264.0, "reward": 0.22434143722057343, "reward_std": 0.8813360929489136, "rewards/rollout_reward_func/mean": 0.22434143722057343, "rewards/rollout_reward_func/std": 0.8813360929489136, "sampling/importance_sampling_ratio/max": 1.0757455825805664, "sampling/importance_sampling_ratio/mean": 0.6846246719360352, "sampling/importance_sampling_ratio/min": 1.2115256575739863e-09, "sampling/sampling_logp_difference/max": 2.461181640625, "sampling/sampling_logp_difference/mean": 0.4145846962928772, "step": 1415, "step_time": 25.449184765020618 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021146617364138365, "entropy": 1.8598687537014484, "epoch": 0.01416, "grad_norm": 0.08428321033716202, "kl": 0.3253621142357588, "learning_rate": 7.999294943062385e-06, "loss": -0.0497, "step": 1416, "step_time": 14.027697903977241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 5.758620738983154, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9180726800113916, "epoch": 0.01417, "frac_reward_zero_std": 0.0, "grad_norm": 0.025521229952573776, "kl": 0.47982707619667053, "learning_rate": 7.999293920910611e-06, "loss": -0.0742, "num_tokens": 36160337.0, "reward": 0.8695602416992188, "reward_std": 0.7387232184410095, "rewards/rollout_reward_func/mean": 0.8695602416992188, "rewards/rollout_reward_func/std": 0.7387232780456543, "sampling/importance_sampling_ratio/max": 1.481210470199585, "sampling/importance_sampling_ratio/mean": 0.6983410120010376, "sampling/importance_sampling_ratio/min": 7.639921584257081e-09, "sampling/sampling_logp_difference/max": 2.5307016372680664, "sampling/sampling_logp_difference/mean": 0.4702351987361908, "step": 1417, "step_time": 25.41465739806881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9213948408141732, "epoch": 0.01418, "grad_norm": 0.026764895766973495, "kl": 0.4785483069717884, "learning_rate": 7.999292898018531e-06, "loss": -0.0742, "step": 1418, "step_time": 13.837605167995207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.40625, "completions/mean_terminated_length": 4.40625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8322471939027309, "epoch": 0.01419, "frac_reward_zero_std": 0.25, "grad_norm": 0.07109906524419785, "kl": 0.3898061364889145, "learning_rate": 7.999291874386145e-06, "loss": -0.0334, "num_tokens": 36208133.0, "reward": 0.9667227268218994, "reward_std": 0.6246141195297241, "rewards/rollout_reward_func/mean": 0.9667227268218994, "rewards/rollout_reward_func/std": 0.6246141195297241, "sampling/importance_sampling_ratio/max": 1.4697731733322144, "sampling/importance_sampling_ratio/mean": 0.8871444463729858, "sampling/importance_sampling_ratio/min": 1.2357061677903403e-05, "sampling/sampling_logp_difference/max": 2.107732057571411, "sampling/sampling_logp_difference/mean": 0.17645111680030823, "step": 1419, "step_time": 22.548611679987516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8445524675771594, "epoch": 0.0142, "grad_norm": 0.07210420817136765, "kl": 0.38790377229452133, "learning_rate": 7.999290850013455e-06, "loss": -0.0334, "step": 1420, "step_time": 11.974914624996018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 5.259259223937988, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.83354371227324, "epoch": 0.01421, "frac_reward_zero_std": 0.0, "grad_norm": 0.1515181064605713, "kl": 0.4147801399230957, "learning_rate": 7.999289824900462e-06, "loss": -0.0818, "num_tokens": 36268809.0, "reward": 0.6386974453926086, "reward_std": 0.7934721112251282, "rewards/rollout_reward_func/mean": 0.6386974453926086, "rewards/rollout_reward_func/std": 0.7934720516204834, "sampling/importance_sampling_ratio/max": 1.2648813724517822, "sampling/importance_sampling_ratio/mean": 0.6997456550598145, "sampling/importance_sampling_ratio/min": 5.283912685172254e-08, "sampling/sampling_logp_difference/max": 2.196129322052002, "sampling/sampling_logp_difference/mean": 0.42257946729660034, "step": 1421, "step_time": 31.230348863027757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8238954972475767, "epoch": 0.01422, "grad_norm": 0.14546918869018555, "kl": 0.4041283242404461, "learning_rate": 7.999288799047162e-06, "loss": -0.082, "step": 1422, "step_time": 16.081100863026222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.75, "completions/mean_terminated_length": 4.615384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.639457419514656, "epoch": 0.01423, "frac_reward_zero_std": 0.0, "grad_norm": 0.056276511400938034, "kl": 0.3215017132461071, "learning_rate": 7.99928777245356e-06, "loss": -0.0728, "num_tokens": 36325707.0, "reward": 0.6170017719268799, "reward_std": 0.8482890725135803, "rewards/rollout_reward_func/mean": 0.6170017719268799, "rewards/rollout_reward_func/std": 0.8482890129089355, "sampling/importance_sampling_ratio/max": 1.1447080373764038, "sampling/importance_sampling_ratio/mean": 0.7637451887130737, "sampling/importance_sampling_ratio/min": 9.51757561651334e-10, "sampling/sampling_logp_difference/max": 2.376534938812256, "sampling/sampling_logp_difference/mean": 0.33462241291999817, "step": 1423, "step_time": 26.86032288102433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.6298057287931442, "epoch": 0.01424, "grad_norm": 0.04661174118518829, "kl": 0.3372476398944855, "learning_rate": 7.999286745119656e-06, "loss": -0.0731, "step": 1424, "step_time": 13.264841536030872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.5625, "completions/mean_terminated_length": 4.193548202514648, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5793335931375623, "epoch": 0.01425, "frac_reward_zero_std": 0.25, "grad_norm": 0.01913541927933693, "kl": 0.35523335449397564, "learning_rate": 7.999285717045445e-06, "loss": -0.042, "num_tokens": 36368792.0, "reward": 1.0804468393325806, "reward_std": 0.6569802165031433, "rewards/rollout_reward_func/mean": 1.0804468393325806, "rewards/rollout_reward_func/std": 0.6569802761077881, "sampling/importance_sampling_ratio/max": 1.1319884061813354, "sampling/importance_sampling_ratio/mean": 0.9437359571456909, "sampling/importance_sampling_ratio/min": 1.7748779113091473e-09, "sampling/sampling_logp_difference/max": 3.228145122528076, "sampling/sampling_logp_difference/mean": 0.18826928734779358, "step": 1425, "step_time": 17.972038774983957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5678848214447498, "epoch": 0.01426, "grad_norm": 0.015202982351183891, "kl": 0.3645682167261839, "learning_rate": 7.999284688230932e-06, "loss": -0.0421, "step": 1426, "step_time": 10.768679875065573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.903225898742676, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0678439401090145, "epoch": 0.01427, "frac_reward_zero_std": 0.25, "grad_norm": 0.05064316466450691, "kl": 0.33989301323890686, "learning_rate": 7.999283658676117e-06, "loss": -0.061, "num_tokens": 36417080.0, "reward": 1.0749441385269165, "reward_std": 0.5878797769546509, "rewards/rollout_reward_func/mean": 1.0749441385269165, "rewards/rollout_reward_func/std": 0.5878797173500061, "sampling/importance_sampling_ratio/max": 1.118351697921753, "sampling/importance_sampling_ratio/mean": 0.8206577301025391, "sampling/importance_sampling_ratio/min": 3.99716773245018e-05, "sampling/sampling_logp_difference/max": 2.267482280731201, "sampling/sampling_logp_difference/mean": 0.22042077779769897, "step": 1427, "step_time": 28.019322798994835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0567313386127353, "epoch": 0.01428, "grad_norm": 0.042440179735422134, "kl": 0.35216119326651096, "learning_rate": 7.999282628381e-06, "loss": -0.0611, "step": 1428, "step_time": 14.99817454899312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5255409739911556, "epoch": 0.01429, "frac_reward_zero_std": 0.0, "grad_norm": 0.022644655779004097, "kl": 0.386632464826107, "learning_rate": 7.999281597345579e-06, "loss": -0.0832, "num_tokens": 36465505.0, "reward": 0.8255616426467896, "reward_std": 0.5828943848609924, "rewards/rollout_reward_func/mean": 0.8255616426467896, "rewards/rollout_reward_func/std": 0.5828943848609924, "sampling/importance_sampling_ratio/max": 1.240309238433838, "sampling/importance_sampling_ratio/mean": 0.7661675214767456, "sampling/importance_sampling_ratio/min": 3.5145364396527157e-09, "sampling/sampling_logp_difference/max": 2.6501946449279785, "sampling/sampling_logp_difference/mean": 0.39626094698905945, "step": 1429, "step_time": 23.850715701992158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5228273095563054, "epoch": 0.0143, "grad_norm": 0.0204091127961874, "kl": 0.39522377774119377, "learning_rate": 7.999280565569855e-06, "loss": -0.0833, "step": 1430, "step_time": 11.920552627037978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.517241477966309, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3886677413247526, "epoch": 0.01431, "frac_reward_zero_std": 0.25, "grad_norm": 0.06375471502542496, "kl": 0.8875252306461334, "learning_rate": 7.999279533053831e-06, "loss": -0.06, "num_tokens": 36519450.0, "reward": 1.1974434852600098, "reward_std": 0.5758753418922424, "rewards/rollout_reward_func/mean": 1.1974434852600098, "rewards/rollout_reward_func/std": 0.5758752822875977, "sampling/importance_sampling_ratio/max": 1.1166669130325317, "sampling/importance_sampling_ratio/mean": 0.8446400165557861, "sampling/importance_sampling_ratio/min": 2.7536381508319607e-10, "sampling/sampling_logp_difference/max": 2.396981716156006, "sampling/sampling_logp_difference/mean": 0.38720864057540894, "step": 1431, "step_time": 24.378640929993708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3870089314877987, "epoch": 0.01432, "grad_norm": 0.06320802122354507, "kl": 0.8773166369646788, "learning_rate": 7.999278499797504e-06, "loss": -0.0601, "step": 1432, "step_time": 13.340513673989335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.148148059844971, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9538360591977835, "epoch": 0.01433, "frac_reward_zero_std": 0.25, "grad_norm": 0.019136926159262657, "kl": 0.24608524609357119, "learning_rate": 7.999277465800877e-06, "loss": -0.0604, "num_tokens": 36570699.0, "reward": 0.867656946182251, "reward_std": 0.8080637454986572, "rewards/rollout_reward_func/mean": 0.867656946182251, "rewards/rollout_reward_func/std": 0.8080636858940125, "sampling/importance_sampling_ratio/max": 1.1151585578918457, "sampling/importance_sampling_ratio/mean": 0.808796226978302, "sampling/importance_sampling_ratio/min": 4.939163409289904e-05, "sampling/sampling_logp_difference/max": 1.8194410800933838, "sampling/sampling_logp_difference/mean": 0.20869526267051697, "step": 1433, "step_time": 29.23933884696453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.946274203248322, "epoch": 0.01434, "grad_norm": 0.018443917855620384, "kl": 0.2478911057114601, "learning_rate": 7.999276431063948e-06, "loss": -0.0604, "step": 1434, "step_time": 13.685099941998487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0038224654272199, "epoch": 0.01435, "frac_reward_zero_std": 0.25, "grad_norm": 0.043126124888658524, "kl": 0.31713192351162434, "learning_rate": 7.999275395586719e-06, "loss": -0.0466, "num_tokens": 36631008.0, "reward": 0.5184138417243958, "reward_std": 0.6651692986488342, "rewards/rollout_reward_func/mean": 0.5184138417243958, "rewards/rollout_reward_func/std": 0.6651692986488342, "sampling/importance_sampling_ratio/max": 1.1938502788543701, "sampling/importance_sampling_ratio/mean": 0.8336822986602783, "sampling/importance_sampling_ratio/min": 0.00022888433886691928, "sampling/sampling_logp_difference/max": 1.9017032384872437, "sampling/sampling_logp_difference/mean": 0.20541061460971832, "step": 1435, "step_time": 31.288154760957696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0019310880452394, "epoch": 0.01436, "grad_norm": 0.045479919761419296, "kl": 0.3251007739454508, "learning_rate": 7.999274359369188e-06, "loss": -0.0466, "step": 1436, "step_time": 16.98273844301002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 5.033333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1174886738881469, "epoch": 0.01437, "frac_reward_zero_std": 0.0, "grad_norm": 0.073102205991745, "kl": 0.4605768620967865, "learning_rate": 7.999273322411358e-06, "loss": -0.0813, "num_tokens": 36676355.0, "reward": 0.8292834758758545, "reward_std": 0.8215842247009277, "rewards/rollout_reward_func/mean": 0.8292834758758545, "rewards/rollout_reward_func/std": 0.821584165096283, "sampling/importance_sampling_ratio/max": 1.1045937538146973, "sampling/importance_sampling_ratio/mean": 0.8102141618728638, "sampling/importance_sampling_ratio/min": 0.00022272864589467645, "sampling/sampling_logp_difference/max": 2.1358118057250977, "sampling/sampling_logp_difference/mean": 0.25137874484062195, "step": 1437, "step_time": 21.169726558058755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.117083883844316, "epoch": 0.01438, "grad_norm": 0.07288370281457901, "kl": 0.4546237140893936, "learning_rate": 7.999272284713227e-06, "loss": -0.0812, "step": 1438, "step_time": 12.095638307015179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4529264736920595, "epoch": 0.01439, "frac_reward_zero_std": 0.0, "grad_norm": 0.15094974637031555, "kl": 0.37485505267977715, "learning_rate": 7.999271246274796e-06, "loss": -0.0549, "num_tokens": 36725778.0, "reward": 0.6931321620941162, "reward_std": 0.9057021737098694, "rewards/rollout_reward_func/mean": 0.6931321620941162, "rewards/rollout_reward_func/std": 0.9057021737098694, "sampling/importance_sampling_ratio/max": 1.0794825553894043, "sampling/importance_sampling_ratio/mean": 0.7782323956489563, "sampling/importance_sampling_ratio/min": 2.3225128131798556e-07, "sampling/sampling_logp_difference/max": 2.844680070877075, "sampling/sampling_logp_difference/mean": 0.3706718683242798, "step": 1439, "step_time": 21.964124215970514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4545367560349405, "epoch": 0.0144, "grad_norm": 0.15666890144348145, "kl": 0.36505687423050404, "learning_rate": 7.999270207096065e-06, "loss": -0.0552, "step": 1440, "step_time": 12.23174806003226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 4.96875, "completions/mean_terminated_length": 4.233333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.034115120768547, "epoch": 0.01441, "frac_reward_zero_std": 0.25, "grad_norm": 0.05276242271065712, "kl": 0.34136855974793434, "learning_rate": 7.999269167177036e-06, "loss": -0.0406, "num_tokens": 36772508.0, "reward": 0.9099687337875366, "reward_std": 0.6769205331802368, "rewards/rollout_reward_func/mean": 0.9099687337875366, "rewards/rollout_reward_func/std": 0.6769205927848816, "sampling/importance_sampling_ratio/max": 1.1630473136901855, "sampling/importance_sampling_ratio/mean": 0.8893023729324341, "sampling/importance_sampling_ratio/min": 6.800175700316657e-11, "sampling/sampling_logp_difference/max": 2.3987300395965576, "sampling/sampling_logp_difference/mean": 0.2764755189418793, "step": 1441, "step_time": 24.03828731397516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.0408572144806385, "epoch": 0.01442, "grad_norm": 0.045294973999261856, "kl": 0.3458764310926199, "learning_rate": 7.999268126517707e-06, "loss": -0.0405, "step": 1442, "step_time": 13.564292300026864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 5.592592716217041, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.412341622635722, "epoch": 0.01443, "frac_reward_zero_std": 0.0, "grad_norm": 0.06071977689862251, "kl": 0.5279396735131741, "learning_rate": 7.999267085118081e-06, "loss": -0.0619, "num_tokens": 36830143.0, "reward": 0.287201464176178, "reward_std": 0.7309840321540833, "rewards/rollout_reward_func/mean": 0.287201464176178, "rewards/rollout_reward_func/std": 0.7309840321540833, "sampling/importance_sampling_ratio/max": 1.1403411626815796, "sampling/importance_sampling_ratio/mean": 0.6387906670570374, "sampling/importance_sampling_ratio/min": 1.8226682385602544e-08, "sampling/sampling_logp_difference/max": 2.993530750274658, "sampling/sampling_logp_difference/mean": 0.5028845071792603, "step": 1443, "step_time": 25.873794468992855 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 2.418152032420039, "epoch": 0.01444, "grad_norm": 0.06330141425132751, "kl": 0.4921934986487031, "learning_rate": 7.999266042978154e-06, "loss": -0.0621, "step": 1444, "step_time": 13.39880914799869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.655172348022461, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.480250645428896, "epoch": 0.01445, "frac_reward_zero_std": 0.0, "grad_norm": 0.06817641854286194, "kl": 0.3844977505505085, "learning_rate": 7.99926500009793e-06, "loss": -0.0392, "num_tokens": 36889271.0, "reward": 0.31136155128479004, "reward_std": 0.7428398728370667, "rewards/rollout_reward_func/mean": 0.31136155128479004, "rewards/rollout_reward_func/std": 0.7428398132324219, "sampling/importance_sampling_ratio/max": 1.4397950172424316, "sampling/importance_sampling_ratio/mean": 0.8252481818199158, "sampling/importance_sampling_ratio/min": 3.0169079678898925e-08, "sampling/sampling_logp_difference/max": 1.7611422538757324, "sampling/sampling_logp_difference/mean": 0.26886922121047974, "step": 1445, "step_time": 25.488058034068672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5167464837431908, "epoch": 0.01446, "grad_norm": 0.07117433100938797, "kl": 0.3848354984074831, "learning_rate": 7.999263956477407e-06, "loss": -0.0394, "step": 1446, "step_time": 13.79995420598425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6067431420087814, "epoch": 0.01447, "frac_reward_zero_std": 0.0, "grad_norm": 0.08646218478679657, "kl": 0.24101541563868523, "learning_rate": 7.999262912116586e-06, "loss": -0.0852, "num_tokens": 36949387.0, "reward": 0.6040066480636597, "reward_std": 0.8838919997215271, "rewards/rollout_reward_func/mean": 0.6040066480636597, "rewards/rollout_reward_func/std": 0.8838920593261719, "sampling/importance_sampling_ratio/max": 1.2540228366851807, "sampling/importance_sampling_ratio/mean": 0.7767484188079834, "sampling/importance_sampling_ratio/min": 6.641320169364917e-07, "sampling/sampling_logp_difference/max": 1.9656847715377808, "sampling/sampling_logp_difference/mean": 0.32346683740615845, "step": 1447, "step_time": 27.51958420095616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02656250004656613, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02656250004656613, "entropy": 1.6436467096209526, "epoch": 0.01448, "grad_norm": 0.05042606219649315, "kl": 0.24171861354261637, "learning_rate": 7.999261867015469e-06, "loss": -0.0854, "step": 1448, "step_time": 14.279060793021927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.392857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5835424084216356, "epoch": 0.01449, "frac_reward_zero_std": 0.25, "grad_norm": 0.08717279136180878, "kl": 0.2702924311161041, "learning_rate": 7.999260821174055e-06, "loss": -0.0755, "num_tokens": 37002484.0, "reward": 0.9966346025466919, "reward_std": 0.6806631684303284, "rewards/rollout_reward_func/mean": 0.9966346025466919, "rewards/rollout_reward_func/std": 0.6806631088256836, "sampling/importance_sampling_ratio/max": 1.2912472486495972, "sampling/importance_sampling_ratio/mean": 0.827768087387085, "sampling/importance_sampling_ratio/min": 6.633410487566493e-10, "sampling/sampling_logp_difference/max": 2.531381130218506, "sampling/sampling_logp_difference/mean": 0.44339120388031006, "step": 1449, "step_time": 27.443559327017283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5891773728653789, "epoch": 0.0145, "grad_norm": 0.09468020498752594, "kl": 0.269860589876771, "learning_rate": 7.999259774592343e-06, "loss": -0.0753, "step": 1450, "step_time": 14.964802953938488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 5.148148059844971, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9077945072203875, "epoch": 0.01451, "frac_reward_zero_std": 0.0, "grad_norm": 0.03483295813202858, "kl": 0.42758084647357464, "learning_rate": 7.999258727270334e-06, "loss": -0.073, "num_tokens": 37054846.0, "reward": 0.8913162350654602, "reward_std": 0.7170125246047974, "rewards/rollout_reward_func/mean": 0.8913162350654602, "rewards/rollout_reward_func/std": 0.7170125246047974, "sampling/importance_sampling_ratio/max": 1.2636598348617554, "sampling/importance_sampling_ratio/mean": 0.6891506910324097, "sampling/importance_sampling_ratio/min": 4.382677831760162e-10, "sampling/sampling_logp_difference/max": 2.700779914855957, "sampling/sampling_logp_difference/mean": 0.4511314630508423, "step": 1451, "step_time": 25.23210358398501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.91042142175138, "epoch": 0.01452, "grad_norm": 0.034140244126319885, "kl": 0.4250353313982487, "learning_rate": 7.999257679208028e-06, "loss": -0.073, "step": 1452, "step_time": 12.355963376961881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4882995765656233, "epoch": 0.01453, "frac_reward_zero_std": 0.0, "grad_norm": 0.043684691190719604, "kl": 0.2862953944131732, "learning_rate": 7.999256630405426e-06, "loss": -0.0722, "num_tokens": 37110553.0, "reward": 0.7797183990478516, "reward_std": 0.7847559452056885, "rewards/rollout_reward_func/mean": 0.7797183990478516, "rewards/rollout_reward_func/std": 0.7847559452056885, "sampling/importance_sampling_ratio/max": 1.2247333526611328, "sampling/importance_sampling_ratio/mean": 0.7414056062698364, "sampling/importance_sampling_ratio/min": 3.7505301975215843e-08, "sampling/sampling_logp_difference/max": 2.033327579498291, "sampling/sampling_logp_difference/mean": 0.3047122359275818, "step": 1453, "step_time": 28.15757544495864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.483614133670926, "epoch": 0.01454, "grad_norm": 0.04037616401910782, "kl": 0.2829284891486168, "learning_rate": 7.999255580862529e-06, "loss": -0.0723, "step": 1454, "step_time": 14.614741973025957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 5.516129016876221, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2209423519670963, "epoch": 0.01455, "frac_reward_zero_std": 0.0, "grad_norm": 0.06994375586509705, "kl": 0.2864728607237339, "learning_rate": 7.999254530579335e-06, "loss": -0.0547, "num_tokens": 37161387.0, "reward": 0.49443116784095764, "reward_std": 0.6755855083465576, "rewards/rollout_reward_func/mean": 0.49443116784095764, "rewards/rollout_reward_func/std": 0.6755855083465576, "sampling/importance_sampling_ratio/max": 1.2152396440505981, "sampling/importance_sampling_ratio/mean": 0.774965226650238, "sampling/importance_sampling_ratio/min": 1.3584759471996222e-05, "sampling/sampling_logp_difference/max": 1.8647010326385498, "sampling/sampling_logp_difference/mean": 0.2562015652656555, "step": 1455, "step_time": 19.645472724951105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2112097069621086, "epoch": 0.01456, "grad_norm": 0.06360716372728348, "kl": 0.2913884185254574, "learning_rate": 7.999253479555846e-06, "loss": -0.0545, "step": 1456, "step_time": 10.37394604596193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.464285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2659617587924004, "epoch": 0.01457, "frac_reward_zero_std": 0.0, "grad_norm": 0.09145291894674301, "kl": 0.2724625263363123, "learning_rate": 7.999252427792062e-06, "loss": -0.0727, "num_tokens": 37209686.0, "reward": 0.5200660824775696, "reward_std": 0.7681968212127686, "rewards/rollout_reward_func/mean": 0.5200660824775696, "rewards/rollout_reward_func/std": 0.7681968808174133, "sampling/importance_sampling_ratio/max": 1.3264740705490112, "sampling/importance_sampling_ratio/mean": 0.8019460439682007, "sampling/importance_sampling_ratio/min": 3.7956781397952e-06, "sampling/sampling_logp_difference/max": 1.8680706024169922, "sampling/sampling_logp_difference/mean": 0.2654937207698822, "step": 1457, "step_time": 24.851084628026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2637721616774797, "epoch": 0.01458, "grad_norm": 0.09511946141719818, "kl": 0.2725658603012562, "learning_rate": 7.999251375287984e-06, "loss": -0.0729, "step": 1458, "step_time": 12.414880909986096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.407407283782959, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.403331071138382, "epoch": 0.01459, "frac_reward_zero_std": 0.0, "grad_norm": 0.10845153778791428, "kl": 0.3533110674470663, "learning_rate": 7.999250322043608e-06, "loss": -0.0422, "num_tokens": 37272193.0, "reward": 0.1659197360277176, "reward_std": 0.6250478029251099, "rewards/rollout_reward_func/mean": 0.1659197360277176, "rewards/rollout_reward_func/std": 0.6250477433204651, "sampling/importance_sampling_ratio/max": 1.141497254371643, "sampling/importance_sampling_ratio/mean": 0.49227556586265564, "sampling/importance_sampling_ratio/min": 8.950784113892496e-09, "sampling/sampling_logp_difference/max": 2.3677170276641846, "sampling/sampling_logp_difference/mean": 0.4767175316810608, "step": 1459, "step_time": 25.987825598975178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3981890231370926, "epoch": 0.0146, "grad_norm": 0.10957217216491699, "kl": 0.3445397112518549, "learning_rate": 7.999249268058942e-06, "loss": -0.0423, "step": 1460, "step_time": 13.281717738020234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.870967388153076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0952994357794523, "epoch": 0.01461, "frac_reward_zero_std": 0.25, "grad_norm": 0.04901399090886116, "kl": 0.34866453148424625, "learning_rate": 7.99924821333398e-06, "loss": -0.0668, "num_tokens": 37318464.0, "reward": 0.8436421155929565, "reward_std": 0.7100052833557129, "rewards/rollout_reward_func/mean": 0.8436421155929565, "rewards/rollout_reward_func/std": 0.7100052833557129, "sampling/importance_sampling_ratio/max": 1.1800687313079834, "sampling/importance_sampling_ratio/mean": 0.7700604200363159, "sampling/importance_sampling_ratio/min": 4.5075370280756033e-07, "sampling/sampling_logp_difference/max": 1.7203679084777832, "sampling/sampling_logp_difference/mean": 0.24929380416870117, "step": 1461, "step_time": 25.434209124039626 }, { "clip_ratio/high_max": 0.021780303679406643, "clip_ratio/high_mean": 0.010890151839703321, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010890151839703321, "entropy": 1.1114016948267817, "epoch": 0.01462, "grad_norm": 0.04911031201481819, "kl": 0.34113029204308987, "learning_rate": 7.999247157868723e-06, "loss": -0.0667, "step": 1462, "step_time": 13.844948498997837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 5.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0259387344121933, "epoch": 0.01463, "frac_reward_zero_std": 0.0, "grad_norm": 0.0528874397277832, "kl": 0.3615191364660859, "learning_rate": 7.999246101663173e-06, "loss": -0.1027, "num_tokens": 37376744.0, "reward": 0.6165462732315063, "reward_std": 0.8757670521736145, "rewards/rollout_reward_func/mean": 0.6165462732315063, "rewards/rollout_reward_func/std": 0.8757670521736145, "sampling/importance_sampling_ratio/max": 1.2143049240112305, "sampling/importance_sampling_ratio/mean": 0.6855133175849915, "sampling/importance_sampling_ratio/min": 1.2594546205946244e-07, "sampling/sampling_logp_difference/max": 2.6423964500427246, "sampling/sampling_logp_difference/mean": 0.40516912937164307, "step": 1463, "step_time": 28.604631205060286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0199803337454796, "epoch": 0.01464, "grad_norm": 0.05333844944834709, "kl": 0.3632718324661255, "learning_rate": 7.99924504471733e-06, "loss": -0.1027, "step": 1464, "step_time": 13.607406080001965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.461016715504229, "epoch": 0.01465, "frac_reward_zero_std": 0.0, "grad_norm": 0.15788519382476807, "kl": 0.3159385249018669, "learning_rate": 7.999243987031194e-06, "loss": -0.074, "num_tokens": 37425625.0, "reward": 0.4688640832901001, "reward_std": 0.7595770955085754, "rewards/rollout_reward_func/mean": 0.4688640832901001, "rewards/rollout_reward_func/std": 0.7595770955085754, "sampling/importance_sampling_ratio/max": 1.3023626804351807, "sampling/importance_sampling_ratio/mean": 0.7090398073196411, "sampling/importance_sampling_ratio/min": 1.1207720490347128e-05, "sampling/sampling_logp_difference/max": 2.4936208724975586, "sampling/sampling_logp_difference/mean": 0.31290483474731445, "step": 1465, "step_time": 24.2431788520189 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.4347891379147768, "epoch": 0.01466, "grad_norm": 0.10450221598148346, "kl": 0.32993841357529163, "learning_rate": 7.999242928604764e-06, "loss": -0.0748, "step": 1466, "step_time": 12.513775970990537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 5.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7880946546792984, "epoch": 0.01467, "frac_reward_zero_std": 0.0, "grad_norm": 0.08083946257829666, "kl": 0.4412374570965767, "learning_rate": 7.999241869438041e-06, "loss": -0.0695, "num_tokens": 37483891.0, "reward": 0.18617209792137146, "reward_std": 0.6012189388275146, "rewards/rollout_reward_func/mean": 0.18617209792137146, "rewards/rollout_reward_func/std": 0.6012189388275146, "sampling/importance_sampling_ratio/max": 1.2412972450256348, "sampling/importance_sampling_ratio/mean": 0.7275445461273193, "sampling/importance_sampling_ratio/min": 5.359194332754669e-08, "sampling/sampling_logp_difference/max": 2.1128571033477783, "sampling/sampling_logp_difference/mean": 0.3930327296257019, "step": 1467, "step_time": 25.773394681018544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 1.7583046574145555, "epoch": 0.01468, "grad_norm": 0.06573311984539032, "kl": 0.4508211836218834, "learning_rate": 7.999240809531027e-06, "loss": -0.0698, "step": 1468, "step_time": 14.370363653986715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 6.222222328186035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9005726724863052, "epoch": 0.01469, "frac_reward_zero_std": 0.0, "grad_norm": 0.16335684061050415, "kl": 0.23054546397179365, "learning_rate": 7.999239748883722e-06, "loss": -0.0895, "num_tokens": 37546566.0, "reward": 0.2355494350194931, "reward_std": 0.7643581628799438, "rewards/rollout_reward_func/mean": 0.2355494350194931, "rewards/rollout_reward_func/std": 0.7643582224845886, "sampling/importance_sampling_ratio/max": 1.219254732131958, "sampling/importance_sampling_ratio/mean": 0.4656919240951538, "sampling/importance_sampling_ratio/min": 2.238475182991806e-09, "sampling/sampling_logp_difference/max": 2.525209426879883, "sampling/sampling_logp_difference/mean": 0.528907060623169, "step": 1469, "step_time": 26.95762349697179 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.8760216385126114, "epoch": 0.0147, "grad_norm": 0.06554554402828217, "kl": 0.23601508885622025, "learning_rate": 7.999238687496123e-06, "loss": -0.0901, "step": 1470, "step_time": 13.272529982001288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6261617988348007, "epoch": 0.01471, "frac_reward_zero_std": 0.0, "grad_norm": 0.19976511597633362, "kl": 0.3638708936050534, "learning_rate": 7.999237625368234e-06, "loss": -0.0693, "num_tokens": 37603476.0, "reward": 0.6201790571212769, "reward_std": 0.9051950573921204, "rewards/rollout_reward_func/mean": 0.6201790571212769, "rewards/rollout_reward_func/std": 0.9051949977874756, "sampling/importance_sampling_ratio/max": 1.1184420585632324, "sampling/importance_sampling_ratio/mean": 0.7034831047058105, "sampling/importance_sampling_ratio/min": 2.4442879293928854e-06, "sampling/sampling_logp_difference/max": 2.003267526626587, "sampling/sampling_logp_difference/mean": 0.29867130517959595, "step": 1471, "step_time": 28.985575535014505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6069827526807785, "epoch": 0.01472, "grad_norm": 0.21191717684268951, "kl": 0.3871163884177804, "learning_rate": 7.999236562500053e-06, "loss": -0.0707, "step": 1472, "step_time": 15.651467421965208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.633333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8778085550293326, "epoch": 0.01473, "frac_reward_zero_std": 0.25, "grad_norm": 0.028276730328798294, "kl": 0.30895617231726646, "learning_rate": 7.999235498891581e-06, "loss": -0.0605, "num_tokens": 37660082.0, "reward": 1.0506895780563354, "reward_std": 0.7144255638122559, "rewards/rollout_reward_func/mean": 1.0506895780563354, "rewards/rollout_reward_func/std": 0.7144255638122559, "sampling/importance_sampling_ratio/max": 1.3364375829696655, "sampling/importance_sampling_ratio/mean": 0.8900173902511597, "sampling/importance_sampling_ratio/min": 1.5947121937642805e-05, "sampling/sampling_logp_difference/max": 1.6049950122833252, "sampling/sampling_logp_difference/mean": 0.169702410697937, "step": 1473, "step_time": 29.78931034100242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8712477725930512, "epoch": 0.01474, "grad_norm": 0.022373061627149582, "kl": 0.31697800382971764, "learning_rate": 7.999234434542819e-06, "loss": -0.0606, "step": 1474, "step_time": 15.699347239045892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 5.366666793823242, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5433714669197798, "epoch": 0.01475, "frac_reward_zero_std": 0.0, "grad_norm": 0.12779982388019562, "kl": 1.286453329026699, "learning_rate": 7.999233369453764e-06, "loss": -0.0719, "num_tokens": 37721450.0, "reward": 0.6778584122657776, "reward_std": 0.7813511490821838, "rewards/rollout_reward_func/mean": 0.6778584122657776, "rewards/rollout_reward_func/std": 0.7813511490821838, "sampling/importance_sampling_ratio/max": 1.2917144298553467, "sampling/importance_sampling_ratio/mean": 0.7548661231994629, "sampling/importance_sampling_ratio/min": 8.435824483399301e-09, "sampling/sampling_logp_difference/max": 2.2391843795776367, "sampling/sampling_logp_difference/mean": 0.3615584373474121, "step": 1475, "step_time": 30.25504207497579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5361870601773262, "epoch": 0.01476, "grad_norm": 0.12225472182035446, "kl": 1.2461526617407799, "learning_rate": 7.999232303624422e-06, "loss": -0.0722, "step": 1476, "step_time": 15.749246149003739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2749801899772137, "epoch": 0.01477, "frac_reward_zero_std": 0.0, "grad_norm": 0.0338447280228138, "kl": 0.31673683412373066, "learning_rate": 7.999231237054787e-06, "loss": -0.0864, "num_tokens": 37786378.0, "reward": 0.8946058750152588, "reward_std": 0.770028829574585, "rewards/rollout_reward_func/mean": 0.8946058750152588, "rewards/rollout_reward_func/std": 0.7700287699699402, "sampling/importance_sampling_ratio/max": 1.218186378479004, "sampling/importance_sampling_ratio/mean": 0.8235450983047485, "sampling/importance_sampling_ratio/min": 2.905414930864936e-07, "sampling/sampling_logp_difference/max": 1.9325839281082153, "sampling/sampling_logp_difference/mean": 0.2873164415359497, "step": 1477, "step_time": 30.24925736800651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2664130944758654, "epoch": 0.01478, "grad_norm": 0.03064996376633644, "kl": 0.309868523851037, "learning_rate": 7.999230169744864e-06, "loss": -0.0864, "step": 1478, "step_time": 15.778887221007608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6286706384271383, "epoch": 0.01479, "frac_reward_zero_std": 0.0, "grad_norm": 0.24040870368480682, "kl": 0.7925072479993105, "learning_rate": 7.999229101694651e-06, "loss": -0.0594, "num_tokens": 37839899.0, "reward": 1.1241695880889893, "reward_std": 0.7300562262535095, "rewards/rollout_reward_func/mean": 1.1241695880889893, "rewards/rollout_reward_func/std": 0.7300562262535095, "sampling/importance_sampling_ratio/max": 1.217177391052246, "sampling/importance_sampling_ratio/mean": 0.9060683250427246, "sampling/importance_sampling_ratio/min": 2.2407375581678934e-05, "sampling/sampling_logp_difference/max": 1.8943952322006226, "sampling/sampling_logp_difference/mean": 0.1694430410861969, "step": 1479, "step_time": 26.73976494805538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6303149266168475, "epoch": 0.0148, "grad_norm": 0.20844431221485138, "kl": 0.7940102778375149, "learning_rate": 7.99922803290415e-06, "loss": -0.061, "step": 1480, "step_time": 13.984132465993753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5018293275497854, "epoch": 0.01481, "frac_reward_zero_std": 0.0, "grad_norm": 0.01050857175141573, "kl": 0.33051891438663006, "learning_rate": 7.999226963373359e-06, "loss": -0.0671, "num_tokens": 37900808.0, "reward": 0.7565141916275024, "reward_std": 0.8980282545089722, "rewards/rollout_reward_func/mean": 0.7565141916275024, "rewards/rollout_reward_func/std": 0.8980282545089722, "sampling/importance_sampling_ratio/max": 1.1478806734085083, "sampling/importance_sampling_ratio/mean": 0.7466726899147034, "sampling/importance_sampling_ratio/min": 1.1493512829474639e-05, "sampling/sampling_logp_difference/max": 1.8111889362335205, "sampling/sampling_logp_difference/mean": 0.2980892062187195, "step": 1481, "step_time": 27.288661353988573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5052990776021034, "epoch": 0.01482, "grad_norm": 0.01145633589476347, "kl": 0.33510834630578756, "learning_rate": 7.99922589310228e-06, "loss": -0.0671, "step": 1482, "step_time": 12.8631545730168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.440000057220459, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6922055892646313, "epoch": 0.01483, "frac_reward_zero_std": 0.0, "grad_norm": 0.06692484766244888, "kl": 0.34932973328977823, "learning_rate": 7.999224822090912e-06, "loss": -0.0867, "num_tokens": 37951482.0, "reward": 0.5255428552627563, "reward_std": 0.9001410007476807, "rewards/rollout_reward_func/mean": 0.5255428552627563, "rewards/rollout_reward_func/std": 0.9001410603523254, "sampling/importance_sampling_ratio/max": 1.0874958038330078, "sampling/importance_sampling_ratio/mean": 0.7730618715286255, "sampling/importance_sampling_ratio/min": 1.0982832776562645e-07, "sampling/sampling_logp_difference/max": 2.289428949356079, "sampling/sampling_logp_difference/mean": 0.37302452325820923, "step": 1483, "step_time": 26.62159122803132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6865698415786028, "epoch": 0.01484, "grad_norm": 0.0626068115234375, "kl": 0.3267091903835535, "learning_rate": 7.999223750339255e-06, "loss": -0.0869, "step": 1484, "step_time": 13.828210347011918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.724137783050537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1619189092889428, "epoch": 0.01485, "frac_reward_zero_std": 0.0, "grad_norm": 0.10729792714118958, "kl": 0.24394009448587894, "learning_rate": 7.999222677847311e-06, "loss": -0.0687, "num_tokens": 37998295.0, "reward": 0.8800287246704102, "reward_std": 0.9228971004486084, "rewards/rollout_reward_func/mean": 0.8800287246704102, "rewards/rollout_reward_func/std": 0.9228970408439636, "sampling/importance_sampling_ratio/max": 1.155964970588684, "sampling/importance_sampling_ratio/mean": 0.849117636680603, "sampling/importance_sampling_ratio/min": 2.002286692004418e-08, "sampling/sampling_logp_difference/max": 2.503688335418701, "sampling/sampling_logp_difference/mean": 0.25537964701652527, "step": 1485, "step_time": 21.53745230298955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1620870782062411, "epoch": 0.01486, "grad_norm": 0.12567490339279175, "kl": 0.24237742833793163, "learning_rate": 7.99922160461508e-06, "loss": -0.0691, "step": 1486, "step_time": 11.476038488035556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.90625, "completions/mean_terminated_length": 4.227272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.923516652546823, "epoch": 0.01487, "frac_reward_zero_std": 0.0, "grad_norm": 0.029518641531467438, "kl": 0.18617158848792315, "learning_rate": 7.99922053064256e-06, "loss": -0.0984, "num_tokens": 38055811.0, "reward": 0.7307268977165222, "reward_std": 0.9401432275772095, "rewards/rollout_reward_func/mean": 0.7307268977165222, "rewards/rollout_reward_func/std": 0.9401432275772095, "sampling/importance_sampling_ratio/max": 1.11970853805542, "sampling/importance_sampling_ratio/mean": 0.6887848377227783, "sampling/importance_sampling_ratio/min": 3.6365519918035716e-05, "sampling/sampling_logp_difference/max": 1.956953525543213, "sampling/sampling_logp_difference/mean": 0.3333945870399475, "step": 1487, "step_time": 28.951575358980335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9225249011069536, "epoch": 0.01488, "grad_norm": 0.03401530534029007, "kl": 0.18450218811631203, "learning_rate": 7.999219455929755e-06, "loss": -0.0982, "step": 1488, "step_time": 14.064262374013197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.71875, "completions/mean_terminated_length": 6.2916669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.0130598349496722, "epoch": 0.01489, "frac_reward_zero_std": 0.0, "grad_norm": 0.13124965131282806, "kl": 0.19146952964365482, "learning_rate": 7.999218380476662e-06, "loss": -0.0513, "num_tokens": 38113497.0, "reward": 0.056882113218307495, "reward_std": 0.710442841053009, "rewards/rollout_reward_func/mean": 0.056882113218307495, "rewards/rollout_reward_func/std": 0.7104427814483643, "sampling/importance_sampling_ratio/max": 1.1725695133209229, "sampling/importance_sampling_ratio/mean": 0.4229121804237366, "sampling/importance_sampling_ratio/min": 4.490235738785486e-08, "sampling/sampling_logp_difference/max": 2.5778942108154297, "sampling/sampling_logp_difference/mean": 0.5030860900878906, "step": 1489, "step_time": 28.28428414001246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008971292059868574, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008971292059868574, "entropy": 3.0026480928063393, "epoch": 0.0149, "grad_norm": 0.07544592767953873, "kl": 0.1872134730219841, "learning_rate": 7.999217304283282e-06, "loss": -0.0516, "step": 1490, "step_time": 13.930689781118417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.6875, "completions/mean_terminated_length": 4.6875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6738094175234437, "epoch": 0.01491, "frac_reward_zero_std": 0.25, "grad_norm": 0.2730652391910553, "kl": 2.001795321702957, "learning_rate": 7.999216227349616e-06, "loss": -0.0454, "num_tokens": 38149226.0, "reward": 0.7699413895606995, "reward_std": 0.8225588202476501, "rewards/rollout_reward_func/mean": 0.7699413895606995, "rewards/rollout_reward_func/std": 0.8225588798522949, "sampling/importance_sampling_ratio/max": 1.1994379758834839, "sampling/importance_sampling_ratio/mean": 0.9236046075820923, "sampling/importance_sampling_ratio/min": 6.889609721838497e-06, "sampling/sampling_logp_difference/max": 1.6164357662200928, "sampling/sampling_logp_difference/mean": 0.17371435463428497, "step": 1491, "step_time": 13.342444072972285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6703857639804482, "epoch": 0.01492, "grad_norm": 0.13996762037277222, "kl": 1.2835102751851082, "learning_rate": 7.999215149675664e-06, "loss": -0.0468, "step": 1492, "step_time": 8.057344734988874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 4.708333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4597031064331532, "epoch": 0.01493, "frac_reward_zero_std": 0.0, "grad_norm": 0.05219985917210579, "kl": 0.2144182687625289, "learning_rate": 7.999214071261429e-06, "loss": -0.0927, "num_tokens": 38202063.0, "reward": 0.4656178653240204, "reward_std": 0.9878540635108948, "rewards/rollout_reward_func/mean": 0.4656178653240204, "rewards/rollout_reward_func/std": 0.98785400390625, "sampling/importance_sampling_ratio/max": 1.141465187072754, "sampling/importance_sampling_ratio/mean": 0.714990496635437, "sampling/importance_sampling_ratio/min": 6.46922126179561e-06, "sampling/sampling_logp_difference/max": 1.8843908309936523, "sampling/sampling_logp_difference/mean": 0.31232917308807373, "step": 1493, "step_time": 27.036432053952012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.452432494610548, "epoch": 0.01494, "grad_norm": 0.048907116055488586, "kl": 0.2123978938907385, "learning_rate": 7.999212992106905e-06, "loss": -0.0929, "step": 1494, "step_time": 12.860474224988138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.40625, "completions/mean_terminated_length": 4.954545497894287, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.948514237999916, "epoch": 0.01495, "frac_reward_zero_std": 0.0, "grad_norm": 0.07576575875282288, "kl": 0.46282473485916853, "learning_rate": 7.999211912212097e-06, "loss": -0.0795, "num_tokens": 38270961.0, "reward": 0.6383006572723389, "reward_std": 0.859481155872345, "rewards/rollout_reward_func/mean": 0.6383006572723389, "rewards/rollout_reward_func/std": 0.8594812154769897, "sampling/importance_sampling_ratio/max": 1.3792567253112793, "sampling/importance_sampling_ratio/mean": 0.6152675747871399, "sampling/importance_sampling_ratio/min": 3.5558971944738005e-07, "sampling/sampling_logp_difference/max": 2.637812614440918, "sampling/sampling_logp_difference/mean": 0.35938549041748047, "step": 1495, "step_time": 33.261880772945005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9455203413963318, "epoch": 0.01496, "grad_norm": 0.06656665354967117, "kl": 0.44298470206558704, "learning_rate": 7.999210831577004e-06, "loss": -0.0797, "step": 1496, "step_time": 16.063600160967326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.392857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8583430983126163, "epoch": 0.01497, "frac_reward_zero_std": 0.5, "grad_norm": 0.04430257901549339, "kl": 0.24589009024202824, "learning_rate": 7.999209750201626e-06, "loss": -0.0519, "num_tokens": 38312921.0, "reward": 0.8030546307563782, "reward_std": 0.8824517130851746, "rewards/rollout_reward_func/mean": 0.8030546307563782, "rewards/rollout_reward_func/std": 0.8824517726898193, "sampling/importance_sampling_ratio/max": 1.241906762123108, "sampling/importance_sampling_ratio/mean": 0.8773583769798279, "sampling/importance_sampling_ratio/min": 1.0493764420971274e-05, "sampling/sampling_logp_difference/max": 2.129122257232666, "sampling/sampling_logp_difference/mean": 0.2066158950328827, "step": 1497, "step_time": 22.361246643937193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8578002694994211, "epoch": 0.01498, "grad_norm": 0.04212942719459534, "kl": 0.24638892710208893, "learning_rate": 7.999208668085963e-06, "loss": -0.0519, "step": 1498, "step_time": 12.023167231993284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.892857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2471381053328514, "epoch": 0.01499, "frac_reward_zero_std": 0.0, "grad_norm": 0.039091963320970535, "kl": 0.280439511872828, "learning_rate": 7.999207585230017e-06, "loss": -0.0742, "num_tokens": 38357297.0, "reward": 0.39051708579063416, "reward_std": 0.8989819288253784, "rewards/rollout_reward_func/mean": 0.39051708579063416, "rewards/rollout_reward_func/std": 0.8989819288253784, "sampling/importance_sampling_ratio/max": 1.1160677671432495, "sampling/importance_sampling_ratio/mean": 0.7800823450088501, "sampling/importance_sampling_ratio/min": 2.7452479116618633e-06, "sampling/sampling_logp_difference/max": 2.107750415802002, "sampling/sampling_logp_difference/mean": 0.2747761607170105, "step": 1499, "step_time": 23.07288683502702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2472765892744064, "epoch": 0.015, "grad_norm": 0.03958745300769806, "kl": 0.28002936486154795, "learning_rate": 7.999206501633788e-06, "loss": -0.0741, "step": 1500, "step_time": 12.009773964062333 } ], "logging_steps": 1.0, "max_steps": 200000, "num_input_tokens_seen": 38357297, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }