{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00112, "eval_steps": 500, "global_step": 56, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 125.03125, "completions/mean_terminated_length": 125.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.45527722872793674, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013841008767485619, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0012, "num_tokens": 51075.0, "reward": 6.777899265289307, "reward_std": 5.124541759490967, "rewards/_dispatch_reward/mean": 6.777899265289307, "rewards/_dispatch_reward/std": 5.124542236328125, "sampling/importance_sampling_ratio/max": 1.0203018188476562, "sampling/importance_sampling_ratio/mean": 0.5446840524673462, "sampling/importance_sampling_ratio/min": 0.004089090973138809, "sampling/sampling_logp_difference/max": 0.5533419251441956, "sampling/sampling_logp_difference/mean": 0.023174326866865158, "step": 1, "step_time": 56.13741443500112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 245.53125, "completions/mean_terminated_length": 245.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6149225234985352, "epoch": 4e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009881995618343353, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0059, "num_tokens": 110688.0, "reward": 4.6487836837768555, "reward_std": 3.784931182861328, "rewards/_dispatch_reward/mean": 4.6487836837768555, "rewards/_dispatch_reward/std": 3.784930944442749, "sampling/importance_sampling_ratio/max": 1.3708744049072266, "sampling/importance_sampling_ratio/mean": 0.3464105427265167, "sampling/importance_sampling_ratio/min": 8.993503541887549e-08, "sampling/sampling_logp_difference/max": 0.6291780471801758, "sampling/sampling_logp_difference/mean": 0.027994709089398384, "step": 2, "step_time": 87.9052765430024 }, { "clip_ratio/high_max": 0.004685592517489567, "clip_ratio/high_mean": 0.0026511843752814457, "clip_ratio/low_mean": 0.0012970532843610272, "clip_ratio/low_min": 0.0004810317768715322, "clip_ratio/region_mean": 0.003948237717850134, "completions/clipped_ratio": 0.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 262.6875, "completions/mean_terminated_length": 262.6875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.6115306541323662, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01363975927233696, "kl": 0.0011097499082097784, "learning_rate": 5.714285714285715e-07, "loss": 0.0041, "num_tokens": 149708.0, "reward": 5.756682395935059, "reward_std": 2.871458053588867, "rewards/_dispatch_reward/mean": 5.756682395935059, "rewards/_dispatch_reward/std": 2.871457815170288, "sampling/importance_sampling_ratio/max": 1.0003302097320557, "sampling/importance_sampling_ratio/mean": 0.24685890972614288, "sampling/importance_sampling_ratio/min": 1.2128718474002653e-08, "sampling/sampling_logp_difference/max": 1.1710762977600098, "sampling/sampling_logp_difference/mean": 0.024929963052272797, "step": 3, "step_time": 76.56850643599682 }, { "clip_ratio/high_max": 0.005967892473563552, "clip_ratio/high_mean": 0.0036123984755249694, "clip_ratio/low_mean": 0.0005970557394903153, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0042094542295672, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 384.65625, "completions/mean_terminated_length": 331.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6012951210141182, "epoch": 8e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010854247957468033, "kl": 0.001065619035216514, "learning_rate": 8.571428571428572e-07, "loss": -0.0005, "num_tokens": 196257.0, "reward": 7.588564872741699, "reward_std": 1.0828258991241455, "rewards/_dispatch_reward/mean": 7.588564872741699, "rewards/_dispatch_reward/std": 1.082825779914856, "sampling/importance_sampling_ratio/max": 0.6254830956459045, "sampling/importance_sampling_ratio/mean": 0.21221274137496948, "sampling/importance_sampling_ratio/min": 3.4579251862885876e-16, "sampling/sampling_logp_difference/max": 0.9008774161338806, "sampling/sampling_logp_difference/mean": 0.025733064860105515, "step": 4, "step_time": 105.42133834299966 }, { "clip_ratio/high_max": 0.023003800350124948, "clip_ratio/high_mean": 0.01184065386041766, "clip_ratio/low_mean": 0.012614889892574865, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024455543752992526, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 90.90625, "completions/mean_terminated_length": 90.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.298396717524156, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 0.00900355540215969, "kl": 0.14300901055321447, "learning_rate": 1.142857142857143e-06, "loss": -0.0006, "num_tokens": 289297.0, "reward": 2.042935848236084, "reward_std": 6.4184794425964355, "rewards/_dispatch_reward/mean": 2.042935848236084, "rewards/_dispatch_reward/std": 6.418478965759277, "sampling/importance_sampling_ratio/max": 1.389119029045105, "sampling/importance_sampling_ratio/mean": 0.7724133729934692, "sampling/importance_sampling_ratio/min": 1.1340963510519941e-06, "sampling/sampling_logp_difference/max": 2.341449022293091, "sampling/sampling_logp_difference/mean": 0.023405466228723526, "step": 5, "step_time": 76.50286699199569 }, { "clip_ratio/high_max": 0.0006235100008780137, "clip_ratio/high_mean": 0.00031175500043900684, "clip_ratio/low_mean": 0.012735862444969825, "clip_ratio/low_min": 9.578544268151745e-05, "clip_ratio/region_mean": 0.013047617438132875, "completions/clipped_ratio": 0.0, "completions/max_length": 1302.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 98.15625, "completions/mean_terminated_length": 98.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3092191433534026, "epoch": 0.00012, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037304293364286423, "kl": 0.009563418556354009, "learning_rate": 1.4285714285714286e-06, "loss": -0.0002, "num_tokens": 382323.0, "reward": 1.780243992805481, "reward_std": 5.718746185302734, "rewards/_dispatch_reward/mean": 1.780243992805481, "rewards/_dispatch_reward/std": 5.718745708465576, "sampling/importance_sampling_ratio/max": 1.6135059595108032, "sampling/importance_sampling_ratio/mean": 0.7582643032073975, "sampling/importance_sampling_ratio/min": 7.616848124447628e-12, "sampling/sampling_logp_difference/max": 0.5308667421340942, "sampling/sampling_logp_difference/mean": 0.023645631968975067, "step": 6, "step_time": 66.5997183530053 }, { "clip_ratio/high_max": 0.005657617410179228, "clip_ratio/high_mean": 0.0030029033950995654, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030029033950995654, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 197.40625, "completions/mean_terminated_length": 197.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5808572247624397, "epoch": 0.00014, "frac_reward_zero_std": 0.25, "grad_norm": 0.009763856418430805, "kl": 0.0010594904888421297, "learning_rate": 1.7142857142857145e-06, "loss": -0.0009, "num_tokens": 438093.0, "reward": 5.920393466949463, "reward_std": 5.249653339385986, "rewards/_dispatch_reward/mean": 5.920393466949463, "rewards/_dispatch_reward/std": 5.249653339385986, "sampling/importance_sampling_ratio/max": 1.0098344087600708, "sampling/importance_sampling_ratio/mean": 0.3949565887451172, "sampling/importance_sampling_ratio/min": 1.0556744811651697e-08, "sampling/sampling_logp_difference/max": 0.6985321044921875, "sampling/sampling_logp_difference/mean": 0.027950983494520187, "step": 7, "step_time": 69.09156412899756 }, { "clip_ratio/high_max": 0.008222879550885409, "clip_ratio/high_mean": 0.0050142133550252765, "clip_ratio/low_mean": 0.012798447132809088, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017812660371419042, "completions/clipped_ratio": 0.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 259.34375, "completions/mean_terminated_length": 259.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5893099643290043, "epoch": 0.00016, "frac_reward_zero_std": 0.0, "grad_norm": 0.02151084505021572, "kl": 0.0027830141116282903, "learning_rate": 2.0000000000000003e-06, "loss": -0.0011, "num_tokens": 496400.0, "reward": 5.134228229522705, "reward_std": 4.130280017852783, "rewards/_dispatch_reward/mean": 5.134228229522705, "rewards/_dispatch_reward/std": 4.130280017852783, "sampling/importance_sampling_ratio/max": 1.2578178644180298, "sampling/importance_sampling_ratio/mean": 0.38606229424476624, "sampling/importance_sampling_ratio/min": 1.0430401681249535e-10, "sampling/sampling_logp_difference/max": 0.6706124544143677, "sampling/sampling_logp_difference/mean": 0.027107033878564835, "step": 8, "step_time": 76.88476859900038 }, { "clip_ratio/high_max": 0.0024189914984162897, "clip_ratio/high_mean": 0.0012094957492081448, "clip_ratio/low_mean": 0.0003834999370155856, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015929956862237304, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 123.34375, "completions/mean_terminated_length": 123.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.34363649412989616, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 0.009796424768865108, "kl": 0.0008698884230398107, "learning_rate": 2.285714285714286e-06, "loss": -0.0034, "num_tokens": 570174.0, "reward": 2.8205642700195312, "reward_std": 4.669835090637207, "rewards/_dispatch_reward/mean": 2.8205642700195312, "rewards/_dispatch_reward/std": 4.669835090637207, "sampling/importance_sampling_ratio/max": 1.1795650720596313, "sampling/importance_sampling_ratio/mean": 0.605711042881012, "sampling/importance_sampling_ratio/min": 6.102543557062745e-05, "sampling/sampling_logp_difference/max": 0.43030834197998047, "sampling/sampling_logp_difference/mean": 0.016010664403438568, "step": 9, "step_time": 68.84652510398882 }, { "clip_ratio/high_max": 0.0027637388557195663, "clip_ratio/high_mean": 0.0013818694278597832, "clip_ratio/low_mean": 0.0009210727730533108, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002302942215465009, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 100.75, "completions/mean_terminated_length": 100.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3151314351707697, "epoch": 0.0002, "frac_reward_zero_std": 0.0, "grad_norm": 0.009586871601641178, "kl": 0.0016707113827578723, "learning_rate": 2.571428571428571e-06, "loss": -0.0005, "num_tokens": 645301.0, "reward": 3.475369930267334, "reward_std": 4.783686637878418, "rewards/_dispatch_reward/mean": 3.475369930267334, "rewards/_dispatch_reward/std": 4.783686637878418, "sampling/importance_sampling_ratio/max": 1.1692392826080322, "sampling/importance_sampling_ratio/mean": 0.6406182050704956, "sampling/importance_sampling_ratio/min": 0.02848934382200241, "sampling/sampling_logp_difference/max": 0.508894681930542, "sampling/sampling_logp_difference/mean": 0.018476907163858414, "step": 10, "step_time": 70.24923908900018 }, { "clip_ratio/high_max": 0.0052745313150808215, "clip_ratio/high_mean": 0.0026372656575404108, "clip_ratio/low_mean": 0.001209677429869771, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038469430874101818, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 106.84375, "completions/mean_terminated_length": 106.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3057017717510462, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 0.017719965428113937, "kl": 0.0021114282753842417, "learning_rate": 2.8571428571428573e-06, "loss": -0.0001, "num_tokens": 716206.0, "reward": 2.0035197734832764, "reward_std": 4.043484687805176, "rewards/_dispatch_reward/mean": 2.0035197734832764, "rewards/_dispatch_reward/std": 4.043484210968018, "sampling/importance_sampling_ratio/max": 1.0895683765411377, "sampling/importance_sampling_ratio/mean": 0.6331280469894409, "sampling/importance_sampling_ratio/min": 0.0012601253110915422, "sampling/sampling_logp_difference/max": 0.6603082418441772, "sampling/sampling_logp_difference/mean": 0.01621483638882637, "step": 11, "step_time": 59.28811028199925 }, { "clip_ratio/high_max": 0.0028494500438682735, "clip_ratio/high_mean": 0.0014247250219341367, "clip_ratio/low_mean": 0.0008933765202527866, "clip_ratio/low_min": 0.00016066837997641414, "clip_ratio/region_mean": 0.0023181015421869233, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6472973525524139, "epoch": 0.00024, "frac_reward_zero_std": 0.0, "grad_norm": 0.008480154909193516, "kl": 0.0010127535788342357, "learning_rate": 3.142857142857143e-06, "loss": -0.0009, "num_tokens": 776411.0, "reward": 4.977667808532715, "reward_std": 4.607911109924316, "rewards/_dispatch_reward/mean": 4.977667808532715, "rewards/_dispatch_reward/std": 4.607911109924316, "sampling/importance_sampling_ratio/max": 1.0007600784301758, "sampling/importance_sampling_ratio/mean": 0.34274715185165405, "sampling/importance_sampling_ratio/min": 2.605641702402295e-09, "sampling/sampling_logp_difference/max": 0.562023401260376, "sampling/sampling_logp_difference/mean": 0.02857815846800804, "step": 12, "step_time": 78.00761527100258 }, { "clip_ratio/high_max": 0.004665051761548966, "clip_ratio/high_mean": 0.002332525880774483, "clip_ratio/low_mean": 0.01103231159504503, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013364837534027174, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 121.75, "completions/mean_terminated_length": 121.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.42481718584895134, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 0.008503826335072517, "kl": 0.006206674090208253, "learning_rate": 3.428571428571429e-06, "loss": -0.0005, "num_tokens": 848645.0, "reward": 3.7999541759490967, "reward_std": 5.260345935821533, "rewards/_dispatch_reward/mean": 3.7999541759490967, "rewards/_dispatch_reward/std": 5.260345458984375, "sampling/importance_sampling_ratio/max": 1.3213924169540405, "sampling/importance_sampling_ratio/mean": 0.7206687331199646, "sampling/importance_sampling_ratio/min": 1.3760912906946032e-06, "sampling/sampling_logp_difference/max": 0.5942957401275635, "sampling/sampling_logp_difference/mean": 0.02436770685017109, "step": 13, "step_time": 65.29833886800043 }, { "clip_ratio/high_max": 0.02439830679213628, "clip_ratio/high_mean": 0.01219915339606814, "clip_ratio/low_mean": 0.01281407053465955, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025013223712448962, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 102.25, "completions/mean_terminated_length": 102.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3535791225731373, "epoch": 0.00028, "frac_reward_zero_std": 0.0, "grad_norm": 0.007466800510883331, "kl": 0.0010526334699534345, "learning_rate": 3.7142857142857146e-06, "loss": -0.003, "num_tokens": 922414.0, "reward": 3.58528208732605, "reward_std": 5.264614582061768, "rewards/_dispatch_reward/mean": 3.58528208732605, "rewards/_dispatch_reward/std": 5.264614582061768, "sampling/importance_sampling_ratio/max": 1.6783454418182373, "sampling/importance_sampling_ratio/mean": 0.6591842770576477, "sampling/importance_sampling_ratio/min": 0.015803860500454903, "sampling/sampling_logp_difference/max": 0.6768133044242859, "sampling/sampling_logp_difference/mean": 0.027988821268081665, "step": 14, "step_time": 76.04676585100606 }, { "clip_ratio/high_max": 0.0046294865605887026, "clip_ratio/high_mean": 0.0031611660815542564, "clip_ratio/low_mean": 0.0009954585111699998, "clip_ratio/low_min": 0.000405844155466184, "clip_ratio/region_mean": 0.004156624592724256, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 184.3125, "completions/mean_terminated_length": 184.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5234995484352112, "epoch": 0.0003, "frac_reward_zero_std": 0.0, "grad_norm": 0.017105605453252792, "kl": 0.001143711997428909, "learning_rate": 4.000000000000001e-06, "loss": 0.0024, "num_tokens": 976803.0, "reward": 4.643593788146973, "reward_std": 3.711594820022583, "rewards/_dispatch_reward/mean": 4.643593788146973, "rewards/_dispatch_reward/std": 3.711594581604004, "sampling/importance_sampling_ratio/max": 1.0112754106521606, "sampling/importance_sampling_ratio/mean": 0.43820053339004517, "sampling/importance_sampling_ratio/min": 0.0061927661299705505, "sampling/sampling_logp_difference/max": 1.0779176950454712, "sampling/sampling_logp_difference/mean": 0.023352596908807755, "step": 15, "step_time": 69.32589143500081 }, { "clip_ratio/high_max": 0.0053526594856521115, "clip_ratio/high_mean": 0.0026763297428260557, "clip_ratio/low_mean": 0.001263205318537075, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0039395351050188765, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 137.4375, "completions/mean_terminated_length": 137.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4956584945321083, "epoch": 0.00032, "frac_reward_zero_std": 0.0, "grad_norm": 0.010012968443334103, "kl": 0.0016766081389505416, "learning_rate": 4.2857142857142855e-06, "loss": 0.0017, "num_tokens": 1027131.0, "reward": 6.254507064819336, "reward_std": 4.845858097076416, "rewards/_dispatch_reward/mean": 6.254507064819336, "rewards/_dispatch_reward/std": 4.845858097076416, "sampling/importance_sampling_ratio/max": 1.2621318101882935, "sampling/importance_sampling_ratio/mean": 0.4852866530418396, "sampling/importance_sampling_ratio/min": 0.0020480025559663773, "sampling/sampling_logp_difference/max": 0.6868143081665039, "sampling/sampling_logp_difference/mean": 0.024414777755737305, "step": 16, "step_time": 56.2167018509972 }, { "clip_ratio/high_max": 0.004321954504121095, "clip_ratio/high_mean": 0.0021609772520605475, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021609772520605475, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 64.6875, "completions/mean_terminated_length": 64.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2991622658446431, "epoch": 0.00034, "frac_reward_zero_std": 0.0, "grad_norm": 0.02793041430413723, "kl": 0.003004775617228006, "learning_rate": 4.571428571428572e-06, "loss": -0.0019, "num_tokens": 1116281.0, "reward": 1.158271074295044, "reward_std": 3.9837613105773926, "rewards/_dispatch_reward/mean": 1.158271074295044, "rewards/_dispatch_reward/std": 3.9837613105773926, "sampling/importance_sampling_ratio/max": 1.7287894487380981, "sampling/importance_sampling_ratio/mean": 0.8597855567932129, "sampling/importance_sampling_ratio/min": 0.00014148977061267942, "sampling/sampling_logp_difference/max": 0.5800625085830688, "sampling/sampling_logp_difference/mean": 0.031770870089530945, "step": 17, "step_time": 45.771620833998895 }, { "clip_ratio/high_max": 0.0039806214335840195, "clip_ratio/high_mean": 0.0019903107167920098, "clip_ratio/low_mean": 0.013759620025666663, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015749930713354843, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 182.59375, "completions/mean_terminated_length": 182.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5259533487260342, "epoch": 0.00036, "frac_reward_zero_std": 0.0, "grad_norm": 0.008414385840296745, "kl": 0.006829695048509166, "learning_rate": 4.857142857142858e-06, "loss": -0.0015, "num_tokens": 1171273.0, "reward": 5.743894100189209, "reward_std": 4.690041542053223, "rewards/_dispatch_reward/mean": 5.743894100189209, "rewards/_dispatch_reward/std": 4.690041542053223, "sampling/importance_sampling_ratio/max": 1.9703744649887085, "sampling/importance_sampling_ratio/mean": 0.46085572242736816, "sampling/importance_sampling_ratio/min": 1.293967510918037e-08, "sampling/sampling_logp_difference/max": 0.6781671047210693, "sampling/sampling_logp_difference/mean": 0.024906471371650696, "step": 18, "step_time": 61.14139282900214 }, { "clip_ratio/high_max": 0.0030382057011593133, "clip_ratio/high_mean": 0.00191759922017809, "clip_ratio/low_mean": 0.0008816451372695155, "clip_ratio/low_min": 0.0002367424312978983, "clip_ratio/region_mean": 0.002799244350171648, "completions/clipped_ratio": 0.0, "completions/max_length": 1660.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.602867329493165, "epoch": 0.00038, "frac_reward_zero_std": 0.0, "grad_norm": 0.004653717391192913, "kl": 0.0013299644269864075, "learning_rate": 5.142857142857142e-06, "loss": 0.0004, "num_tokens": 1252413.0, "reward": 2.5815396308898926, "reward_std": 4.349632263183594, "rewards/_dispatch_reward/mean": 2.5815396308898926, "rewards/_dispatch_reward/std": 4.349632740020752, "sampling/importance_sampling_ratio/max": 1.1002501249313354, "sampling/importance_sampling_ratio/mean": 0.555901050567627, "sampling/importance_sampling_ratio/min": 1.2933999649078487e-08, "sampling/sampling_logp_difference/max": 0.869408369064331, "sampling/sampling_logp_difference/mean": 0.02660653553903103, "step": 19, "step_time": 78.29009881900129 }, { "clip_ratio/high_max": 0.0009124087519012392, "clip_ratio/high_mean": 0.0004562043759506196, "clip_ratio/low_mean": 0.0009879888530122116, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014441932289628312, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 73.71875, "completions/mean_terminated_length": 73.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.33102226129267365, "epoch": 0.0004, "frac_reward_zero_std": 0.0, "grad_norm": 0.029436543583869934, "kl": 0.0005448539421308851, "learning_rate": 5.428571428571429e-06, "loss": -0.009, "num_tokens": 1351826.0, "reward": 0.3658851385116577, "reward_std": 2.878241539001465, "rewards/_dispatch_reward/mean": 0.3658851385116577, "rewards/_dispatch_reward/std": 2.878241539001465, "sampling/importance_sampling_ratio/max": 1.82212495803833, "sampling/importance_sampling_ratio/mean": 0.813861072063446, "sampling/importance_sampling_ratio/min": 6.510071398224682e-05, "sampling/sampling_logp_difference/max": 0.550661563873291, "sampling/sampling_logp_difference/mean": 0.015052185393869877, "step": 20, "step_time": 56.76346912401641 }, { "clip_ratio/high_max": 0.005187851551454514, "clip_ratio/high_mean": 0.002695058472454548, "clip_ratio/low_mean": 0.00019778481510002166, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028928432147949934, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 108.03125, "completions/mean_terminated_length": 108.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.37717200443148613, "epoch": 0.00042, "frac_reward_zero_std": 0.0, "grad_norm": 0.009999923408031464, "kl": 0.000724673747754423, "learning_rate": 5.7142857142857145e-06, "loss": -0.0028, "num_tokens": 1436552.0, "reward": 2.89748477935791, "reward_std": 4.333184719085693, "rewards/_dispatch_reward/mean": 2.89748477935791, "rewards/_dispatch_reward/std": 4.333184719085693, "sampling/importance_sampling_ratio/max": 1.1284852027893066, "sampling/importance_sampling_ratio/mean": 0.6845414638519287, "sampling/importance_sampling_ratio/min": 0.00010813102562678978, "sampling/sampling_logp_difference/max": 0.4671330451965332, "sampling/sampling_logp_difference/mean": 0.023374546319246292, "step": 21, "step_time": 54.53796494498965 }, { "clip_ratio/high_max": 0.07083333469927311, "clip_ratio/high_mean": 0.035416667349636555, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04583333432674408, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07059835561085492, "epoch": 0.00044, "frac_reward_zero_std": 0.0, "grad_norm": 0.002399139106273651, "kl": 0.0038920448471913005, "learning_rate": 6e-06, "loss": 0.0, "num_tokens": 1575800.0, "reward": -1.2599544525146484, "reward_std": 0.19062143564224243, "rewards/_dispatch_reward/mean": -1.2599544525146484, "rewards/_dispatch_reward/std": 0.19062143564224243, "sampling/importance_sampling_ratio/max": 1.1356183290481567, "sampling/importance_sampling_ratio/mean": 0.9610987901687622, "sampling/importance_sampling_ratio/min": 0.41064921021461487, "sampling/sampling_logp_difference/max": 0.6516157388687134, "sampling/sampling_logp_difference/mean": 0.024187467992305756, "step": 22, "step_time": 46.77752613399207 }, { "clip_ratio/high_max": 0.005154712242074311, "clip_ratio/high_mean": 0.0035003781085833907, "clip_ratio/low_mean": 0.0005952381179668009, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004095616226550192, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 105.1875, "completions/mean_terminated_length": 105.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.410700709791854, "epoch": 0.00046, "frac_reward_zero_std": 0.0, "grad_norm": 0.014338094741106033, "kl": 0.0008683733431098517, "learning_rate": 6.285714285714286e-06, "loss": -0.0041, "num_tokens": 1671271.0, "reward": 3.2939820289611816, "reward_std": 4.406397342681885, "rewards/_dispatch_reward/mean": 3.2939820289611816, "rewards/_dispatch_reward/std": 4.406397342681885, "sampling/importance_sampling_ratio/max": 1.676399827003479, "sampling/importance_sampling_ratio/mean": 0.6958481073379517, "sampling/importance_sampling_ratio/min": 0.01673131436109543, "sampling/sampling_logp_difference/max": 0.9975869655609131, "sampling/sampling_logp_difference/mean": 0.022911131381988525, "step": 23, "step_time": 63.07633655099198 }, { "clip_ratio/high_max": 0.005110417871037498, "clip_ratio/high_mean": 0.002555208935518749, "clip_ratio/low_mean": 0.0005854800692759454, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031406890047946945, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 92.78125, "completions/mean_terminated_length": 92.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.28824638947844505, "epoch": 0.00048, "frac_reward_zero_std": 0.0, "grad_norm": 0.013156220316886902, "kl": 0.0007490101343137212, "learning_rate": 6.571428571428572e-06, "loss": -0.0012, "num_tokens": 1770738.0, "reward": 2.9093494415283203, "reward_std": 4.354193210601807, "rewards/_dispatch_reward/mean": 2.9093494415283203, "rewards/_dispatch_reward/std": 4.354193210601807, "sampling/importance_sampling_ratio/max": 1.6265262365341187, "sampling/importance_sampling_ratio/mean": 0.6805465221405029, "sampling/importance_sampling_ratio/min": 0.0026759125757962465, "sampling/sampling_logp_difference/max": 0.4296213388442993, "sampling/sampling_logp_difference/mean": 0.012809459120035172, "step": 24, "step_time": 67.15237878700282 }, { "clip_ratio/high_max": 0.007100892311427742, "clip_ratio/high_mean": 0.004247377044521272, "clip_ratio/low_mean": 0.0008122937870211899, "clip_ratio/low_min": 0.0005247487570159137, "clip_ratio/region_mean": 0.005059670831542462, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 178.03125, "completions/mean_terminated_length": 178.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.583958063274622, "epoch": 0.0005, "frac_reward_zero_std": 0.0, "grad_norm": 0.009483549743890762, "kl": 0.00117611356836278, "learning_rate": 6.857142857142858e-06, "loss": -0.0058, "num_tokens": 1843514.0, "reward": 5.987891674041748, "reward_std": 4.206455230712891, "rewards/_dispatch_reward/mean": 5.987891674041748, "rewards/_dispatch_reward/std": 4.206455230712891, "sampling/importance_sampling_ratio/max": 1.2595504522323608, "sampling/importance_sampling_ratio/mean": 0.4059632420539856, "sampling/importance_sampling_ratio/min": 1.8272161241839058e-06, "sampling/sampling_logp_difference/max": 0.9262754917144775, "sampling/sampling_logp_difference/mean": 0.02728903479874134, "step": 25, "step_time": 72.96448353200685 }, { "clip_ratio/high_max": 0.006182103767059743, "clip_ratio/high_mean": 0.0037271984620019794, "clip_ratio/low_mean": 0.0003773117423406802, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004104510211618617, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 235.71875, "completions/mean_terminated_length": 235.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5523776076734066, "epoch": 0.00052, "frac_reward_zero_std": 0.0, "grad_norm": 0.007155085913836956, "kl": 0.0011198167048860341, "learning_rate": 7.1428571428571436e-06, "loss": -0.0005, "num_tokens": 1920482.0, "reward": 5.081435680389404, "reward_std": 4.353846073150635, "rewards/_dispatch_reward/mean": 5.081435680389404, "rewards/_dispatch_reward/std": 4.353846073150635, "sampling/importance_sampling_ratio/max": 1.0659159421920776, "sampling/importance_sampling_ratio/mean": 0.38374269008636475, "sampling/importance_sampling_ratio/min": 6.281219468162735e-12, "sampling/sampling_logp_difference/max": 0.8278406858444214, "sampling/sampling_logp_difference/mean": 0.027728049084544182, "step": 26, "step_time": 98.4568334949945 }, { "clip_ratio/high_max": 0.008924347494030371, "clip_ratio/high_mean": 0.004462173747015186, "clip_ratio/low_mean": 0.0015605021035298705, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006022675821441226, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 85.5625, "completions/mean_terminated_length": 85.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.33285857360169757, "epoch": 0.00054, "frac_reward_zero_std": 0.0, "grad_norm": 0.010880419053137302, "kl": 0.0011197539679415058, "learning_rate": 7.428571428571429e-06, "loss": 0.0032, "num_tokens": 2024608.0, "reward": 3.6858019828796387, "reward_std": 5.164435386657715, "rewards/_dispatch_reward/mean": 3.6858019828796387, "rewards/_dispatch_reward/std": 5.164435386657715, "sampling/importance_sampling_ratio/max": 1.2996236085891724, "sampling/importance_sampling_ratio/mean": 0.732099175453186, "sampling/importance_sampling_ratio/min": 0.03455425426363945, "sampling/sampling_logp_difference/max": 0.9168803691864014, "sampling/sampling_logp_difference/mean": 0.024052917957305908, "step": 27, "step_time": 58.99345494199952 }, { "clip_ratio/high_max": 0.0029420487117022276, "clip_ratio/high_mean": 0.0014710243558511138, "clip_ratio/low_mean": 0.000155472633196041, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016264969890471548, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 44.65625, "completions/mean_terminated_length": 44.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17506308463089226, "epoch": 0.00056, "frac_reward_zero_std": 0.0, "grad_norm": 0.007396813947707415, "kl": 0.00044269679888486735, "learning_rate": 7.714285714285716e-06, "loss": 0.0008, "num_tokens": 2174423.0, "reward": 1.1599459648132324, "reward_std": 3.9757213592529297, "rewards/_dispatch_reward/mean": 1.1599459648132324, "rewards/_dispatch_reward/std": 3.975721597671509, "sampling/importance_sampling_ratio/max": 1.213560938835144, "sampling/importance_sampling_ratio/mean": 0.8505507707595825, "sampling/importance_sampling_ratio/min": 0.03090948425233364, "sampling/sampling_logp_difference/max": 0.5677609443664551, "sampling/sampling_logp_difference/mean": 0.012984106317162514, "step": 28, "step_time": 68.44782462599687 }, { "clip_ratio/high_max": 0.004619319050107151, "clip_ratio/high_mean": 0.0025808092032093555, "clip_ratio/low_mean": 0.0016296468093059957, "clip_ratio/low_min": 0.00032637076219543815, "clip_ratio/region_mean": 0.004210455983411521, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 186.84375, "completions/mean_terminated_length": 186.84375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5688229724764824, "epoch": 0.00058, "frac_reward_zero_std": 0.0, "grad_norm": 0.018515318632125854, "kl": 0.0013667424791492522, "learning_rate": 8.000000000000001e-06, "loss": -0.004, "num_tokens": 2210913.0, "reward": 7.494377136230469, "reward_std": 1.6056934595108032, "rewards/_dispatch_reward/mean": 7.494377136230469, "rewards/_dispatch_reward/std": 1.6056934595108032, "sampling/importance_sampling_ratio/max": 2.078017473220825, "sampling/importance_sampling_ratio/mean": 0.331025093793869, "sampling/importance_sampling_ratio/min": 0.004665270447731018, "sampling/sampling_logp_difference/max": 0.5339968204498291, "sampling/sampling_logp_difference/mean": 0.023812616243958473, "step": 29, "step_time": 64.19203726600244 }, { "clip_ratio/high_max": 0.026041667093522847, "clip_ratio/high_mean": 0.013467262091580778, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013467262091580778, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 37.59375, "completions/mean_terminated_length": 37.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.183818920282647, "epoch": 0.0006, "frac_reward_zero_std": 0.0, "grad_norm": 0.0071673765778541565, "kl": 0.001944019940860242, "learning_rate": 8.285714285714287e-06, "loss": -0.0012, "num_tokens": 2367899.0, "reward": 1.659142255783081, "reward_std": 5.015908718109131, "rewards/_dispatch_reward/mean": 1.659142255783081, "rewards/_dispatch_reward/std": 5.015908241271973, "sampling/importance_sampling_ratio/max": 1.345532774925232, "sampling/importance_sampling_ratio/mean": 0.8848145604133606, "sampling/importance_sampling_ratio/min": 0.18236808478832245, "sampling/sampling_logp_difference/max": 0.40476560592651367, "sampling/sampling_logp_difference/mean": 0.010478072799742222, "step": 30, "step_time": 69.87089692599693 }, { "clip_ratio/high_max": 0.006121268626884557, "clip_ratio/high_mean": 0.0030606343134422787, "clip_ratio/low_mean": 0.0012486299383454025, "clip_ratio/low_min": 0.0006476683774963021, "clip_ratio/region_mean": 0.004309264237235766, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 164.5625, "completions/mean_terminated_length": 164.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5194896943867207, "epoch": 0.00062, "frac_reward_zero_std": 0.0, "grad_norm": 0.01138492301106453, "kl": 0.0011847462155856192, "learning_rate": 8.571428571428571e-06, "loss": -0.0062, "num_tokens": 2451108.0, "reward": 4.4887261390686035, "reward_std": 3.793579578399658, "rewards/_dispatch_reward/mean": 4.4887261390686035, "rewards/_dispatch_reward/std": 3.7935798168182373, "sampling/importance_sampling_ratio/max": 1.4072153568267822, "sampling/importance_sampling_ratio/mean": 0.44676199555397034, "sampling/importance_sampling_ratio/min": 2.846842835424468e-05, "sampling/sampling_logp_difference/max": 0.8780875205993652, "sampling/sampling_logp_difference/mean": 0.025232184678316116, "step": 31, "step_time": 88.69532321900624 }, { "clip_ratio/high_max": 0.001911135099362582, "clip_ratio/high_mean": 0.000955567549681291, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01137223420664668, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 59.125, "completions/mean_terminated_length": 59.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.24192802239849698, "epoch": 0.00064, "frac_reward_zero_std": 0.0, "grad_norm": 0.004926626570522785, "kl": 0.010344951566121807, "learning_rate": 8.857142857142858e-06, "loss": -0.0018, "num_tokens": 2635061.0, "reward": 0.8189454078674316, "reward_std": 3.5781614780426025, "rewards/_dispatch_reward/mean": 0.8189454078674316, "rewards/_dispatch_reward/std": 3.5781612396240234, "sampling/importance_sampling_ratio/max": 1.4798288345336914, "sampling/importance_sampling_ratio/mean": 0.8191201686859131, "sampling/importance_sampling_ratio/min": 0.011969967745244503, "sampling/sampling_logp_difference/max": 0.9833095073699951, "sampling/sampling_logp_difference/mean": 0.027340415865182877, "step": 32, "step_time": 77.33599497902469 }, { "clip_ratio/high_max": 0.005233386065810919, "clip_ratio/high_mean": 0.003829600813332945, "clip_ratio/low_mean": 0.0018207434732175898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005650344141031383, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 197.6875, "completions/mean_terminated_length": 197.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.47484813444316387, "epoch": 0.00066, "frac_reward_zero_std": 0.0, "grad_norm": 0.009056804701685905, "kl": 0.0013335544645087793, "learning_rate": 9.142857142857144e-06, "loss": -0.0028, "num_tokens": 2719758.0, "reward": 5.25334358215332, "reward_std": 4.086261749267578, "rewards/_dispatch_reward/mean": 5.25334358215332, "rewards/_dispatch_reward/std": 4.086261749267578, "sampling/importance_sampling_ratio/max": 1.2484526634216309, "sampling/importance_sampling_ratio/mean": 0.5199525952339172, "sampling/importance_sampling_ratio/min": 2.673025863153544e-12, "sampling/sampling_logp_difference/max": 0.5819270610809326, "sampling/sampling_logp_difference/mean": 0.026656974107027054, "step": 33, "step_time": 99.14059063902096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0005853353350175894, "epoch": 0.00068, "frac_reward_zero_std": 0.0, "grad_norm": 3.8580616092076525e-05, "kl": 6.113201492796705e-07, "learning_rate": 9.42857142857143e-06, "loss": -0.0, "num_tokens": 2961120.0, "reward": -1.053731918334961, "reward_std": 0.5529842376708984, "rewards/_dispatch_reward/mean": -1.053731918334961, "rewards/_dispatch_reward/std": 0.5529841780662537, "sampling/importance_sampling_ratio/max": 0.999992847442627, "sampling/importance_sampling_ratio/mean": 0.9999217987060547, "sampling/importance_sampling_ratio/min": 0.9980286955833435, "sampling/sampling_logp_difference/max": 0.001961317379027605, "sampling/sampling_logp_difference/mean": 2.823377690219786e-05, "step": 34, "step_time": 82.58755905900762 }, { "clip_ratio/high_max": 0.008132743969326839, "clip_ratio/high_mean": 0.004193663233309053, "clip_ratio/low_mean": 0.0010917649779003114, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005285428196657449, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 130.53125, "completions/mean_terminated_length": 130.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.43125689217959007, "epoch": 0.0007, "frac_reward_zero_std": 0.0, "grad_norm": 0.007139499299228191, "kl": 0.0011299467951175757, "learning_rate": 9.714285714285715e-06, "loss": -0.0014, "num_tokens": 3094548.0, "reward": 2.2313008308410645, "reward_std": 3.8178205490112305, "rewards/_dispatch_reward/mean": 2.2313008308410645, "rewards/_dispatch_reward/std": 3.8178205490112305, "sampling/importance_sampling_ratio/max": 1.6531254053115845, "sampling/importance_sampling_ratio/mean": 0.6061273813247681, "sampling/importance_sampling_ratio/min": 0.0009113638079725206, "sampling/sampling_logp_difference/max": 0.5405864715576172, "sampling/sampling_logp_difference/mean": 0.024779872968792915, "step": 35, "step_time": 89.02141472401127 }, { "clip_ratio/high_max": 0.004321634216466919, "clip_ratio/high_mean": 0.0026319201424485072, "clip_ratio/low_mean": 0.000901452629477717, "clip_ratio/low_min": 0.00013440860493574291, "clip_ratio/region_mean": 0.003533372830133885, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 242.1875, "completions/mean_terminated_length": 242.1875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.6462938487529755, "epoch": 0.00072, "frac_reward_zero_std": 0.0, "grad_norm": 0.007205578498542309, "kl": 0.0016471190610900521, "learning_rate": 1e-05, "loss": -0.0014, "num_tokens": 3134609.0, "reward": 9.145380020141602, "reward_std": 3.903296709060669, "rewards/_dispatch_reward/mean": 9.145380020141602, "rewards/_dispatch_reward/std": 3.903296947479248, "sampling/importance_sampling_ratio/max": 0.6258358955383301, "sampling/importance_sampling_ratio/mean": 0.14388859272003174, "sampling/importance_sampling_ratio/min": 2.2541855742019834e-06, "sampling/sampling_logp_difference/max": 0.8020744323730469, "sampling/sampling_logp_difference/mean": 0.026974039152264595, "step": 36, "step_time": 76.58179467700393 }, { "clip_ratio/high_max": 0.005726933144615032, "clip_ratio/high_mean": 0.003037077702174429, "clip_ratio/low_mean": 0.0002093808216159232, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032464585310663097, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 200.1875, "completions/mean_terminated_length": 200.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5727038569748402, "epoch": 0.00074, "frac_reward_zero_std": 0.0, "grad_norm": 0.02006140537559986, "kl": 0.001625359283934813, "learning_rate": 9.998815709812376e-06, "loss": -0.0058, "num_tokens": 3238256.0, "reward": 6.689815521240234, "reward_std": 5.825323581695557, "rewards/_dispatch_reward/mean": 6.689815521240234, "rewards/_dispatch_reward/std": 5.825323581695557, "sampling/importance_sampling_ratio/max": 1.6268012523651123, "sampling/importance_sampling_ratio/mean": 0.39891862869262695, "sampling/importance_sampling_ratio/min": 1.152023969552829e-06, "sampling/sampling_logp_difference/max": 0.7040293216705322, "sampling/sampling_logp_difference/mean": 0.028268281370401382, "step": 37, "step_time": 109.18746044600266 }, { "clip_ratio/high_max": 0.0074598678620532155, "clip_ratio/high_mean": 0.0037299339310266078, "clip_ratio/low_mean": 0.00028669723542407155, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004016631166450679, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 37.78125, "completions/mean_terminated_length": 37.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17505362825068005, "epoch": 0.00076, "frac_reward_zero_std": 0.0, "grad_norm": 0.007274220697581768, "kl": 0.0007693237591865909, "learning_rate": 9.995263587272567e-06, "loss": -0.0012, "num_tokens": 3419269.0, "reward": 1.2304946184158325, "reward_std": 3.969721794128418, "rewards/_dispatch_reward/mean": 1.2304946184158325, "rewards/_dispatch_reward/std": 3.969721794128418, "sampling/importance_sampling_ratio/max": 1.1863714456558228, "sampling/importance_sampling_ratio/mean": 0.8759454488754272, "sampling/importance_sampling_ratio/min": 0.1944216936826706, "sampling/sampling_logp_difference/max": 0.5156512260437012, "sampling/sampling_logp_difference/mean": 0.012628378346562386, "step": 38, "step_time": 83.71088723700086 }, { "clip_ratio/high_max": 0.0044490770233096555, "clip_ratio/high_mean": 0.0022245385116548277, "clip_ratio/low_mean": 0.00031115004821913317, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002535688523494173, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 279.5625, "completions/mean_terminated_length": 222.51612854003906, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.4930221550166607, "epoch": 0.00078, "frac_reward_zero_std": 0.25, "grad_norm": 0.006831540260463953, "kl": 0.0012955782149219885, "learning_rate": 9.989345875977304e-06, "loss": -0.0025, "num_tokens": 3522270.0, "reward": 4.621129989624023, "reward_std": 3.7266364097595215, "rewards/_dispatch_reward/mean": 4.621129989624023, "rewards/_dispatch_reward/std": 3.7266364097595215, "sampling/importance_sampling_ratio/max": 1.00277578830719, "sampling/importance_sampling_ratio/mean": 0.3822411894798279, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8860452175140381, "sampling/sampling_logp_difference/mean": 0.02683982066810131, "step": 39, "step_time": 115.52355178999278 }, { "clip_ratio/high_max": 0.002568041512859054, "clip_ratio/high_mean": 0.001284020756429527, "clip_ratio/low_mean": 0.0001502403902122751, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001434261146641802, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 138.53125, "completions/mean_terminated_length": 138.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.33654895424842834, "epoch": 0.0008, "frac_reward_zero_std": 0.0, "grad_norm": 0.004731995053589344, "kl": 0.0011872488248627633, "learning_rate": 9.981066313679877e-06, "loss": -0.0039, "num_tokens": 3674673.0, "reward": 3.011484146118164, "reward_std": 4.585468769073486, "rewards/_dispatch_reward/mean": 3.011484146118164, "rewards/_dispatch_reward/std": 4.585468292236328, "sampling/importance_sampling_ratio/max": 1.0014727115631104, "sampling/importance_sampling_ratio/mean": 0.5896348357200623, "sampling/importance_sampling_ratio/min": 7.688206096645445e-05, "sampling/sampling_logp_difference/max": 0.5600206851959229, "sampling/sampling_logp_difference/mean": 0.013642973266541958, "step": 40, "step_time": 124.35661156599963 }, { "clip_ratio/high_max": 0.004234651743900031, "clip_ratio/high_mean": 0.0021173258719500154, "clip_ratio/low_mean": 0.0009653382294345647, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00308266410138458, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 49.09375, "completions/mean_terminated_length": 49.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.22011170587938977, "epoch": 0.00082, "frac_reward_zero_std": 0.0, "grad_norm": 0.006720908917486668, "kl": 0.0008953095966717228, "learning_rate": 9.970430129929293e-06, "loss": 0.0, "num_tokens": 3885396.0, "reward": 0.8941321969032288, "reward_std": 3.445835590362549, "rewards/_dispatch_reward/mean": 0.8941321969032288, "rewards/_dispatch_reward/std": 3.445835828781128, "sampling/importance_sampling_ratio/max": 1.0005685091018677, "sampling/importance_sampling_ratio/mean": 0.8388948440551758, "sampling/importance_sampling_ratio/min": 0.09574428200721741, "sampling/sampling_logp_difference/max": 0.5382180213928223, "sampling/sampling_logp_difference/mean": 0.012245646677911282, "step": 41, "step_time": 92.02000498400594 }, { "clip_ratio/high_max": 0.0019035532604902983, "clip_ratio/high_mean": 0.0009517766302451491, "clip_ratio/low_mean": 0.001191068033222109, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002142844663467258, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 45.8125, "completions/mean_terminated_length": 45.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.18138621716934722, "epoch": 0.00084, "frac_reward_zero_std": 0.0, "grad_norm": 0.00852085743099451, "kl": 0.0006909975757783404, "learning_rate": 9.957444042767179e-06, "loss": -0.0004, "num_tokens": 4092383.0, "reward": 1.431800127029419, "reward_std": 4.547163009643555, "rewards/_dispatch_reward/mean": 1.431800127029419, "rewards/_dispatch_reward/std": 4.547163009643555, "sampling/importance_sampling_ratio/max": 1.0209298133850098, "sampling/importance_sampling_ratio/mean": 0.856784462928772, "sampling/importance_sampling_ratio/min": 0.06062455102801323, "sampling/sampling_logp_difference/max": 0.38622474670410156, "sampling/sampling_logp_difference/mean": 0.011158864945173264, "step": 42, "step_time": 87.67940832000022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0008112995728879469, "epoch": 0.00086, "frac_reward_zero_std": 0.0, "grad_norm": 1.608729871804826e-05, "kl": 8.94069701606881e-09, "learning_rate": 9.942116254484521e-06, "loss": -0.0, "num_tokens": 4377341.0, "reward": -1.2443116903305054, "reward_std": 0.2223791629076004, "rewards/_dispatch_reward/mean": -1.2443116903305054, "rewards/_dispatch_reward/std": 0.2223791480064392, "sampling/importance_sampling_ratio/max": 1.0007485151290894, "sampling/importance_sampling_ratio/mean": 1.0000085830688477, "sampling/importance_sampling_ratio/min": 0.999955415725708, "sampling/sampling_logp_difference/max": 0.0007260828278958797, "sampling/sampling_logp_difference/mean": 1.4709847164340317e-05, "step": 43, "step_time": 99.83407927100052 }, { "clip_ratio/high_max": 0.0033617592125665396, "clip_ratio/high_mean": 0.0016808796062832698, "clip_ratio/low_mean": 0.00013185653369873762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018127361399820074, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 48.59375, "completions/mean_terminated_length": 48.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.18187530830618925, "epoch": 0.00088, "frac_reward_zero_std": 0.0, "grad_norm": 0.008785068057477474, "kl": 0.001074353451258503, "learning_rate": 9.924456446440927e-06, "loss": 0.0, "num_tokens": 4595054.0, "reward": 0.8892784118652344, "reward_std": 3.5477054119110107, "rewards/_dispatch_reward/mean": 0.8892784118652344, "rewards/_dispatch_reward/std": 3.5477051734924316, "sampling/importance_sampling_ratio/max": 0.9999911785125732, "sampling/importance_sampling_ratio/mean": 0.8556162714958191, "sampling/importance_sampling_ratio/min": 0.2585143446922302, "sampling/sampling_logp_difference/max": 0.4261045455932617, "sampling/sampling_logp_difference/mean": 0.011128053069114685, "step": 44, "step_time": 90.33923576699453 }, { "clip_ratio/high_max": 0.005055090092355385, "clip_ratio/high_mean": 0.0025275450461776927, "clip_ratio/low_mean": 0.010712175630033016, "clip_ratio/low_min": 0.0005910165491513908, "clip_ratio/region_mean": 0.013239720676210709, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 98.625, "completions/mean_terminated_length": 98.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2906395886093378, "epoch": 0.0009, "frac_reward_zero_std": 0.0, "grad_norm": 0.007414973806589842, "kl": 0.0019741024589166045, "learning_rate": 9.904475772949665e-06, "loss": -0.0038, "num_tokens": 4752696.0, "reward": 3.3065011501312256, "reward_std": 4.428520679473877, "rewards/_dispatch_reward/mean": 3.3065011501312256, "rewards/_dispatch_reward/std": 4.4285197257995605, "sampling/importance_sampling_ratio/max": 1.4952305555343628, "sampling/importance_sampling_ratio/mean": 0.6812023520469666, "sampling/importance_sampling_ratio/min": 0.004352574236690998, "sampling/sampling_logp_difference/max": 0.5085635185241699, "sampling/sampling_logp_difference/mean": 0.01729891076683998, "step": 45, "step_time": 94.67123420500138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.006393522209691582, "epoch": 0.00092, "frac_reward_zero_std": 0.0, "grad_norm": 0.00017838655912782997, "kl": 0.00011525210235774352, "learning_rate": 9.882186854232367e-06, "loss": -0.0, "num_tokens": 5033021.0, "reward": -1.1816649436950684, "reward_std": 0.19231897592544556, "rewards/_dispatch_reward/mean": -1.1816649436950684, "rewards/_dispatch_reward/std": 0.19231897592544556, "sampling/importance_sampling_ratio/max": 1.0462512969970703, "sampling/importance_sampling_ratio/mean": 1.001424789428711, "sampling/importance_sampling_ratio/min": 0.9997555613517761, "sampling/sampling_logp_difference/max": 0.046176210045814514, "sampling/sampling_logp_difference/mean": 0.0005228003719821572, "step": 46, "step_time": 95.09760556400579 }, { "clip_ratio/high_max": 0.002622804546263069, "clip_ratio/high_mean": 0.0015204323863144964, "clip_ratio/low_mean": 0.0007803065163898282, "clip_ratio/low_min": 0.0005091649945825338, "clip_ratio/region_mean": 0.0023007388808764517, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 204.28125, "completions/mean_terminated_length": 204.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5276281349360943, "epoch": 0.00094, "frac_reward_zero_std": 0.25, "grad_norm": 0.0020519369281828403, "kl": 0.0022789838985772803, "learning_rate": 9.857603768447822e-06, "loss": -0.0, "num_tokens": 5126262.0, "reward": 5.915221214294434, "reward_std": 5.048460483551025, "rewards/_dispatch_reward/mean": 5.915221214294434, "rewards/_dispatch_reward/std": 5.048460006713867, "sampling/importance_sampling_ratio/max": 1.0008785724639893, "sampling/importance_sampling_ratio/mean": 0.3696955740451813, "sampling/importance_sampling_ratio/min": 9.062721240127303e-09, "sampling/sampling_logp_difference/max": 0.7791495323181152, "sampling/sampling_logp_difference/mean": 0.02858079969882965, "step": 47, "step_time": 101.12222818900045 }, { "clip_ratio/high_max": 0.0029862732626497746, "clip_ratio/high_mean": 0.0016634363564662635, "clip_ratio/low_mean": 0.0008538897527614608, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025173261019517668, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 175.1875, "completions/mean_terminated_length": 114.7741928100586, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.46684860810637474, "epoch": 0.00096, "frac_reward_zero_std": 0.0, "grad_norm": 0.026494067162275314, "kl": 0.0018744520202744752, "learning_rate": 9.830742042799913e-06, "loss": -0.0039, "num_tokens": 5286798.0, "reward": 2.8440439701080322, "reward_std": 4.178732872009277, "rewards/_dispatch_reward/mean": 2.8440439701080322, "rewards/_dispatch_reward/std": 4.178732872009277, "sampling/importance_sampling_ratio/max": 1.0188325643539429, "sampling/importance_sampling_ratio/mean": 0.6148038506507874, "sampling/importance_sampling_ratio/min": 1.9325112621260132e-09, "sampling/sampling_logp_difference/max": 0.4544100761413574, "sampling/sampling_logp_difference/mean": 0.022511716932058334, "step": 48, "step_time": 122.34452678399975 }, { "clip_ratio/high_max": 0.004316396691137925, "clip_ratio/high_mean": 0.0037547791725955904, "clip_ratio/low_mean": 0.0005475876678247005, "clip_ratio/low_min": 0.00035511364694684744, "clip_ratio/region_mean": 0.004302366898627952, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 143.21875, "completions/mean_terminated_length": 143.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.47758977860212326, "epoch": 0.00098, "frac_reward_zero_std": 0.0, "grad_norm": 0.01684115268290043, "kl": 0.002730547290411778, "learning_rate": 9.801618643730292e-06, "loss": -0.0002, "num_tokens": 5387272.0, "reward": 7.64976692199707, "reward_std": 5.87539529800415, "rewards/_dispatch_reward/mean": 7.64976692199707, "rewards/_dispatch_reward/std": 5.87539529800415, "sampling/importance_sampling_ratio/max": 1.6533435583114624, "sampling/importance_sampling_ratio/mean": 0.514319121837616, "sampling/importance_sampling_ratio/min": 0.0009031257359310985, "sampling/sampling_logp_difference/max": 1.2167229652404785, "sampling/sampling_logp_difference/mean": 0.026262516155838966, "step": 49, "step_time": 91.79330294199463 }, { "clip_ratio/high_max": 0.006119212484918535, "clip_ratio/high_mean": 0.004732819041237235, "clip_ratio/low_mean": 0.0008056239748839289, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0055384429870173335, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 87.78125, "completions/mean_terminated_length": 87.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3011600449681282, "epoch": 0.001, "frac_reward_zero_std": 0.0, "grad_norm": 0.01269652508199215, "kl": 0.0032866905676200986, "learning_rate": 9.770251966202029e-06, "loss": -0.0006, "num_tokens": 5541768.0, "reward": 3.141866683959961, "reward_std": 4.4505181312561035, "rewards/_dispatch_reward/mean": 3.141866683959961, "rewards/_dispatch_reward/std": 4.4505181312561035, "sampling/importance_sampling_ratio/max": 1.649505853652954, "sampling/importance_sampling_ratio/mean": 0.7044443488121033, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5605548620223999, "sampling/sampling_logp_difference/mean": 0.021160298958420753, "step": 50, "step_time": 98.10317065600248 }, { "clip_ratio/high_max": 0.010189392720349133, "clip_ratio/high_mean": 0.006060676998458803, "clip_ratio/low_mean": 0.0006750412285327911, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006735718212439679, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 139.96875, "completions/mean_terminated_length": 139.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.442113908007741, "epoch": 0.00102, "frac_reward_zero_std": 0.0, "grad_norm": 0.013396586291491985, "kl": 0.0034333480580244213, "learning_rate": 9.736661822080943e-06, "loss": -0.0064, "num_tokens": 5627206.0, "reward": 5.226400852203369, "reward_std": 3.826179027557373, "rewards/_dispatch_reward/mean": 5.226400852203369, "rewards/_dispatch_reward/std": 3.826178789138794, "sampling/importance_sampling_ratio/max": 1.497654676437378, "sampling/importance_sampling_ratio/mean": 0.5884408950805664, "sampling/importance_sampling_ratio/min": 0.004016702529042959, "sampling/sampling_logp_difference/max": 1.028564691543579, "sampling/sampling_logp_difference/mean": 0.022922225296497345, "step": 51, "step_time": 86.19129810099548 }, { "clip_ratio/high_max": 0.006017926352797076, "clip_ratio/high_mean": 0.0040219257498392835, "clip_ratio/low_mean": 0.000564306574233342, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004586232345900498, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 205.0625, "completions/mean_terminated_length": 145.61289978027344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5015206560492516, "epoch": 0.00104, "frac_reward_zero_std": 0.0, "grad_norm": 0.013659187592566013, "kl": 0.002838586748111993, "learning_rate": 9.700869427622014e-06, "loss": -0.0009, "num_tokens": 5733605.0, "reward": 6.685307502746582, "reward_std": 5.804135799407959, "rewards/_dispatch_reward/mean": 6.685307502746582, "rewards/_dispatch_reward/std": 5.804135322570801, "sampling/importance_sampling_ratio/max": 1.0001060962677002, "sampling/importance_sampling_ratio/mean": 0.49562621116638184, "sampling/importance_sampling_ratio/min": 3.8747170124079266e-16, "sampling/sampling_logp_difference/max": 0.5828416347503662, "sampling/sampling_logp_difference/mean": 0.023624923080205917, "step": 52, "step_time": 113.47440205099701 }, { "clip_ratio/high_max": 0.003565533785149455, "clip_ratio/high_mean": 0.002294554462423548, "clip_ratio/low_mean": 0.0002680120160221122, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025625665148254484, "completions/clipped_ratio": 0.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 298.15625, "completions/mean_terminated_length": 298.15625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.6001667827367783, "epoch": 0.00106, "frac_reward_zero_std": 0.25, "grad_norm": 0.006183857098221779, "kl": 0.0034337107208557427, "learning_rate": 9.662897390068735e-06, "loss": -0.0012, "num_tokens": 5776086.0, "reward": 9.917367935180664, "reward_std": 3.750269651412964, "rewards/_dispatch_reward/mean": 9.917367935180664, "rewards/_dispatch_reward/std": 3.7502694129943848, "sampling/importance_sampling_ratio/max": 0.7825208902359009, "sampling/importance_sampling_ratio/mean": 0.22198987007141113, "sampling/importance_sampling_ratio/min": 2.639378380566923e-14, "sampling/sampling_logp_difference/max": 0.6501812934875488, "sampling/sampling_logp_difference/mean": 0.02595674991607666, "step": 53, "step_time": 92.85906081800931 }, { "clip_ratio/high_max": 0.0028834628174081445, "clip_ratio/high_mean": 0.0014417314087040722, "clip_ratio/low_mean": 0.0007313800670090131, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021731114975409582, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 281.71875, "completions/mean_terminated_length": 281.71875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.6317033842206001, "epoch": 0.00108, "frac_reward_zero_std": 0.25, "grad_norm": 0.008015364408493042, "kl": 0.004350849922047928, "learning_rate": 9.622769693373892e-06, "loss": 0.0, "num_tokens": 5815865.0, "reward": 6.637714862823486, "reward_std": 1.3837485313415527, "rewards/_dispatch_reward/mean": 6.637714862823486, "rewards/_dispatch_reward/std": 1.3837485313415527, "sampling/importance_sampling_ratio/max": 0.5636690258979797, "sampling/importance_sampling_ratio/mean": 0.1337074339389801, "sampling/importance_sampling_ratio/min": 1.958301396598472e-07, "sampling/sampling_logp_difference/max": 1.2753636837005615, "sampling/sampling_logp_difference/mean": 0.027083205059170723, "step": 54, "step_time": 70.00975042999198 }, { "clip_ratio/high_max": 0.00682789774145931, "clip_ratio/high_mean": 0.003413948870729655, "clip_ratio/low_mean": 0.0004340277810115367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038479766517411917, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 102.90625, "completions/mean_terminated_length": 102.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.39303808473050594, "epoch": 0.0011, "frac_reward_zero_std": 0.0, "grad_norm": 0.007574125658720732, "kl": 0.00406463113904465, "learning_rate": 9.580511683050793e-06, "loss": -0.0003, "num_tokens": 5976662.0, "reward": 3.517942428588867, "reward_std": 4.828723430633545, "rewards/_dispatch_reward/mean": 3.517942428588867, "rewards/_dispatch_reward/std": 4.828723907470703, "sampling/importance_sampling_ratio/max": 0.9999936819076538, "sampling/importance_sampling_ratio/mean": 0.5975947380065918, "sampling/importance_sampling_ratio/min": 0.006443084683269262, "sampling/sampling_logp_difference/max": 1.088891625404358, "sampling/sampling_logp_difference/mean": 0.024115335196256638, "step": 55, "step_time": 89.8712716250011 }, { "clip_ratio/high_max": 0.002959180681500584, "clip_ratio/high_mean": 0.001479590340750292, "clip_ratio/low_mean": 0.001389213364745956, "clip_ratio/low_min": 0.0003709198790602386, "clip_ratio/region_mean": 0.0028688037127722055, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 142.5625, "completions/mean_terminated_length": 142.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5603683553636074, "epoch": 0.00112, "frac_reward_zero_std": 0.0, "grad_norm": 0.006758315023034811, "kl": 0.002921493010944687, "learning_rate": 9.536150050164488e-06, "loss": -0.0016, "num_tokens": 6141561.0, "reward": 2.3384432792663574, "reward_std": 3.822960376739502, "rewards/_dispatch_reward/mean": 2.3384432792663574, "rewards/_dispatch_reward/std": 3.822960376739502, "sampling/importance_sampling_ratio/max": 1.0247381925582886, "sampling/importance_sampling_ratio/mean": 0.5408236980438232, "sampling/importance_sampling_ratio/min": 2.8452670903789112e-06, "sampling/sampling_logp_difference/max": 0.9780905246734619, "sampling/sampling_logp_difference/mean": 0.028260692954063416, "step": 56, "step_time": 100.65156877000118 } ], "logging_steps": 1.0, "max_steps": 160, "num_input_tokens_seen": 6141561, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }