{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01425, "eval_steps": 500, "global_step": 1425, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 2.3125, "completions/mean_terminated_length": 2.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.804141163825989, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.021086419001221657, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0004, "num_tokens": 39077.0, "reward": -0.34472769498825073, "reward_std": 0.5002915263175964, "rewards/rollout_reward_func/mean": -0.34472769498825073, "rewards/rollout_reward_func/std": 0.8051971793174744, "sampling/importance_sampling_ratio/max": 0.04277191311120987, "sampling/importance_sampling_ratio/mean": 0.025814546272158623, "sampling/importance_sampling_ratio/min": 9.891788067761809e-06, "sampling/sampling_logp_difference/max": 3.0097832679748535, "sampling/sampling_logp_difference/mean": 1.7041127681732178, "step": 1, "step_time": 6.8686070639960235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.804141163825989, "epoch": 2e-05, "grad_norm": 0.02090727910399437, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0004, "step": 2, "step_time": 3.2826551260004635 }, { "clip_ratio/high_max": 0.015224359463900328, "clip_ratio/high_mean": 0.007612179731950164, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007612179731950164, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.59375, "completions/mean_terminated_length": 2.9642858505249023, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.384084284305573, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02722231112420559, "kl": 0.0011425166740082204, "learning_rate": 5.714285714285715e-07, "loss": 0.0, "num_tokens": 79358.0, "reward": -0.12078246474266052, "reward_std": 0.8405250310897827, "rewards/rollout_reward_func/mean": -0.12078246474266052, "rewards/rollout_reward_func/std": 0.902726948261261, "sampling/importance_sampling_ratio/max": 0.04705209285020828, "sampling/importance_sampling_ratio/mean": 0.02143000438809395, "sampling/importance_sampling_ratio/min": 1.208989051508478e-16, "sampling/sampling_logp_difference/max": 4.700081825256348, "sampling/sampling_logp_difference/mean": 1.7616089582443237, "step": 3, "step_time": 6.540354332995776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.395957291126251, "epoch": 4e-05, "grad_norm": 0.026882857084274292, "kl": 0.0009081153402803466, "learning_rate": 8.571428571428572e-07, "loss": 0.0, "step": 4, "step_time": 3.3787786200118717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.03125, "completions/mean_terminated_length": 2.1666667461395264, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.897692441940308, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01995067298412323, "kl": 0.0009780586769920774, "learning_rate": 1.142857142857143e-06, "loss": -0.0004, "num_tokens": 120624.0, "reward": -0.3276440501213074, "reward_std": 0.7164573669433594, "rewards/rollout_reward_func/mean": -0.3276440501213074, "rewards/rollout_reward_func/std": 0.7698000073432922, "sampling/importance_sampling_ratio/max": 0.04114896431565285, "sampling/importance_sampling_ratio/mean": 0.021779228001832962, "sampling/importance_sampling_ratio/min": 1.506796125589141e-13, "sampling/sampling_logp_difference/max": 4.342257499694824, "sampling/sampling_logp_difference/mean": 1.8802902698516846, "step": 5, "step_time": 7.577862517988251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.894662261009216, "epoch": 6e-05, "grad_norm": 0.021226217970252037, "kl": 0.0011208637151867151, "learning_rate": 1.4285714285714286e-06, "loss": -0.0004, "step": 6, "step_time": 3.3525919580060872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 3.71875, "completions/mean_terminated_length": 2.4482758045196533, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.621833205223083, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.014551177620887756, "kl": 0.0009743689370225184, "learning_rate": 1.7142857142857145e-06, "loss": -0.0005, "num_tokens": 160096.0, "reward": -0.13357606530189514, "reward_std": 0.4125376045703888, "rewards/rollout_reward_func/mean": -0.13357606530189514, "rewards/rollout_reward_func/std": 0.7049857974052429, "sampling/importance_sampling_ratio/max": 0.037679728120565414, "sampling/importance_sampling_ratio/mean": 0.020056068897247314, "sampling/importance_sampling_ratio/min": 5.495116758543264e-14, "sampling/sampling_logp_difference/max": 3.643705368041992, "sampling/sampling_logp_difference/mean": 1.766145944595337, "step": 7, "step_time": 6.018518894998124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.61597204208374, "epoch": 8e-05, "grad_norm": 0.014140527695417404, "kl": 0.0011069410320487805, "learning_rate": 2.0000000000000003e-06, "loss": -0.0005, "step": 8, "step_time": 3.0815371319986298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 4.1875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.541855096817017, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.014159276150166988, "kl": 0.0006379802453011507, "learning_rate": 2.285714285714286e-06, "loss": -0.0005, "num_tokens": 201510.0, "reward": -0.17673549056053162, "reward_std": 0.12811192870140076, "rewards/rollout_reward_func/mean": -0.17673549056053162, "rewards/rollout_reward_func/std": 0.5169209837913513, "sampling/importance_sampling_ratio/max": 0.04576457664370537, "sampling/importance_sampling_ratio/mean": 0.021619606763124466, "sampling/importance_sampling_ratio/min": 8.835530397599545e-12, "sampling/sampling_logp_difference/max": 3.463005304336548, "sampling/sampling_logp_difference/mean": 1.529081106185913, "step": 9, "step_time": 6.0105151389943785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 8.55346429347992, "epoch": 0.0001, "grad_norm": 0.01377441268414259, "kl": 0.0012424856176949106, "learning_rate": 2.571428571428571e-06, "loss": -0.0005, "step": 10, "step_time": 4.179794515002868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.2580645084381104, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.501883625984192, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.017675049602985382, "kl": 0.002578977117082104, "learning_rate": 2.8571428571428573e-06, "loss": -0.0002, "num_tokens": 241495.0, "reward": -0.3304590880870819, "reward_std": 0.7891688942909241, "rewards/rollout_reward_func/mean": -0.3304590880870819, "rewards/rollout_reward_func/std": 0.9192269444465637, "sampling/importance_sampling_ratio/max": 0.042586781084537506, "sampling/importance_sampling_ratio/mean": 0.024834325537085533, "sampling/importance_sampling_ratio/min": 1.4227766187424606e-10, "sampling/sampling_logp_difference/max": 2.9632835388183594, "sampling/sampling_logp_difference/mean": 1.667797565460205, "step": 11, "step_time": 5.877453792003507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.493280053138733, "epoch": 0.00012, "grad_norm": 0.018936991691589355, "kl": 0.0035525854327715933, "learning_rate": 3.142857142857143e-06, "loss": -0.0002, "step": 12, "step_time": 3.1586806250124937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.15625, "completions/mean_terminated_length": 2.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.86107325553894, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 0.030837446451187134, "kl": 0.0033286443213000894, "learning_rate": 3.428571428571429e-06, "loss": -0.0006, "num_tokens": 277779.0, "reward": -0.27611443400382996, "reward_std": 0.9773558378219604, "rewards/rollout_reward_func/mean": -0.27611443400382996, "rewards/rollout_reward_func/std": 1.0310955047607422, "sampling/importance_sampling_ratio/max": 0.05434391275048256, "sampling/importance_sampling_ratio/mean": 0.0269374568015337, "sampling/importance_sampling_ratio/min": 1.2384259662212571e-06, "sampling/sampling_logp_difference/max": 3.4077227115631104, "sampling/sampling_logp_difference/mean": 1.8426249027252197, "step": 13, "step_time": 5.615947227001016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.837531089782715, "epoch": 0.00014, "grad_norm": 0.029717400670051575, "kl": 0.004254205559846014, "learning_rate": 3.7142857142857146e-06, "loss": -0.0006, "step": 14, "step_time": 3.049010601993359 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 2.375, "completions/mean_terminated_length": 2.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.60869961977005, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.01971513219177723, "kl": 0.015398650721181184, "learning_rate": 4.000000000000001e-06, "loss": -0.0003, "num_tokens": 318880.0, "reward": -0.3409580588340759, "reward_std": 0.39791858196258545, "rewards/rollout_reward_func/mean": -0.3409580588340759, "rewards/rollout_reward_func/std": 0.6122872233390808, "sampling/importance_sampling_ratio/max": 0.04986758902668953, "sampling/importance_sampling_ratio/mean": 0.02900122106075287, "sampling/importance_sampling_ratio/min": 0.00015352272021118551, "sampling/sampling_logp_difference/max": 2.4108948707580566, "sampling/sampling_logp_difference/mean": 1.6273603439331055, "step": 15, "step_time": 5.734060158021748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.582240581512451, "epoch": 0.00016, "grad_norm": 0.019075050950050354, "kl": 0.02632860589073971, "learning_rate": 4.2857142857142855e-06, "loss": -0.0003, "step": 16, "step_time": 4.087025551998522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.387096643447876, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.473009467124939, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.02019246481359005, "kl": 0.031145804678089917, "learning_rate": 4.571428571428572e-06, "loss": -0.0002, "num_tokens": 361652.0, "reward": -0.15312647819519043, "reward_std": 0.29230499267578125, "rewards/rollout_reward_func/mean": -0.15312647819519043, "rewards/rollout_reward_func/std": 0.5114664435386658, "sampling/importance_sampling_ratio/max": 0.05833182856440544, "sampling/importance_sampling_ratio/mean": 0.03276589512825012, "sampling/importance_sampling_ratio/min": 1.5690637808138397e-12, "sampling/sampling_logp_difference/max": 3.322146415710449, "sampling/sampling_logp_difference/mean": 1.5804604291915894, "step": 17, "step_time": 5.971076883994101 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833348855376, "entropy": 8.4635009765625, "epoch": 0.00018, "grad_norm": 0.023768926039338112, "kl": 0.051918624667450786, "learning_rate": 4.857142857142858e-06, "loss": -0.0002, "step": 18, "step_time": 3.1525027329917066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.517181515693665, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.02663504146039486, "kl": 0.05074083153158426, "learning_rate": 5.142857142857142e-06, "loss": -0.0007, "num_tokens": 402349.0, "reward": -0.0015620067715644836, "reward_std": 0.6003050208091736, "rewards/rollout_reward_func/mean": -0.0015620067715644836, "rewards/rollout_reward_func/std": 0.8002142906188965, "sampling/importance_sampling_ratio/max": 0.07228195667266846, "sampling/importance_sampling_ratio/mean": 0.03649972379207611, "sampling/importance_sampling_ratio/min": 7.433489934705051e-10, "sampling/sampling_logp_difference/max": 3.706282138824463, "sampling/sampling_logp_difference/mean": 1.6122421026229858, "step": 19, "step_time": 5.994059606993687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.459820449352264, "epoch": 0.0002, "grad_norm": 0.028400776907801628, "kl": 0.06513146217912436, "learning_rate": 5.428571428571429e-06, "loss": -0.0008, "step": 20, "step_time": 3.168843407984241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 2.28125, "completions/mean_terminated_length": 2.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.414528727531433, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.041610222309827805, "kl": 0.10625919885933399, "learning_rate": 5.7142857142857145e-06, "loss": -0.0009, "num_tokens": 443305.0, "reward": -0.04175537824630737, "reward_std": 0.5788869857788086, "rewards/rollout_reward_func/mean": -0.04175537824630737, "rewards/rollout_reward_func/std": 0.6744015216827393, "sampling/importance_sampling_ratio/max": 0.08651857078075409, "sampling/importance_sampling_ratio/mean": 0.044956766068935394, "sampling/importance_sampling_ratio/min": 4.773658474732656e-06, "sampling/sampling_logp_difference/max": 2.399306535720825, "sampling/sampling_logp_difference/mean": 1.513183832168579, "step": 21, "step_time": 5.628514475996781 }, { "clip_ratio/high_max": 0.09375, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0516826924867928, "entropy": 8.304786682128906, "epoch": 0.00022, "grad_norm": 0.0393240787088871, "kl": 0.13910233229398727, "learning_rate": 6e-06, "loss": -0.0011, "step": 22, "step_time": 4.0276253200063366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.387096643447876, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.221793234348297, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 0.045835234224796295, "kl": 0.11171833425760269, "learning_rate": 6.285714285714286e-06, "loss": -0.0002, "num_tokens": 484651.0, "reward": -0.2754765450954437, "reward_std": 0.49494460225105286, "rewards/rollout_reward_func/mean": -0.2754765450954437, "rewards/rollout_reward_func/std": 0.5475416779518127, "sampling/importance_sampling_ratio/max": 0.09660900384187698, "sampling/importance_sampling_ratio/mean": 0.0465397983789444, "sampling/importance_sampling_ratio/min": 2.1275207290338827e-11, "sampling/sampling_logp_difference/max": 4.015285015106201, "sampling/sampling_logp_difference/mean": 1.5347381830215454, "step": 23, "step_time": 5.8680630790040595 }, { "clip_ratio/high_max": 0.1631944444961846, "clip_ratio/high_mean": 0.0972222222480923, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0972222222480923, "entropy": 8.092307925224304, "epoch": 0.00024, "grad_norm": 0.03743673115968704, "kl": 0.14567330665886402, "learning_rate": 6.571428571428572e-06, "loss": -0.0003, "step": 24, "step_time": 3.155461798996839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.011805951595306, "epoch": 0.00025, "frac_reward_zero_std": 0.25, "grad_norm": 0.03010385110974312, "kl": 0.1659590769559145, "learning_rate": 6.857142857142858e-06, "loss": -0.0016, "num_tokens": 524938.0, "reward": -0.141236811876297, "reward_std": 0.45568937063217163, "rewards/rollout_reward_func/mean": -0.141236811876297, "rewards/rollout_reward_func/std": 0.7183913588523865, "sampling/importance_sampling_ratio/max": 0.13219918310642242, "sampling/importance_sampling_ratio/mean": 0.06431391835212708, "sampling/importance_sampling_ratio/min": 4.849007141454441e-14, "sampling/sampling_logp_difference/max": 4.452613830566406, "sampling/sampling_logp_difference/mean": 1.5246989727020264, "step": 25, "step_time": 5.8789110200086725 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 7.854423761367798, "epoch": 0.00026, "grad_norm": 0.029288187623023987, "kl": 0.20448201149702072, "learning_rate": 7.1428571428571436e-06, "loss": -0.0018, "step": 26, "step_time": 3.135556111992628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.064516067504883, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.019250452518463, "epoch": 0.00027, "frac_reward_zero_std": 0.25, "grad_norm": 0.024609368294477463, "kl": 0.30220172367990017, "learning_rate": 7.428571428571429e-06, "loss": -0.0023, "num_tokens": 566038.0, "reward": 0.27311062812805176, "reward_std": 0.23113927245140076, "rewards/rollout_reward_func/mean": 0.27311062812805176, "rewards/rollout_reward_func/std": 0.5104420781135559, "sampling/importance_sampling_ratio/max": 0.16034293174743652, "sampling/importance_sampling_ratio/mean": 0.1032928079366684, "sampling/importance_sampling_ratio/min": 4.027131073058854e-09, "sampling/sampling_logp_difference/max": 3.9229788780212402, "sampling/sampling_logp_difference/mean": 1.1824378967285156, "step": 27, "step_time": 6.261046100000385 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.758126735687256, "epoch": 0.00028, "grad_norm": 0.020915290340781212, "kl": 0.33748972974717617, "learning_rate": 7.714285714285716e-06, "loss": -0.0025, "step": 28, "step_time": 3.5864762339988374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.15625, "completions/mean_terminated_length": 2.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.900339066982269, "epoch": 0.00029, "frac_reward_zero_std": 0.25, "grad_norm": 0.06545689702033997, "kl": 0.4023558087646961, "learning_rate": 8.000000000000001e-06, "loss": -0.0001, "num_tokens": 607123.0, "reward": -0.023281961679458618, "reward_std": 0.24928632378578186, "rewards/rollout_reward_func/mean": -0.023281961679458618, "rewards/rollout_reward_func/std": 0.49906328320503235, "sampling/importance_sampling_ratio/max": 0.21181833744049072, "sampling/importance_sampling_ratio/mean": 0.11253634095191956, "sampling/importance_sampling_ratio/min": 7.621605618624017e-05, "sampling/sampling_logp_difference/max": 5.094117641448975, "sampling/sampling_logp_difference/mean": 1.247454047203064, "step": 29, "step_time": 5.690983062995656 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.057291666977107525, "entropy": 6.646870732307434, "epoch": 0.0003, "grad_norm": 0.029126061126589775, "kl": 0.4450713973492384, "learning_rate": 8.285714285714287e-06, "loss": -0.0003, "step": 30, "step_time": 3.067743674990197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.62805300951004, "epoch": 0.00031, "frac_reward_zero_std": 0.25, "grad_norm": 0.038303401321172714, "kl": 0.4778400659561157, "learning_rate": 8.571428571428571e-06, "loss": -0.0023, "num_tokens": 647562.0, "reward": 0.06927576661109924, "reward_std": 0.3969299793243408, "rewards/rollout_reward_func/mean": 0.06927576661109924, "rewards/rollout_reward_func/std": 0.5694372653961182, "sampling/importance_sampling_ratio/max": 0.1987183392047882, "sampling/importance_sampling_ratio/mean": 0.12737762928009033, "sampling/importance_sampling_ratio/min": 8.123014413885521e-12, "sampling/sampling_logp_difference/max": 4.197339057922363, "sampling/sampling_logp_difference/mean": 1.2267866134643555, "step": 31, "step_time": 5.9312318389929715 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 6.443451225757599, "epoch": 0.00032, "grad_norm": 0.04942414537072182, "kl": 0.5627163536846638, "learning_rate": 8.857142857142858e-06, "loss": -0.0024, "step": 32, "step_time": 3.146329178016458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.043189287185669, "epoch": 0.00033, "frac_reward_zero_std": 0.25, "grad_norm": 0.05507216230034828, "kl": 0.55271340534091, "learning_rate": 9.142857142857144e-06, "loss": -0.0013, "num_tokens": 688091.0, "reward": 0.23827168345451355, "reward_std": 0.16493774950504303, "rewards/rollout_reward_func/mean": 0.23827168345451355, "rewards/rollout_reward_func/std": 0.38320621848106384, "sampling/importance_sampling_ratio/max": 0.20761720836162567, "sampling/importance_sampling_ratio/mean": 0.1632969081401825, "sampling/importance_sampling_ratio/min": 1.746756905784963e-10, "sampling/sampling_logp_difference/max": 4.064532279968262, "sampling/sampling_logp_difference/mean": 0.9916760325431824, "step": 33, "step_time": 6.221571026006131 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 5.976642966270447, "epoch": 0.00034, "grad_norm": 0.023765303194522858, "kl": 0.5510763712227345, "learning_rate": 9.42857142857143e-06, "loss": -0.0015, "step": 34, "step_time": 3.059485895006219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.8760687708854675, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 0.041264478117227554, "kl": 0.5403554290533066, "learning_rate": 9.714285714285715e-06, "loss": -0.0043, "num_tokens": 728988.0, "reward": 0.23375797271728516, "reward_std": 0.32435545325279236, "rewards/rollout_reward_func/mean": 0.23375797271728516, "rewards/rollout_reward_func/std": 0.5637208819389343, "sampling/importance_sampling_ratio/max": 0.22456371784210205, "sampling/importance_sampling_ratio/mean": 0.1642601042985916, "sampling/importance_sampling_ratio/min": 2.6190324214914185e-11, "sampling/sampling_logp_difference/max": 4.547166347503662, "sampling/sampling_logp_difference/mean": 1.0391957759857178, "step": 35, "step_time": 5.716201814997476 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.789307236671448, "epoch": 0.00036, "grad_norm": 0.04315108060836792, "kl": 0.5575271435081959, "learning_rate": 1e-05, "loss": -0.0045, "step": 36, "step_time": 3.059362929998315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.319090962409973, "epoch": 0.00037, "frac_reward_zero_std": 0.25, "grad_norm": 0.13476166129112244, "kl": 0.4533018358051777, "learning_rate": 9.9999999995372e-06, "loss": -0.001, "num_tokens": 769521.0, "reward": -0.024775587022304535, "reward_std": 0.37057363986968994, "rewards/rollout_reward_func/mean": -0.024775587022304535, "rewards/rollout_reward_func/std": 0.5002303719520569, "sampling/importance_sampling_ratio/max": 0.2429308444261551, "sampling/importance_sampling_ratio/mean": 0.15311548113822937, "sampling/importance_sampling_ratio/min": 3.798589220949111e-10, "sampling/sampling_logp_difference/max": 3.77895450592041, "sampling/sampling_logp_difference/mean": 1.011993169784546, "step": 37, "step_time": 5.871374611000647 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.06597222201526165, "clip_ratio/low_min": 0.03819444449618459, "clip_ratio/region_mean": 0.09722222201526165, "entropy": 6.291596055030823, "epoch": 0.00038, "grad_norm": 0.042685460299253464, "kl": 0.43803390115499496, "learning_rate": 9.999999998148802e-06, "loss": -0.0013, "step": 38, "step_time": 3.1424877449899213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.5455856919288635, "epoch": 0.00039, "frac_reward_zero_std": 0.25, "grad_norm": 0.14305028319358826, "kl": 0.6039662137627602, "learning_rate": 9.999999995834804e-06, "loss": -0.0002, "num_tokens": 811876.0, "reward": 0.26745641231536865, "reward_std": 0.2287818193435669, "rewards/rollout_reward_func/mean": 0.26745641231536865, "rewards/rollout_reward_func/std": 0.5100818872451782, "sampling/importance_sampling_ratio/max": 0.26534032821655273, "sampling/importance_sampling_ratio/mean": 0.20275409519672394, "sampling/importance_sampling_ratio/min": 4.41740412497893e-05, "sampling/sampling_logp_difference/max": 4.4164299964904785, "sampling/sampling_logp_difference/mean": 0.887386679649353, "step": 39, "step_time": 6.901146268995944 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.0625, "entropy": 5.60874730348587, "epoch": 0.0004, "grad_norm": 0.056196071207523346, "kl": 0.5904825329780579, "learning_rate": 9.999999992595207e-06, "loss": -0.0008, "step": 40, "step_time": 3.161476828994637 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.778358221054077, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 0.04523981735110283, "kl": 0.6498352512717247, "learning_rate": 9.999999988430008e-06, "loss": -0.0041, "num_tokens": 849789.0, "reward": 0.044687576591968536, "reward_std": 0.5121811032295227, "rewards/rollout_reward_func/mean": 0.044687576591968536, "rewards/rollout_reward_func/std": 0.9590844511985779, "sampling/importance_sampling_ratio/max": 0.27668336033821106, "sampling/importance_sampling_ratio/mean": 0.180934339761734, "sampling/importance_sampling_ratio/min": 4.662759673539681e-16, "sampling/sampling_logp_difference/max": 4.924638748168945, "sampling/sampling_logp_difference/mean": 1.1581236124038696, "step": 41, "step_time": 5.5550260479794815 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.050347222248092294, "entropy": 5.807269752025604, "epoch": 0.00042, "grad_norm": 0.04105974733829498, "kl": 0.6048065461218357, "learning_rate": 9.999999983339212e-06, "loss": -0.0045, "step": 42, "step_time": 2.991777622002701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.191920280456543, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.07600059360265732, "kl": 0.5811958387494087, "learning_rate": 9.999999977322818e-06, "loss": -0.0044, "num_tokens": 892275.0, "reward": 0.056730717420578, "reward_std": 0.1333501935005188, "rewards/rollout_reward_func/mean": 0.056730717420578, "rewards/rollout_reward_func/std": 0.24826164543628693, "sampling/importance_sampling_ratio/max": 0.28766462206840515, "sampling/importance_sampling_ratio/mean": 0.1868109107017517, "sampling/importance_sampling_ratio/min": 6.963342662896777e-13, "sampling/sampling_logp_difference/max": 3.752962350845337, "sampling/sampling_logp_difference/mean": 1.0387778282165527, "step": 43, "step_time": 5.8438641329994425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.03125, "entropy": 6.216616988182068, "epoch": 0.00044, "grad_norm": 0.037697311490774155, "kl": 0.6042999289929867, "learning_rate": 9.999999970380822e-06, "loss": -0.0046, "step": 44, "step_time": 3.075816426979145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.861033260822296, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.09335865080356598, "kl": 0.5779753252863884, "learning_rate": 9.999999962513228e-06, "loss": -0.0054, "num_tokens": 930900.0, "reward": 0.45843711495399475, "reward_std": 0.40093672275543213, "rewards/rollout_reward_func/mean": 0.45843711495399475, "rewards/rollout_reward_func/std": 0.6467108726501465, "sampling/importance_sampling_ratio/max": 0.3015332520008087, "sampling/importance_sampling_ratio/mean": 0.2018708735704422, "sampling/importance_sampling_ratio/min": 9.441059045656175e-14, "sampling/sampling_logp_difference/max": 5.076256275177002, "sampling/sampling_logp_difference/mean": 1.1219236850738525, "step": 45, "step_time": 6.720672192008351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.857311010360718, "epoch": 0.00046, "grad_norm": 0.08536399155855179, "kl": 0.5762329325079918, "learning_rate": 9.999999953720035e-06, "loss": -0.0055, "step": 46, "step_time": 3.0702176170016173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.34375, "completions/mean_terminated_length": 2.034482717514038, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.865797162055969, "epoch": 0.00047, "frac_reward_zero_std": 0.25, "grad_norm": 0.04298432543873787, "kl": 0.520963728427887, "learning_rate": 9.99999994400124e-06, "loss": -0.0047, "num_tokens": 969598.0, "reward": 0.3553660213947296, "reward_std": 0.11983068287372589, "rewards/rollout_reward_func/mean": 0.3553660213947296, "rewards/rollout_reward_func/std": 0.477932870388031, "sampling/importance_sampling_ratio/max": 0.3172101378440857, "sampling/importance_sampling_ratio/mean": 0.14877977967262268, "sampling/importance_sampling_ratio/min": 1.6701958826387253e-12, "sampling/sampling_logp_difference/max": 4.571735858917236, "sampling/sampling_logp_difference/mean": 1.2117297649383545, "step": 47, "step_time": 5.833042127997032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.836369216442108, "epoch": 0.00048, "grad_norm": 0.044783275574445724, "kl": 0.5248145125806332, "learning_rate": 9.999999933356848e-06, "loss": -0.0047, "step": 48, "step_time": 3.069777287011675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.278702974319458, "epoch": 0.00049, "frac_reward_zero_std": 0.25, "grad_norm": 0.07153937220573425, "kl": 0.51900071837008, "learning_rate": 9.999999921786855e-06, "loss": -0.0025, "num_tokens": 1010552.0, "reward": 0.3082437515258789, "reward_std": 0.12377764284610748, "rewards/rollout_reward_func/mean": 0.3082437515258789, "rewards/rollout_reward_func/std": 0.52757328748703, "sampling/importance_sampling_ratio/max": 0.33371904492378235, "sampling/importance_sampling_ratio/mean": 0.1966235637664795, "sampling/importance_sampling_ratio/min": 0.05594140663743019, "sampling/sampling_logp_difference/max": 1.6132444143295288, "sampling/sampling_logp_difference/mean": 0.8999418020248413, "step": 49, "step_time": 5.8815070769924205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.298723101615906, "epoch": 0.0005, "grad_norm": 0.06900808960199356, "kl": 0.5155160520225763, "learning_rate": 9.999999909291265e-06, "loss": -0.0027, "step": 50, "step_time": 3.6122408150113188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.407120168209076, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 0.026021085679531097, "kl": 0.44845713302493095, "learning_rate": 9.999999895870075e-06, "loss": -0.0021, "num_tokens": 1052518.0, "reward": 0.06029840558767319, "reward_std": 0.12443101406097412, "rewards/rollout_reward_func/mean": 0.06029840558767319, "rewards/rollout_reward_func/std": 0.24826215207576752, "sampling/importance_sampling_ratio/max": 0.34463322162628174, "sampling/importance_sampling_ratio/mean": 0.19298213720321655, "sampling/importance_sampling_ratio/min": 0.03937874734401703, "sampling/sampling_logp_difference/max": 1.6872153282165527, "sampling/sampling_logp_difference/mean": 0.9757665395736694, "step": 51, "step_time": 6.219884559992352 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 6.44112753868103, "epoch": 0.00052, "grad_norm": 0.025523362681269646, "kl": 0.4441776815801859, "learning_rate": 9.999999881523285e-06, "loss": -0.0021, "step": 52, "step_time": 3.0736984860122902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.345748662948608, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.023250682279467583, "kl": 0.6302292793989182, "learning_rate": 9.999999866250896e-06, "loss": -0.0066, "num_tokens": 1094198.0, "reward": 0.09645789861679077, "reward_std": 0.018091298639774323, "rewards/rollout_reward_func/mean": 0.09645789861679077, "rewards/rollout_reward_func/std": 0.022841643542051315, "sampling/importance_sampling_ratio/max": 0.35545721650123596, "sampling/importance_sampling_ratio/mean": 0.19760020077228546, "sampling/importance_sampling_ratio/min": 2.0737038591823152e-14, "sampling/sampling_logp_difference/max": 4.0369553565979, "sampling/sampling_logp_difference/mean": 1.1401538848876953, "step": 53, "step_time": 5.830423184983374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.352689564228058, "epoch": 0.00054, "grad_norm": 0.022878721356391907, "kl": 0.6268869824707508, "learning_rate": 9.999999850052909e-06, "loss": -0.0067, "step": 54, "step_time": 3.0827113049890613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.2374653816223145, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.04477901756763458, "kl": 0.5707668401300907, "learning_rate": 9.99999983292932e-06, "loss": -0.0064, "num_tokens": 1134072.0, "reward": 0.26849862933158875, "reward_std": 0.23555780947208405, "rewards/rollout_reward_func/mean": 0.26849862933158875, "rewards/rollout_reward_func/std": 0.5129947066307068, "sampling/importance_sampling_ratio/max": 0.36542901396751404, "sampling/importance_sampling_ratio/mean": 0.20561003684997559, "sampling/importance_sampling_ratio/min": 2.778129370994975e-08, "sampling/sampling_logp_difference/max": 3.2634332180023193, "sampling/sampling_logp_difference/mean": 0.9654265642166138, "step": 55, "step_time": 5.724141129008785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.2206525802612305, "epoch": 0.00056, "grad_norm": 0.04258221015334129, "kl": 0.5764110684394836, "learning_rate": 9.999999814880132e-06, "loss": -0.0066, "step": 56, "step_time": 3.9716811739999685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 2.3125, "completions/mean_terminated_length": 2.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.497167527675629, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 0.040904026478528976, "kl": 0.6651644222438335, "learning_rate": 9.999999795905347e-06, "loss": -0.0072, "num_tokens": 1172309.0, "reward": 0.32202214002609253, "reward_std": 0.650565505027771, "rewards/rollout_reward_func/mean": 0.32202214002609253, "rewards/rollout_reward_func/std": 0.874355673789978, "sampling/importance_sampling_ratio/max": 0.37567201256752014, "sampling/importance_sampling_ratio/mean": 0.18809917569160461, "sampling/importance_sampling_ratio/min": 7.830451842494313e-10, "sampling/sampling_logp_difference/max": 3.3472299575805664, "sampling/sampling_logp_difference/mean": 1.1144211292266846, "step": 57, "step_time": 5.592336886984413 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 6.376331448554993, "epoch": 0.00058, "grad_norm": 0.038816433399915695, "kl": 0.7360409870743752, "learning_rate": 9.999999776004962e-06, "loss": -0.0073, "step": 58, "step_time": 2.9854559949963004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 2.71875, "completions/mean_terminated_length": 2.2903225421905518, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.950840592384338, "epoch": 0.00059, "frac_reward_zero_std": 0.5, "grad_norm": 0.01757523976266384, "kl": 0.4347071088850498, "learning_rate": 9.999999755178978e-06, "loss": -0.0042, "num_tokens": 1214976.0, "reward": 0.023588767275214195, "reward_std": 0.1560412049293518, "rewards/rollout_reward_func/mean": 0.023588767275214195, "rewards/rollout_reward_func/std": 0.3107772767543793, "sampling/importance_sampling_ratio/max": 0.39288175106048584, "sampling/importance_sampling_ratio/mean": 0.1911253035068512, "sampling/importance_sampling_ratio/min": 2.9266636314938255e-13, "sampling/sampling_logp_difference/max": 3.7934036254882812, "sampling/sampling_logp_difference/mean": 1.2654043436050415, "step": 59, "step_time": 5.9710896859978675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.868892848491669, "epoch": 0.0006, "grad_norm": 0.01741526462137699, "kl": 0.4515736438333988, "learning_rate": 9.999999733427394e-06, "loss": -0.0043, "step": 60, "step_time": 3.1765272800039384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.702520430088043, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 0.05299898609519005, "kl": 0.7659006118774414, "learning_rate": 9.99999971075021e-06, "loss": -0.0092, "num_tokens": 1255932.0, "reward": -0.06451074779033661, "reward_std": 0.39714038372039795, "rewards/rollout_reward_func/mean": -0.06451074779033661, "rewards/rollout_reward_func/std": 0.4562624394893646, "sampling/importance_sampling_ratio/max": 0.40396296977996826, "sampling/importance_sampling_ratio/mean": 0.24927690625190735, "sampling/importance_sampling_ratio/min": 0.0218962375074625, "sampling/sampling_logp_difference/max": 2.1303176879882812, "sampling/sampling_logp_difference/mean": 0.8662530183792114, "step": 61, "step_time": 6.205053687008331 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 5.655879020690918, "epoch": 0.00062, "grad_norm": 0.059083953499794006, "kl": 0.7784963548183441, "learning_rate": 9.999999687147426e-06, "loss": -0.0094, "step": 62, "step_time": 3.551443771022605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.369392037391663, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 0.051719509065151215, "kl": 0.6785860881209373, "learning_rate": 9.999999662619046e-06, "loss": 0.0046, "num_tokens": 1298316.0, "reward": 0.10170774161815643, "reward_std": 0.01008685678243637, "rewards/rollout_reward_func/mean": 0.10170774161815643, "rewards/rollout_reward_func/std": 0.018231580033898354, "sampling/importance_sampling_ratio/max": 0.4154092073440552, "sampling/importance_sampling_ratio/mean": 0.27922001481056213, "sampling/importance_sampling_ratio/min": 0.047062188386917114, "sampling/sampling_logp_difference/max": 1.5716331005096436, "sampling/sampling_logp_difference/mean": 0.7568771243095398, "step": 63, "step_time": 5.687326924991794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.374613374471664, "epoch": 0.00064, "grad_norm": 0.048700276762247086, "kl": 0.6772401928901672, "learning_rate": 9.999999637165062e-06, "loss": 0.0045, "step": 64, "step_time": 3.046681064995937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.561558723449707, "epoch": 0.00065, "frac_reward_zero_std": 0.25, "grad_norm": 0.04483707994222641, "kl": 0.7031252533197403, "learning_rate": 9.999999610785483e-06, "loss": -0.0028, "num_tokens": 1335501.0, "reward": 0.5848931074142456, "reward_std": 0.03281184285879135, "rewards/rollout_reward_func/mean": 0.5848931074142456, "rewards/rollout_reward_func/std": 0.523490846157074, "sampling/importance_sampling_ratio/max": 0.42360442876815796, "sampling/importance_sampling_ratio/mean": 0.2568522095680237, "sampling/importance_sampling_ratio/min": 3.3601756359996626e-13, "sampling/sampling_logp_difference/max": 4.465012073516846, "sampling/sampling_logp_difference/mean": 1.0358225107192993, "step": 65, "step_time": 5.795072059991071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.514351785182953, "epoch": 0.00066, "grad_norm": 0.04399210587143898, "kl": 0.7080776616930962, "learning_rate": 9.999999583480304e-06, "loss": -0.0029, "step": 66, "step_time": 3.090933404986572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.6782761216163635, "epoch": 0.00067, "frac_reward_zero_std": 0.25, "grad_norm": 0.029382430016994476, "kl": 0.6794880852103233, "learning_rate": 9.999999555249524e-06, "loss": -0.0005, "num_tokens": 1375705.0, "reward": 0.3511698246002197, "reward_std": 0.009392601437866688, "rewards/rollout_reward_func/mean": 0.3511698246002197, "rewards/rollout_reward_func/std": 0.4419592320919037, "sampling/importance_sampling_ratio/max": 0.431738018989563, "sampling/importance_sampling_ratio/mean": 0.259292334318161, "sampling/importance_sampling_ratio/min": 1.2127249204851864e-10, "sampling/sampling_logp_difference/max": 2.997020721435547, "sampling/sampling_logp_difference/mean": 0.9113994240760803, "step": 67, "step_time": 6.2037850440174225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.650863826274872, "epoch": 0.00068, "grad_norm": 0.027852684259414673, "kl": 0.6643334701657295, "learning_rate": 9.999999526093148e-06, "loss": -0.0006, "step": 68, "step_time": 3.556687438991503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.372278153896332, "epoch": 0.00069, "frac_reward_zero_std": 0.25, "grad_norm": 0.05564887076616287, "kl": 0.6969815865159035, "learning_rate": 9.999999496011169e-06, "loss": -0.007, "num_tokens": 1414916.0, "reward": 0.06316475570201874, "reward_std": 0.003922661300748587, "rewards/rollout_reward_func/mean": 0.06316475570201874, "rewards/rollout_reward_func/std": 0.7782765030860901, "sampling/importance_sampling_ratio/max": 0.4360193610191345, "sampling/importance_sampling_ratio/mean": 0.2801935374736786, "sampling/importance_sampling_ratio/min": 0.055089451372623444, "sampling/sampling_logp_difference/max": 1.5896259546279907, "sampling/sampling_logp_difference/mean": 0.7383360266685486, "step": 69, "step_time": 5.899082501993689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.375374913215637, "epoch": 0.0007, "grad_norm": 0.05658377334475517, "kl": 0.6970969885587692, "learning_rate": 9.999999465003593e-06, "loss": -0.0071, "step": 70, "step_time": 3.2069817849987885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.738184988498688, "epoch": 0.00071, "frac_reward_zero_std": 0.5, "grad_norm": 0.028864745050668716, "kl": 0.5545742604881525, "learning_rate": 9.999999433070417e-06, "loss": 0.0071, "num_tokens": 1455781.0, "reward": 0.3544788956642151, "reward_std": 0.0001902134099509567, "rewards/rollout_reward_func/mean": 0.3544788956642151, "rewards/rollout_reward_func/std": 0.4396328628063202, "sampling/importance_sampling_ratio/max": 0.44120344519615173, "sampling/importance_sampling_ratio/mean": 0.25845491886138916, "sampling/importance_sampling_ratio/min": 0.04290040209889412, "sampling/sampling_logp_difference/max": 1.8144071102142334, "sampling/sampling_logp_difference/mean": 0.8358955383300781, "step": 71, "step_time": 5.87098409099417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.7715848088264465, "epoch": 0.00072, "grad_norm": 0.02752612717449665, "kl": 0.5494990553706884, "learning_rate": 9.999999400211643e-06, "loss": 0.0071, "step": 72, "step_time": 3.132804701999703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.423518776893616, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.034972112625837326, "kl": 0.5510088801383972, "learning_rate": 9.99999936642727e-06, "loss": 0.0042, "num_tokens": 1497447.0, "reward": 0.34944948554039, "reward_std": 0.009640626609325409, "rewards/rollout_reward_func/mean": 0.34944948554039, "rewards/rollout_reward_func/std": 0.4315033257007599, "sampling/importance_sampling_ratio/max": 0.4413518011569977, "sampling/importance_sampling_ratio/mean": 0.27573162317276, "sampling/importance_sampling_ratio/min": 2.585718675618409e-06, "sampling/sampling_logp_difference/max": 2.725471019744873, "sampling/sampling_logp_difference/mean": 0.8319124579429626, "step": 73, "step_time": 5.972790507003083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.395976930856705, "epoch": 0.00074, "grad_norm": 0.03590435907244682, "kl": 0.5529137346893549, "learning_rate": 9.999999331717294e-06, "loss": 0.0041, "step": 74, "step_time": 3.4616275989974383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.783660411834717, "epoch": 0.00075, "frac_reward_zero_std": 0.0, "grad_norm": 0.05065322294831276, "kl": 0.7521476745605469, "learning_rate": 9.999999296081722e-06, "loss": -0.0097, "num_tokens": 1536942.0, "reward": 0.26236292719841003, "reward_std": 0.15180015563964844, "rewards/rollout_reward_func/mean": 0.26236292719841003, "rewards/rollout_reward_func/std": 0.5627131462097168, "sampling/importance_sampling_ratio/max": 0.442481130361557, "sampling/importance_sampling_ratio/mean": 0.2475152313709259, "sampling/importance_sampling_ratio/min": 1.7338362733390622e-12, "sampling/sampling_logp_difference/max": 4.475118637084961, "sampling/sampling_logp_difference/mean": 1.0161981582641602, "step": 75, "step_time": 5.644787922006799 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.019097222248092294, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03472222201526165, "entropy": 5.700053095817566, "epoch": 0.00076, "grad_norm": 0.045220356434583664, "kl": 0.769106075167656, "learning_rate": 9.999999259520549e-06, "loss": -0.0098, "step": 76, "step_time": 2.994386686012149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 3.3125, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.760234534740448, "epoch": 0.00077, "frac_reward_zero_std": 0.5, "grad_norm": 0.04997079446911812, "kl": 0.7448510676622391, "learning_rate": 9.99999922203378e-06, "loss": -0.0065, "num_tokens": 1574456.0, "reward": 0.835075855255127, "reward_std": 0.016522476449608803, "rewards/rollout_reward_func/mean": 0.835075855255127, "rewards/rollout_reward_func/std": 0.4353982210159302, "sampling/importance_sampling_ratio/max": 0.4458758533000946, "sampling/importance_sampling_ratio/mean": 0.3343481719493866, "sampling/importance_sampling_ratio/min": 2.49275189519952e-15, "sampling/sampling_logp_difference/max": 3.802536964416504, "sampling/sampling_logp_difference/mean": 1.075225591659546, "step": 77, "step_time": 5.641382827008783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 4.620594024658203, "epoch": 0.00078, "grad_norm": 0.03672964125871658, "kl": 0.757639929652214, "learning_rate": 9.99999918362141e-06, "loss": -0.0068, "step": 78, "step_time": 3.0560661029885523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.161290168762207, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.3395925760269165, "epoch": 0.00079, "frac_reward_zero_std": 0.0, "grad_norm": 0.03131585195660591, "kl": 0.5411134883761406, "learning_rate": 9.99999914428344e-06, "loss": -0.0042, "num_tokens": 1616935.0, "reward": 0.0980401560664177, "reward_std": 0.014159942045807838, "rewards/rollout_reward_func/mean": 0.0980401560664177, "rewards/rollout_reward_func/std": 0.02592449076473713, "sampling/importance_sampling_ratio/max": 0.4311515986919403, "sampling/importance_sampling_ratio/mean": 0.20880773663520813, "sampling/importance_sampling_ratio/min": 1.0127561012041042e-09, "sampling/sampling_logp_difference/max": 3.8504981994628906, "sampling/sampling_logp_difference/mean": 1.0421233177185059, "step": 79, "step_time": 6.161564932008332 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 6.207388997077942, "epoch": 0.0008, "grad_norm": 0.03315846249461174, "kl": 0.5694654919207096, "learning_rate": 9.999999104019872e-06, "loss": -0.0043, "step": 80, "step_time": 3.4920099070004653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7728171050548553, "epoch": 0.00081, "frac_reward_zero_std": 0.5, "grad_norm": 0.05986538529396057, "kl": 0.7720751687884331, "learning_rate": 9.999999062830703e-06, "loss": -0.0012, "num_tokens": 1655010.0, "reward": 0.8402578234672546, "reward_std": 0.005887978244572878, "rewards/rollout_reward_func/mean": 0.8402578234672546, "rewards/rollout_reward_func/std": 0.43189260363578796, "sampling/importance_sampling_ratio/max": 0.4549393057823181, "sampling/importance_sampling_ratio/mean": 0.3981129229068756, "sampling/importance_sampling_ratio/min": 1.5785398943535256e-07, "sampling/sampling_logp_difference/max": 2.695570230484009, "sampling/sampling_logp_difference/mean": 0.5471782684326172, "step": 81, "step_time": 5.626842674988438 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.688737243413925, "epoch": 0.00082, "grad_norm": 0.07228672504425049, "kl": 0.7831979915499687, "learning_rate": 9.999999020715937e-06, "loss": -0.0014, "step": 82, "step_time": 3.069644031995267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.112238109111786, "epoch": 0.00083, "frac_reward_zero_std": 0.0, "grad_norm": 0.2715820074081421, "kl": 0.8755503222346306, "learning_rate": 9.999998977675572e-06, "loss": -0.0041, "num_tokens": 1696882.0, "reward": 0.05948735028505325, "reward_std": 0.11417518556118011, "rewards/rollout_reward_func/mean": 0.05948735028505325, "rewards/rollout_reward_func/std": 0.23073840141296387, "sampling/importance_sampling_ratio/max": 0.47475460171699524, "sampling/importance_sampling_ratio/mean": 0.3058124780654907, "sampling/importance_sampling_ratio/min": 0.060201507061719894, "sampling/sampling_logp_difference/max": 1.7248427867889404, "sampling/sampling_logp_difference/mean": 0.6877350807189941, "step": 83, "step_time": 5.745609735997277 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 4.8907055258750916, "epoch": 0.00084, "grad_norm": 0.15514779090881348, "kl": 0.9618244245648384, "learning_rate": 9.999998933709607e-06, "loss": -0.0048, "step": 84, "step_time": 3.0664146290000645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.556428998708725, "epoch": 0.00085, "frac_reward_zero_std": 0.25, "grad_norm": 0.09369160234928131, "kl": 0.8377725705504417, "learning_rate": 9.999998888818043e-06, "loss": -0.0068, "num_tokens": 1738178.0, "reward": 0.09367585182189941, "reward_std": 0.0117766372859478, "rewards/rollout_reward_func/mean": 0.09367585182189941, "rewards/rollout_reward_func/std": 0.020539239048957825, "sampling/importance_sampling_ratio/max": 0.4531070590019226, "sampling/importance_sampling_ratio/mean": 0.3308722972869873, "sampling/importance_sampling_ratio/min": 4.442696481966185e-13, "sampling/sampling_logp_difference/max": 5.297337532043457, "sampling/sampling_logp_difference/mean": 0.8209755420684814, "step": 85, "step_time": 6.597681734005164 }, { "clip_ratio/high_max": 0.1631944444961846, "clip_ratio/high_mean": 0.0972222222480923, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1128472222480923, "entropy": 4.439495712518692, "epoch": 0.00086, "grad_norm": 0.05795273184776306, "kl": 0.8096081390976906, "learning_rate": 9.99999884300088e-06, "loss": -0.0072, "step": 86, "step_time": 3.0732986300063203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.096774101257324, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9817507565021515, "epoch": 0.00087, "frac_reward_zero_std": 0.0, "grad_norm": 0.06314535439014435, "kl": 1.0483805388212204, "learning_rate": 9.999998796258118e-06, "loss": -0.0134, "num_tokens": 1779698.0, "reward": 0.3124326467514038, "reward_std": 0.10719546675682068, "rewards/rollout_reward_func/mean": 0.3124326467514038, "rewards/rollout_reward_func/std": 0.49548837542533875, "sampling/importance_sampling_ratio/max": 0.4624929130077362, "sampling/importance_sampling_ratio/mean": 0.34971198439598083, "sampling/importance_sampling_ratio/min": 2.388819666521158e-05, "sampling/sampling_logp_difference/max": 4.358837127685547, "sampling/sampling_logp_difference/mean": 0.6962816715240479, "step": 87, "step_time": 5.733359099001973 }, { "clip_ratio/high_max": 0.0982142873108387, "clip_ratio/high_mean": 0.04910714365541935, "clip_ratio/low_mean": 0.055803571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.10491071455180645, "entropy": 3.8515259623527527, "epoch": 0.00088, "grad_norm": 0.1547096073627472, "kl": 1.3599929213523865, "learning_rate": 9.999998748589757e-06, "loss": -0.0134, "step": 88, "step_time": 3.0689072649984155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.74305522441864, "epoch": 0.00089, "frac_reward_zero_std": 0.25, "grad_norm": 0.1690363585948944, "kl": 0.9720192663371563, "learning_rate": 9.999998699995797e-06, "loss": 0.0021, "num_tokens": 1819190.0, "reward": 0.35492223501205444, "reward_std": 0.000685733393765986, "rewards/rollout_reward_func/mean": 0.35492223501205444, "rewards/rollout_reward_func/std": 0.43937695026397705, "sampling/importance_sampling_ratio/max": 0.6247501969337463, "sampling/importance_sampling_ratio/mean": 0.4159059524536133, "sampling/importance_sampling_ratio/min": 0.07858673483133316, "sampling/sampling_logp_difference/max": 1.643073320388794, "sampling/sampling_logp_difference/mean": 0.4801587462425232, "step": 89, "step_time": 5.581674157991074 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 3.705870360136032, "epoch": 0.0009, "grad_norm": 0.1340617686510086, "kl": 0.9997036382555962, "learning_rate": 9.999998650476238e-06, "loss": 0.0015, "step": 90, "step_time": 3.489366996996978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.277135908603668, "epoch": 0.00091, "frac_reward_zero_std": 0.25, "grad_norm": 0.093402199447155, "kl": 1.0625631213188171, "learning_rate": 9.99999860003108e-06, "loss": -0.001, "num_tokens": 1860082.0, "reward": 0.09559863060712814, "reward_std": 0.009422440081834793, "rewards/rollout_reward_func/mean": 0.09559863060712814, "rewards/rollout_reward_func/std": 0.020481860265135765, "sampling/importance_sampling_ratio/max": 0.47370073199272156, "sampling/importance_sampling_ratio/mean": 0.3820411264896393, "sampling/importance_sampling_ratio/min": 3.627580236198469e-09, "sampling/sampling_logp_difference/max": 3.1058528423309326, "sampling/sampling_logp_difference/mean": 0.6034049987792969, "step": 91, "step_time": 6.144097350996162 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 4.212139576673508, "epoch": 0.00092, "grad_norm": 0.10329366475343704, "kl": 1.065591812133789, "learning_rate": 9.999998548660322e-06, "loss": -0.0011, "step": 92, "step_time": 3.0393546789928223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.466689556837082, "epoch": 0.00093, "frac_reward_zero_std": 0.5, "grad_norm": 0.06940946727991104, "kl": 0.912627562880516, "learning_rate": 9.999998496363967e-06, "loss": 0.0005, "num_tokens": 1901099.0, "reward": 0.3543928265571594, "reward_std": 0.0001632563944440335, "rewards/rollout_reward_func/mean": 0.3543928265571594, "rewards/rollout_reward_func/std": 0.4393298029899597, "sampling/importance_sampling_ratio/max": 0.47620147466659546, "sampling/importance_sampling_ratio/mean": 0.4435492157936096, "sampling/importance_sampling_ratio/min": 0.20771564543247223, "sampling/sampling_logp_difference/max": 1.0073816776275635, "sampling/sampling_logp_difference/mean": 0.4159742593765259, "step": 93, "step_time": 5.56826259499212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4768240451812744, "epoch": 0.00094, "grad_norm": 0.06512295454740524, "kl": 0.9092684537172318, "learning_rate": 9.999998443142012e-06, "loss": 0.0006, "step": 94, "step_time": 3.051681128992641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.198994040489197, "epoch": 0.00095, "frac_reward_zero_std": 0.0, "grad_norm": 0.24604950845241547, "kl": 1.3009350299835205, "learning_rate": 9.999998388994457e-06, "loss": -0.0084, "num_tokens": 1943391.0, "reward": 0.059337396174669266, "reward_std": 0.13493110239505768, "rewards/rollout_reward_func/mean": 0.059337396174669266, "rewards/rollout_reward_func/std": 0.24879279732704163, "sampling/importance_sampling_ratio/max": 0.5403016209602356, "sampling/importance_sampling_ratio/mean": 0.38597220182418823, "sampling/importance_sampling_ratio/min": 5.2319517607202215e-09, "sampling/sampling_logp_difference/max": 3.389029026031494, "sampling/sampling_logp_difference/mean": 0.6595216989517212, "step": 95, "step_time": 5.869619782009977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.148535788059235, "epoch": 0.00096, "grad_norm": 0.1220366507768631, "kl": 1.2078630328178406, "learning_rate": 9.999998333921305e-06, "loss": -0.009, "step": 96, "step_time": 3.5478959410029347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7522710263729095, "epoch": 0.00097, "frac_reward_zero_std": 0.25, "grad_norm": 0.16321660578250885, "kl": 0.7714603431522846, "learning_rate": 9.999998277922554e-06, "loss": -0.0014, "num_tokens": 1983793.0, "reward": 0.3525138199329376, "reward_std": 0.010195292532444, "rewards/rollout_reward_func/mean": 0.3525138199329376, "rewards/rollout_reward_func/std": 0.4408356249332428, "sampling/importance_sampling_ratio/max": 0.4873700737953186, "sampling/importance_sampling_ratio/mean": 0.4259200394153595, "sampling/importance_sampling_ratio/min": 1.9737948254994198e-12, "sampling/sampling_logp_difference/max": 4.240769386291504, "sampling/sampling_logp_difference/mean": 0.6410037279129028, "step": 97, "step_time": 6.1277645800000755 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.791998475790024, "epoch": 0.00098, "grad_norm": 0.04603012278676033, "kl": 0.7792509905993938, "learning_rate": 9.999998220998203e-06, "loss": -0.0013, "step": 98, "step_time": 3.0685929150131415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.766343265771866, "epoch": 0.00099, "frac_reward_zero_std": 0.25, "grad_norm": 0.0858863815665245, "kl": 0.8525614440441132, "learning_rate": 9.999998163148253e-06, "loss": -0.001, "num_tokens": 2026475.0, "reward": 0.1049031913280487, "reward_std": 0.00011427758727222681, "rewards/rollout_reward_func/mean": 0.1049031913280487, "rewards/rollout_reward_func/std": 0.00021403587015811354, "sampling/importance_sampling_ratio/max": 0.4860244393348694, "sampling/importance_sampling_ratio/mean": 0.4145754277706146, "sampling/importance_sampling_ratio/min": 0.040692396461963654, "sampling/sampling_logp_difference/max": 2.5232598781585693, "sampling/sampling_logp_difference/mean": 0.48860836029052734, "step": 99, "step_time": 5.674107129001641 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 3.867912143468857, "epoch": 0.001, "grad_norm": 0.06980477273464203, "kl": 0.8391188830137253, "learning_rate": 9.999998104372703e-06, "loss": -0.0013, "step": 100, "step_time": 3.0750840730033815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.01041179895401, "epoch": 0.00101, "frac_reward_zero_std": 0.25, "grad_norm": 0.0858769565820694, "kl": 0.8901920691132545, "learning_rate": 9.999998044671557e-06, "loss": -0.0002, "num_tokens": 2067292.0, "reward": 0.34309378266334534, "reward_std": 0.0015316897770389915, "rewards/rollout_reward_func/mean": 0.34309378266334534, "rewards/rollout_reward_func/std": 0.4204331934452057, "sampling/importance_sampling_ratio/max": 0.4901995062828064, "sampling/importance_sampling_ratio/mean": 0.4026803970336914, "sampling/importance_sampling_ratio/min": 0.09460750967264175, "sampling/sampling_logp_difference/max": 1.7307145595550537, "sampling/sampling_logp_difference/mean": 0.49271371960639954, "step": 101, "step_time": 5.582912827994733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.092369079589844, "epoch": 0.00102, "grad_norm": 0.08861951529979706, "kl": 0.8729632422327995, "learning_rate": 9.999997984044808e-06, "loss": -0.0004, "step": 102, "step_time": 3.906076982995728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 2.40625, "completions/mean_terminated_length": 2.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.281353950500488, "epoch": 0.00103, "frac_reward_zero_std": 0.0, "grad_norm": 0.13856643438339233, "kl": 0.9568207189440727, "learning_rate": 9.999997922492466e-06, "loss": -0.0107, "num_tokens": 2109025.0, "reward": 0.028657134622335434, "reward_std": 0.21550066769123077, "rewards/rollout_reward_func/mean": 0.028657134622335434, "rewards/rollout_reward_func/std": 0.28859615325927734, "sampling/importance_sampling_ratio/max": 0.5140408277511597, "sampling/importance_sampling_ratio/mean": 0.3746081590652466, "sampling/importance_sampling_ratio/min": 3.1472406902821604e-08, "sampling/sampling_logp_difference/max": 4.0845794677734375, "sampling/sampling_logp_difference/mean": 0.6514796018600464, "step": 103, "step_time": 5.766497299984621 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.233341336250305, "epoch": 0.00104, "grad_norm": 0.08066459745168686, "kl": 0.9660584107041359, "learning_rate": 9.999997860014521e-06, "loss": -0.0111, "step": 104, "step_time": 3.0788512520157383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.045157462358475, "epoch": 0.00105, "frac_reward_zero_std": 0.25, "grad_norm": 0.14875918626785278, "kl": 0.9786011204123497, "learning_rate": 9.99999779661098e-06, "loss": 0.0029, "num_tokens": 2149512.0, "reward": 0.3481953740119934, "reward_std": 0.0004853513091802597, "rewards/rollout_reward_func/mean": 0.3481953740119934, "rewards/rollout_reward_func/std": 0.44308581948280334, "sampling/importance_sampling_ratio/max": 0.4969439208507538, "sampling/importance_sampling_ratio/mean": 0.38708949089050293, "sampling/importance_sampling_ratio/min": 0.06396304816007614, "sampling/sampling_logp_difference/max": 1.7104167938232422, "sampling/sampling_logp_difference/mean": 0.5313997268676758, "step": 105, "step_time": 5.649835083000653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.9545077681541443, "epoch": 0.00106, "grad_norm": 0.08066889643669128, "kl": 0.9673162773251534, "learning_rate": 9.999997732281837e-06, "loss": 0.0025, "step": 106, "step_time": 3.0727372399996966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6210631132125854, "epoch": 0.00107, "frac_reward_zero_std": 0.5, "grad_norm": 0.14429202675819397, "kl": 1.0030174776911736, "learning_rate": 9.999997667027097e-06, "loss": -0.0049, "num_tokens": 2190007.0, "reward": 0.3044784665107727, "reward_std": 0.12422968447208405, "rewards/rollout_reward_func/mean": 0.3044784665107727, "rewards/rollout_reward_func/std": 0.5294240117073059, "sampling/importance_sampling_ratio/max": 0.49577948451042175, "sampling/importance_sampling_ratio/mean": 0.40947794914245605, "sampling/importance_sampling_ratio/min": 0.07679010927677155, "sampling/sampling_logp_difference/max": 1.5833899974822998, "sampling/sampling_logp_difference/mean": 0.49631696939468384, "step": 107, "step_time": 5.856092450005235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 3.461278796195984, "epoch": 0.00108, "grad_norm": 0.0306269358843565, "kl": 1.0262186154723167, "learning_rate": 9.999997600846756e-06, "loss": -0.0052, "step": 108, "step_time": 3.9295252350129886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.870085060596466, "epoch": 0.00109, "frac_reward_zero_std": 0.25, "grad_norm": 0.1076216846704483, "kl": 0.9785957634449005, "learning_rate": 9.99999753374082e-06, "loss": -0.0032, "num_tokens": 2231975.0, "reward": 0.10506629943847656, "reward_std": 0.00026580048142932355, "rewards/rollout_reward_func/mean": 0.10506629943847656, "rewards/rollout_reward_func/std": 0.0005259743775241077, "sampling/importance_sampling_ratio/max": 0.49911683797836304, "sampling/importance_sampling_ratio/mean": 0.40485680103302, "sampling/importance_sampling_ratio/min": 0.11004036664962769, "sampling/sampling_logp_difference/max": 1.3422538042068481, "sampling/sampling_logp_difference/mean": 0.49367719888687134, "step": 109, "step_time": 5.645512027003861 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.846251904964447, "epoch": 0.0011, "grad_norm": 0.11298065632581711, "kl": 0.9780672788619995, "learning_rate": 9.999997465709281e-06, "loss": -0.0037, "step": 110, "step_time": 3.071401318993594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6239935755729675, "epoch": 0.00111, "frac_reward_zero_std": 0.75, "grad_norm": 0.08215931057929993, "kl": 1.0264135226607323, "learning_rate": 9.999997396752146e-06, "loss": 0.0024, "num_tokens": 2268128.0, "reward": 0.8523024320602417, "reward_std": 0.00029523519333451986, "rewards/rollout_reward_func/mean": 0.8523024320602417, "rewards/rollout_reward_func/std": 0.4381996989250183, "sampling/importance_sampling_ratio/max": 0.4984234571456909, "sampling/importance_sampling_ratio/mean": 0.42202669382095337, "sampling/importance_sampling_ratio/min": 0.09427148103713989, "sampling/sampling_logp_difference/max": 1.540094256401062, "sampling/sampling_logp_difference/mean": 0.45967811346054077, "step": 111, "step_time": 5.549906354986888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.669855058193207, "epoch": 0.00112, "grad_norm": 0.09115087240934372, "kl": 1.019152969121933, "learning_rate": 9.999997326869412e-06, "loss": 0.0026, "step": 112, "step_time": 3.004537762994005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.962161362171173, "epoch": 0.00113, "frac_reward_zero_std": 0.5, "grad_norm": 0.0800398662686348, "kl": 1.0876579582691193, "learning_rate": 9.99999725606108e-06, "loss": -0.0025, "num_tokens": 2309960.0, "reward": 0.09554949402809143, "reward_std": 0.007209730334579945, "rewards/rollout_reward_func/mean": 0.09554949402809143, "rewards/rollout_reward_func/std": 0.02018265798687935, "sampling/importance_sampling_ratio/max": 0.5003674626350403, "sampling/importance_sampling_ratio/mean": 0.3908698856830597, "sampling/importance_sampling_ratio/min": 8.979669438580196e-12, "sampling/sampling_logp_difference/max": 3.1236257553100586, "sampling/sampling_logp_difference/mean": 0.690907895565033, "step": 113, "step_time": 6.372976364000351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.019097222248092294, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019097222248092294, "entropy": 3.9651391804218292, "epoch": 0.00114, "grad_norm": 0.05788580700755119, "kl": 1.086029477417469, "learning_rate": 9.999997184327149e-06, "loss": -0.0026, "step": 114, "step_time": 3.566471656005888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6770279705524445, "epoch": 0.00115, "frac_reward_zero_std": 0.25, "grad_norm": 0.051569245755672455, "kl": 1.1373351141810417, "learning_rate": 9.999997111667619e-06, "loss": -0.0084, "num_tokens": 2351958.0, "reward": 0.06768360733985901, "reward_std": 0.030640859156847, "rewards/rollout_reward_func/mean": 0.06768360733985901, "rewards/rollout_reward_func/std": 0.060047101229429245, "sampling/importance_sampling_ratio/max": 0.5042310953140259, "sampling/importance_sampling_ratio/mean": 0.4103297293186188, "sampling/importance_sampling_ratio/min": 0.010575676336884499, "sampling/sampling_logp_difference/max": 3.089411973953247, "sampling/sampling_logp_difference/mean": 0.5414048433303833, "step": 115, "step_time": 5.927852010005154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.6528416574001312, "epoch": 0.00116, "grad_norm": 0.056602876633405685, "kl": 1.1722295805811882, "learning_rate": 9.999997038082489e-06, "loss": -0.0084, "step": 116, "step_time": 3.1892200049987878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 2.28125, "completions/mean_terminated_length": 2.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.213207185268402, "epoch": 0.00117, "frac_reward_zero_std": 0.0, "grad_norm": 0.11330419778823853, "kl": 0.9412969946861267, "learning_rate": 9.999996963571762e-06, "loss": 0.0008, "num_tokens": 2393653.0, "reward": 0.10076075047254562, "reward_std": 0.011638693511486053, "rewards/rollout_reward_func/mean": 0.10076075047254562, "rewards/rollout_reward_func/std": 0.01884334348142147, "sampling/importance_sampling_ratio/max": 0.5011688470840454, "sampling/importance_sampling_ratio/mean": 0.3566637635231018, "sampling/importance_sampling_ratio/min": 2.1415016249193286e-07, "sampling/sampling_logp_difference/max": 3.7446162700653076, "sampling/sampling_logp_difference/mean": 0.6863818168640137, "step": 117, "step_time": 5.729019544989569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 4.129977196455002, "epoch": 0.00118, "grad_norm": 0.0724434107542038, "kl": 0.9720676839351654, "learning_rate": 9.999996888135438e-06, "loss": 0.0006, "step": 118, "step_time": 3.0802293039960205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.387096643447876, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8174341917037964, "epoch": 0.00119, "frac_reward_zero_std": 0.25, "grad_norm": 0.050672754645347595, "kl": 0.8171752318739891, "learning_rate": 9.999996811773512e-06, "loss": -0.0073, "num_tokens": 2433823.0, "reward": 0.3471165895462036, "reward_std": 0.01890595816075802, "rewards/rollout_reward_func/mean": 0.3471165895462036, "rewards/rollout_reward_func/std": 0.4446687698364258, "sampling/importance_sampling_ratio/max": 0.5097929835319519, "sampling/importance_sampling_ratio/mean": 0.4329725205898285, "sampling/importance_sampling_ratio/min": 1.9214516859111103e-13, "sampling/sampling_logp_difference/max": 4.564438343048096, "sampling/sampling_logp_difference/mean": 0.7434948086738586, "step": 119, "step_time": 6.659768166988215 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.7280977368354797, "epoch": 0.0012, "grad_norm": 0.038431525230407715, "kl": 0.8281378448009491, "learning_rate": 9.999996734485989e-06, "loss": -0.0077, "step": 120, "step_time": 3.098136821994558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.346911609172821, "epoch": 0.00121, "frac_reward_zero_std": 0.25, "grad_norm": 0.06066334992647171, "kl": 0.7883583307266235, "learning_rate": 9.999996656272867e-06, "loss": -0.0046, "num_tokens": 2472867.0, "reward": 0.5911115407943726, "reward_std": 0.0062848343513906, "rewards/rollout_reward_func/mean": 0.5911115407943726, "rewards/rollout_reward_func/std": 0.49528977274894714, "sampling/importance_sampling_ratio/max": 0.5119594931602478, "sampling/importance_sampling_ratio/mean": 0.44440752267837524, "sampling/importance_sampling_ratio/min": 2.9538183010657804e-09, "sampling/sampling_logp_difference/max": 2.7434892654418945, "sampling/sampling_logp_difference/mean": 0.5534473061561584, "step": 121, "step_time": 5.591803961004189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 3.28483584523201, "epoch": 0.00122, "grad_norm": 0.020887916907668114, "kl": 0.7884719707071781, "learning_rate": 9.999996577134147e-06, "loss": -0.0048, "step": 122, "step_time": 3.0566713540029014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2563277781009674, "epoch": 0.00123, "frac_reward_zero_std": 0.0, "grad_norm": 0.04797399789094925, "kl": 1.200133576989174, "learning_rate": 9.999996497069828e-06, "loss": 0.001, "num_tokens": 2514363.0, "reward": 0.2756161093711853, "reward_std": 0.19160021841526031, "rewards/rollout_reward_func/mean": 0.2756161093711853, "rewards/rollout_reward_func/std": 0.46415621042251587, "sampling/importance_sampling_ratio/max": 0.8039597868919373, "sampling/importance_sampling_ratio/mean": 0.4761509299278259, "sampling/importance_sampling_ratio/min": 9.533727141075587e-10, "sampling/sampling_logp_difference/max": 3.719825267791748, "sampling/sampling_logp_difference/mean": 0.5524382591247559, "step": 123, "step_time": 5.750677712007018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2121150195598602, "epoch": 0.00124, "grad_norm": 0.03727919980883598, "kl": 1.2085038051009178, "learning_rate": 9.99999641607991e-06, "loss": 0.0012, "step": 124, "step_time": 3.0839741859963397 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1156879663467407, "epoch": 0.00125, "frac_reward_zero_std": 0.25, "grad_norm": 0.07257883250713348, "kl": 1.2345582768321037, "learning_rate": 9.999996334164396e-06, "loss": 0.0002, "num_tokens": 2554578.0, "reward": 0.35318511724472046, "reward_std": 0.009650146588683128, "rewards/rollout_reward_func/mean": 0.35318511724472046, "rewards/rollout_reward_func/std": 0.4331287741661072, "sampling/importance_sampling_ratio/max": 0.838970959186554, "sampling/importance_sampling_ratio/mean": 0.47919386625289917, "sampling/importance_sampling_ratio/min": 0.00017681617464404553, "sampling/sampling_logp_difference/max": 4.860955715179443, "sampling/sampling_logp_difference/mean": 0.4965651035308838, "step": 125, "step_time": 6.4784793210055795 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.0925288796424866, "epoch": 0.00126, "grad_norm": 0.07087775319814682, "kl": 1.23670444637537, "learning_rate": 9.999996251323281e-06, "loss": 0.0, "step": 126, "step_time": 3.0731618000063463 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.20419842004776, "epoch": 0.00127, "frac_reward_zero_std": 0.25, "grad_norm": 0.1995907425880432, "kl": 1.1825120151042938, "learning_rate": 9.99999616755657e-06, "loss": 0.0076, "num_tokens": 2592612.0, "reward": 0.5301260948181152, "reward_std": 0.19864480197429657, "rewards/rollout_reward_func/mean": 0.5301260948181152, "rewards/rollout_reward_func/std": 0.583208441734314, "sampling/importance_sampling_ratio/max": 0.8972416520118713, "sampling/importance_sampling_ratio/mean": 0.4556633234024048, "sampling/importance_sampling_ratio/min": 5.597634578080601e-10, "sampling/sampling_logp_difference/max": 4.207612991333008, "sampling/sampling_logp_difference/mean": 0.6268583536148071, "step": 127, "step_time": 5.656626146002964 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019097222248092294, "entropy": 3.1755579113960266, "epoch": 0.00128, "grad_norm": 0.24610351026058197, "kl": 1.1974003240466118, "learning_rate": 9.999996082864259e-06, "loss": 0.0067, "step": 128, "step_time": 3.071516509000503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.096774101257324, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.266002893447876, "epoch": 0.00129, "frac_reward_zero_std": 0.5, "grad_norm": 0.010048607364296913, "kl": 0.9733711779117584, "learning_rate": 9.99999599724635e-06, "loss": -0.0083, "num_tokens": 2633780.0, "reward": 0.3484850525856018, "reward_std": 0.01875840499997139, "rewards/rollout_reward_func/mean": 0.3484850525856018, "rewards/rollout_reward_func/std": 0.4438955783843994, "sampling/importance_sampling_ratio/max": 0.5814123749732971, "sampling/importance_sampling_ratio/mean": 0.4625992178916931, "sampling/importance_sampling_ratio/min": 2.2912618469206336e-09, "sampling/sampling_logp_difference/max": 4.920463562011719, "sampling/sampling_logp_difference/mean": 0.5750842690467834, "step": 129, "step_time": 6.0185301170058665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2652773559093475, "epoch": 0.0013, "grad_norm": 0.010237341746687889, "kl": 0.9733765721321106, "learning_rate": 9.999995910702842e-06, "loss": -0.0083, "step": 130, "step_time": 3.166177634986525 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2535327076911926, "epoch": 0.00131, "frac_reward_zero_std": 0.25, "grad_norm": 0.2387905865907669, "kl": 2.1812868639826775, "learning_rate": 9.999995823233738e-06, "loss": -0.0074, "num_tokens": 2675550.0, "reward": 0.0931454449892044, "reward_std": 0.01654176414012909, "rewards/rollout_reward_func/mean": 0.0931454449892044, "rewards/rollout_reward_func/std": 0.02628944255411625, "sampling/importance_sampling_ratio/max": 0.607771635055542, "sampling/importance_sampling_ratio/mean": 0.43172788619995117, "sampling/importance_sampling_ratio/min": 1.2036712230233115e-13, "sampling/sampling_logp_difference/max": 4.246298789978027, "sampling/sampling_logp_difference/mean": 0.7824053168296814, "step": 131, "step_time": 6.784090413006197 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.2912452816963196, "epoch": 0.00132, "grad_norm": 0.09111316502094269, "kl": 1.5084282532334328, "learning_rate": 9.999995734839033e-06, "loss": -0.0081, "step": 132, "step_time": 3.084837205999065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9825261533260345, "epoch": 0.00133, "frac_reward_zero_std": 0.75, "grad_norm": 0.009552489034831524, "kl": 1.2319138050079346, "learning_rate": 9.999995645518729e-06, "loss": 0.0015, "num_tokens": 2718678.0, "reward": -0.17062929272651672, "reward_std": 0.00011364868259988725, "rewards/rollout_reward_func/mean": -0.17062929272651672, "rewards/rollout_reward_func/std": 0.48548850417137146, "sampling/importance_sampling_ratio/max": 0.7188218832015991, "sampling/importance_sampling_ratio/mean": 0.5057423114776611, "sampling/importance_sampling_ratio/min": 0.431903213262558, "sampling/sampling_logp_difference/max": 0.840796947479248, "sampling/sampling_logp_difference/mean": 0.375448077917099, "step": 133, "step_time": 5.974982450992684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0126626193523407, "epoch": 0.00134, "grad_norm": 0.012782225385308266, "kl": 1.227381408214569, "learning_rate": 9.999995555272829e-06, "loss": 0.0016, "step": 134, "step_time": 3.1891902619900065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2452589571475983, "epoch": 0.00135, "frac_reward_zero_std": 0.5, "grad_norm": 0.024763504043221474, "kl": 1.1489246413111687, "learning_rate": 9.99999546410133e-06, "loss": -0.0034, "num_tokens": 2760428.0, "reward": 0.09615066647529602, "reward_std": 0.01001292560249567, "rewards/rollout_reward_func/mean": 0.09615066647529602, "rewards/rollout_reward_func/std": 0.021001165732741356, "sampling/importance_sampling_ratio/max": 0.5585006475448608, "sampling/importance_sampling_ratio/mean": 0.4828481078147888, "sampling/importance_sampling_ratio/min": 3.1043907444683327e-09, "sampling/sampling_logp_difference/max": 3.092977285385132, "sampling/sampling_logp_difference/mean": 0.5110899209976196, "step": 135, "step_time": 5.786729104991537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2777431905269623, "epoch": 0.00136, "grad_norm": 0.026866303756833076, "kl": 1.1450117975473404, "learning_rate": 9.999995372004231e-06, "loss": -0.0035, "step": 136, "step_time": 3.515143459000683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.160526394844055, "epoch": 0.00137, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018684895476326346, "kl": 0.9170939326286316, "learning_rate": 9.999995278981537e-06, "loss": 0.0011, "num_tokens": 2800716.0, "reward": 0.6029314994812012, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6029314994812012, "rewards/rollout_reward_func/std": 0.5058632493019104, "sampling/importance_sampling_ratio/max": 0.544912576675415, "sampling/importance_sampling_ratio/mean": 0.47711968421936035, "sampling/importance_sampling_ratio/min": 0.19973118603229523, "sampling/sampling_logp_difference/max": 0.8415690660476685, "sampling/sampling_logp_difference/mean": 0.382478803396225, "step": 137, "step_time": 6.1707015889987815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1832054555416107, "epoch": 0.00138, "grad_norm": 0.0018490822985768318, "kl": 0.914090134203434, "learning_rate": 9.999995185033245e-06, "loss": 0.0011, "step": 138, "step_time": 3.017572591001226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 3.09375, "completions/mean_terminated_length": 2.2333333492279053, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8272139728069305, "epoch": 0.00139, "frac_reward_zero_std": 0.25, "grad_norm": 0.024720754474401474, "kl": 0.8635407313704491, "learning_rate": 9.999995090159351e-06, "loss": -0.0116, "num_tokens": 2839099.0, "reward": 0.33942559361457825, "reward_std": 0.02563362754881382, "rewards/rollout_reward_func/mean": 0.33942559361457825, "rewards/rollout_reward_func/std": 0.44945305585861206, "sampling/importance_sampling_ratio/max": 0.5159437656402588, "sampling/importance_sampling_ratio/mean": 0.41782891750335693, "sampling/importance_sampling_ratio/min": 9.430958791689648e-11, "sampling/sampling_logp_difference/max": 4.098474502563477, "sampling/sampling_logp_difference/mean": 0.8239774703979492, "step": 139, "step_time": 5.853180288984731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8531467616558075, "epoch": 0.0014, "grad_norm": 0.024831190705299377, "kl": 0.8580404669046402, "learning_rate": 9.999994994359862e-06, "loss": -0.0116, "step": 140, "step_time": 3.0823587460035924 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.1000001430511475, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.658608168363571, "epoch": 0.00141, "frac_reward_zero_std": 0.25, "grad_norm": 0.03892930597066879, "kl": 0.9957849457859993, "learning_rate": 9.999994897634775e-06, "loss": -0.0094, "num_tokens": 2879824.0, "reward": 0.19961999356746674, "reward_std": 0.1579120010137558, "rewards/rollout_reward_func/mean": 0.19961999356746674, "rewards/rollout_reward_func/std": 0.33259817957878113, "sampling/importance_sampling_ratio/max": 0.5171502232551575, "sampling/importance_sampling_ratio/mean": 0.4329766035079956, "sampling/importance_sampling_ratio/min": 2.0039572432017927e-11, "sampling/sampling_logp_difference/max": 4.331401824951172, "sampling/sampling_logp_difference/mean": 0.8344852328300476, "step": 141, "step_time": 5.824428660991543 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019097222248092294, "entropy": 3.661935418844223, "epoch": 0.00142, "grad_norm": 0.05640319362282753, "kl": 1.017067477107048, "learning_rate": 9.999994799984088e-06, "loss": -0.0095, "step": 142, "step_time": 3.5263630360059324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.096774101257324, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.5336678624153137, "epoch": 0.00143, "frac_reward_zero_std": 0.25, "grad_norm": 0.058964844793081284, "kl": 1.069240778684616, "learning_rate": 9.999994701407805e-06, "loss": -0.007, "num_tokens": 2921595.0, "reward": -0.17456060647964478, "reward_std": 0.01057820301502943, "rewards/rollout_reward_func/mean": -0.17456060647964478, "rewards/rollout_reward_func/std": 0.4808507561683655, "sampling/importance_sampling_ratio/max": 0.5257615447044373, "sampling/importance_sampling_ratio/mean": 0.46021825075149536, "sampling/importance_sampling_ratio/min": 2.0095499225704534e-14, "sampling/sampling_logp_difference/max": 4.777149200439453, "sampling/sampling_logp_difference/mean": 0.6903709769248962, "step": 143, "step_time": 5.8049651050023385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.541070282459259, "epoch": 0.00144, "grad_norm": 0.05616610869765282, "kl": 1.0641427338123322, "learning_rate": 9.999994601905921e-06, "loss": -0.0071, "step": 144, "step_time": 3.086268138999003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 2.25, "completions/mean_terminated_length": 2.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6055936217308044, "epoch": 0.00145, "frac_reward_zero_std": 0.5, "grad_norm": 0.04997464641928673, "kl": 0.955722838640213, "learning_rate": 9.999994501478441e-06, "loss": -0.0065, "num_tokens": 2960571.0, "reward": 0.2821550965309143, "reward_std": 0.1337042599916458, "rewards/rollout_reward_func/mean": 0.2821550965309143, "rewards/rollout_reward_func/std": 0.5400002002716064, "sampling/importance_sampling_ratio/max": 0.5186118483543396, "sampling/importance_sampling_ratio/mean": 0.432806134223938, "sampling/importance_sampling_ratio/min": 4.392423136323487e-08, "sampling/sampling_logp_difference/max": 4.353190898895264, "sampling/sampling_logp_difference/mean": 0.6038384437561035, "step": 145, "step_time": 5.712949166001636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.603723645210266, "epoch": 0.00146, "grad_norm": 0.044969454407691956, "kl": 0.9906712025403976, "learning_rate": 9.999994400125363e-06, "loss": -0.0066, "step": 146, "step_time": 3.0143205960048363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.5516068935394287, "epoch": 0.00147, "frac_reward_zero_std": 0.5, "grad_norm": 0.04507181793451309, "kl": 1.1149482503533363, "learning_rate": 9.999994297846687e-06, "loss": -0.0019, "num_tokens": 3000443.0, "reward": 0.34860536456108093, "reward_std": 0.008653172291815281, "rewards/rollout_reward_func/mean": 0.34860536456108093, "rewards/rollout_reward_func/std": 0.4319983720779419, "sampling/importance_sampling_ratio/max": 0.5287869572639465, "sampling/importance_sampling_ratio/mean": 0.45761263370513916, "sampling/importance_sampling_ratio/min": 1.6159059645914908e-09, "sampling/sampling_logp_difference/max": 2.2048468589782715, "sampling/sampling_logp_difference/mean": 0.549673318862915, "step": 147, "step_time": 5.735741602991766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.534568667411804, "epoch": 0.00148, "grad_norm": 0.048216793686151505, "kl": 1.1174000799655914, "learning_rate": 9.999994194642413e-06, "loss": -0.002, "step": 148, "step_time": 3.936288215998502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.191679358482361, "epoch": 0.00149, "frac_reward_zero_std": 0.5, "grad_norm": 0.03925507515668869, "kl": 1.2042394131422043, "learning_rate": 9.99999409051254e-06, "loss": 0.0005, "num_tokens": 3040216.0, "reward": 0.34972235560417175, "reward_std": 0.0002834983170032501, "rewards/rollout_reward_func/mean": 0.34972235560417175, "rewards/rollout_reward_func/std": 0.4421542286872864, "sampling/importance_sampling_ratio/max": 0.5309657454490662, "sampling/importance_sampling_ratio/mean": 0.4742198586463928, "sampling/importance_sampling_ratio/min": 0.1925262063741684, "sampling/sampling_logp_difference/max": 0.8444068431854248, "sampling/sampling_logp_difference/mean": 0.3892626464366913, "step": 149, "step_time": 5.726431257011427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.172209858894348, "epoch": 0.0015, "grad_norm": 0.0459773987531662, "kl": 1.1947643607854843, "learning_rate": 9.999993985457072e-06, "loss": 0.0004, "step": 150, "step_time": 3.07338682601403 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.056642919778824, "epoch": 0.00151, "frac_reward_zero_std": 0.5, "grad_norm": 0.033513545989990234, "kl": 0.813825324177742, "learning_rate": 9.999993879476003e-06, "loss": 0.0049, "num_tokens": 3080660.0, "reward": 0.04657064378261566, "reward_std": 0.00028795251273550093, "rewards/rollout_reward_func/mean": 0.04657064378261566, "rewards/rollout_reward_func/std": 0.7728968858718872, "sampling/importance_sampling_ratio/max": 0.5308986902236938, "sampling/importance_sampling_ratio/mean": 0.48274150490760803, "sampling/importance_sampling_ratio/min": 0.07654794305562973, "sampling/sampling_logp_difference/max": 1.4534010887145996, "sampling/sampling_logp_difference/mean": 0.3958326578140259, "step": 151, "step_time": 5.50366706699424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0422452688217163, "epoch": 0.00152, "grad_norm": 0.046411897987127304, "kl": 0.806418314576149, "learning_rate": 9.999993772569339e-06, "loss": 0.0049, "step": 152, "step_time": 3.0542532709951047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2411508560180664, "epoch": 0.00153, "frac_reward_zero_std": 0.5, "grad_norm": 0.06323983520269394, "kl": 1.2580524235963821, "learning_rate": 9.999993664737076e-06, "loss": 0.0031, "num_tokens": 3120747.0, "reward": 0.3541725277900696, "reward_std": 0.0004397781740408391, "rewards/rollout_reward_func/mean": 0.3541725277900696, "rewards/rollout_reward_func/std": 0.439815878868103, "sampling/importance_sampling_ratio/max": 0.5432706475257874, "sampling/importance_sampling_ratio/mean": 0.48585808277130127, "sampling/importance_sampling_ratio/min": 0.19858351349830627, "sampling/sampling_logp_difference/max": 0.9231637716293335, "sampling/sampling_logp_difference/mean": 0.38338640332221985, "step": 153, "step_time": 5.750313245000143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2258640825748444, "epoch": 0.00154, "grad_norm": 0.06322702765464783, "kl": 1.2604417949914932, "learning_rate": 9.999993555979215e-06, "loss": 0.003, "step": 154, "step_time": 3.9260411499999464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.1000001430511475, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6723279654979706, "epoch": 0.00155, "frac_reward_zero_std": 0.5, "grad_norm": 0.010950242169201374, "kl": 0.950068112462759, "learning_rate": 9.999993446295754e-06, "loss": -0.0097, "num_tokens": 3158363.0, "reward": 0.5884703397750854, "reward_std": 0.02114904299378395, "rewards/rollout_reward_func/mean": 0.5884703397750854, "rewards/rollout_reward_func/std": 0.5046162605285645, "sampling/importance_sampling_ratio/max": 0.5471625924110413, "sampling/importance_sampling_ratio/mean": 0.43581655621528625, "sampling/importance_sampling_ratio/min": 2.1638184408319383e-13, "sampling/sampling_logp_difference/max": 4.653512477874756, "sampling/sampling_logp_difference/mean": 0.7587471604347229, "step": 155, "step_time": 5.714177548979933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.640718460083008, "epoch": 0.00156, "grad_norm": 0.010739185847342014, "kl": 0.952693372964859, "learning_rate": 9.999993335686697e-06, "loss": -0.0097, "step": 156, "step_time": 3.0548302790048183 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.992147773504257, "epoch": 0.00157, "frac_reward_zero_std": 0.25, "grad_norm": 1.4627033472061157, "kl": 0.9475488141179085, "learning_rate": 9.999993224152043e-06, "loss": -0.0003, "num_tokens": 3199205.0, "reward": -0.2278449833393097, "reward_std": 0.13235662877559662, "rewards/rollout_reward_func/mean": -0.2278449833393097, "rewards/rollout_reward_func/std": 0.49393460154533386, "sampling/importance_sampling_ratio/max": 0.5985190272331238, "sampling/importance_sampling_ratio/mean": 0.49765583872795105, "sampling/importance_sampling_ratio/min": 0.15994401276111603, "sampling/sampling_logp_difference/max": 1.1343603134155273, "sampling/sampling_logp_difference/mean": 0.37977564334869385, "step": 157, "step_time": 5.72615775099257 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 2.923297166824341, "epoch": 0.00158, "grad_norm": 0.13072077929973602, "kl": 1.4181053265929222, "learning_rate": 9.999993111691792e-06, "loss": -0.0005, "step": 158, "step_time": 3.0546323050148203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8551849722862244, "epoch": 0.00159, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009124053176492453, "kl": 0.9472675919532776, "learning_rate": 9.999992998305941e-06, "loss": 0.0012, "num_tokens": 3236340.0, "reward": 0.8468048572540283, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8468048572540283, "rewards/rollout_reward_func/std": 0.44751667976379395, "sampling/importance_sampling_ratio/max": 0.5303928852081299, "sampling/importance_sampling_ratio/mean": 0.5092862844467163, "sampling/importance_sampling_ratio/min": 0.449641615152359, "sampling/sampling_logp_difference/max": 0.7990290522575378, "sampling/sampling_logp_difference/mean": 0.33799707889556885, "step": 159, "step_time": 6.337209439014259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8733853101730347, "epoch": 0.0016, "grad_norm": 0.0068571912124753, "kl": 0.937136709690094, "learning_rate": 9.999992883994494e-06, "loss": 0.0012, "step": 160, "step_time": 2.9907529699994484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0595704317092896, "epoch": 0.00161, "frac_reward_zero_std": 0.5, "grad_norm": 0.017956765368580818, "kl": 1.0842097252607346, "learning_rate": 9.999992768757449e-06, "loss": -0.0031, "num_tokens": 3276500.0, "reward": 0.3514135181903839, "reward_std": 0.009302475489675999, "rewards/rollout_reward_func/mean": 0.3514135181903839, "rewards/rollout_reward_func/std": 0.44181883335113525, "sampling/importance_sampling_ratio/max": 0.6619308590888977, "sampling/importance_sampling_ratio/mean": 0.48427507281303406, "sampling/importance_sampling_ratio/min": 0.00015231186989694834, "sampling/sampling_logp_difference/max": 5.370872974395752, "sampling/sampling_logp_difference/mean": 0.47925904393196106, "step": 161, "step_time": 5.887808569990739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0561551451683044, "epoch": 0.00162, "grad_norm": 0.01809580996632576, "kl": 1.0828409641981125, "learning_rate": 9.999992652594807e-06, "loss": -0.0031, "step": 162, "step_time": 3.149244059008197 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 3.125, "completions/mean_terminated_length": 2.266666889190674, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.472727954387665, "epoch": 0.00163, "frac_reward_zero_std": 0.25, "grad_norm": 0.04362531751394272, "kl": 0.9365215823054314, "learning_rate": 9.999992535506568e-06, "loss": -0.0073, "num_tokens": 3315514.0, "reward": 0.4534215033054352, "reward_std": 0.27720433473587036, "rewards/rollout_reward_func/mean": 0.4534215033054352, "rewards/rollout_reward_func/std": 0.6487917900085449, "sampling/importance_sampling_ratio/max": 0.570979118347168, "sampling/importance_sampling_ratio/mean": 0.4567375183105469, "sampling/importance_sampling_ratio/min": 5.0237845133915826e-11, "sampling/sampling_logp_difference/max": 3.749945878982544, "sampling/sampling_logp_difference/mean": 0.759648859500885, "step": 163, "step_time": 5.695311598996341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4810463786125183, "epoch": 0.00164, "grad_norm": 0.053128451108932495, "kl": 0.9279332086443901, "learning_rate": 9.99999241749273e-06, "loss": -0.0072, "step": 164, "step_time": 3.0776850289912545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.862192064523697, "epoch": 0.00165, "frac_reward_zero_std": 1.0, "grad_norm": 0.015856457874178886, "kl": 1.1280961260199547, "learning_rate": 9.999992298553295e-06, "loss": 0.0014, "num_tokens": 3358368.0, "reward": 0.10497531294822693, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.10497531294822693, "rewards/rollout_reward_func/std": 0.0001362916809739545, "sampling/importance_sampling_ratio/max": 0.6382718682289124, "sampling/importance_sampling_ratio/mean": 0.5057328939437866, "sampling/importance_sampling_ratio/min": 0.15540577471256256, "sampling/sampling_logp_difference/max": 1.1654324531555176, "sampling/sampling_logp_difference/mean": 0.38584330677986145, "step": 165, "step_time": 6.59020085100201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8655210435390472, "epoch": 0.00166, "grad_norm": 0.012912724167108536, "kl": 1.1088050976395607, "learning_rate": 9.999992178688262e-06, "loss": 0.0014, "step": 166, "step_time": 3.0297017539851367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0758056938648224, "epoch": 0.00167, "frac_reward_zero_std": 0.25, "grad_norm": 0.18162639439105988, "kl": 1.3370719999074936, "learning_rate": 9.999992057897633e-06, "loss": 0.0003, "num_tokens": 3399858.0, "reward": 0.10509223490953445, "reward_std": 0.0006218636408448219, "rewards/rollout_reward_func/mean": 0.10509223490953445, "rewards/rollout_reward_func/std": 0.0012723985128104687, "sampling/importance_sampling_ratio/max": 0.7846220135688782, "sampling/importance_sampling_ratio/mean": 0.494211882352829, "sampling/importance_sampling_ratio/min": 0.1489037424325943, "sampling/sampling_logp_difference/max": 1.2071449756622314, "sampling/sampling_logp_difference/mean": 0.4408376216888428, "step": 167, "step_time": 6.094906667000032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 3.062567561864853, "epoch": 0.00168, "grad_norm": 0.11885369569063187, "kl": 1.3122014254331589, "learning_rate": 9.999991936181406e-06, "loss": -0.0005, "step": 168, "step_time": 3.153792119999707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.939668595790863, "epoch": 0.00169, "frac_reward_zero_std": 0.0, "grad_norm": 0.26487165689468384, "kl": 1.1495357155799866, "learning_rate": 9.999991813539582e-06, "loss": -0.0027, "num_tokens": 3442030.0, "reward": 0.1040605679154396, "reward_std": 0.0003108101664111018, "rewards/rollout_reward_func/mean": 0.1040605679154396, "rewards/rollout_reward_func/std": 0.0012002212461084127, "sampling/importance_sampling_ratio/max": 0.6398003101348877, "sampling/importance_sampling_ratio/mean": 0.48581477999687195, "sampling/importance_sampling_ratio/min": 0.08106659352779388, "sampling/sampling_logp_difference/max": 1.8439216613769531, "sampling/sampling_logp_difference/mean": 0.4256422817707062, "step": 169, "step_time": 5.819434368990187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.078125, "entropy": 2.9098407328128815, "epoch": 0.0017, "grad_norm": 0.04170040413737297, "kl": 1.1434208825230598, "learning_rate": 9.999991689972159e-06, "loss": -0.0034, "step": 170, "step_time": 3.0891880889903405 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.096774101257324, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.322201192378998, "epoch": 0.00171, "frac_reward_zero_std": 0.75, "grad_norm": 0.08540276437997818, "kl": 1.1183969974517822, "learning_rate": 9.999991565479141e-06, "loss": 0.0039, "num_tokens": 3480309.0, "reward": 0.2482661008834839, "reward_std": 0.2835019528865814, "rewards/rollout_reward_func/mean": 0.2482661008834839, "rewards/rollout_reward_func/std": 0.7541937232017517, "sampling/importance_sampling_ratio/max": 0.5319186449050903, "sampling/importance_sampling_ratio/mean": 0.4513311982154846, "sampling/importance_sampling_ratio/min": 5.890941068287248e-09, "sampling/sampling_logp_difference/max": 4.415996551513672, "sampling/sampling_logp_difference/mean": 0.5671193599700928, "step": 171, "step_time": 6.15850018501078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.327172815799713, "epoch": 0.00172, "grad_norm": 0.16008314490318298, "kl": 1.1121483370661736, "learning_rate": 9.999991440060524e-06, "loss": 0.0039, "step": 172, "step_time": 3.0537015330119175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8874706029891968, "epoch": 0.00173, "frac_reward_zero_std": 0.25, "grad_norm": 0.05827116593718529, "kl": 1.175444170832634, "learning_rate": 9.99999131371631e-06, "loss": -0.0019, "num_tokens": 3521034.0, "reward": 0.24405920505523682, "reward_std": 0.137240931391716, "rewards/rollout_reward_func/mean": 0.24405920505523682, "rewards/rollout_reward_func/std": 0.3579370081424713, "sampling/importance_sampling_ratio/max": 0.6063523292541504, "sampling/importance_sampling_ratio/mean": 0.5063562989234924, "sampling/importance_sampling_ratio/min": 0.12135006487369537, "sampling/sampling_logp_difference/max": 1.4151973724365234, "sampling/sampling_logp_difference/mean": 0.3711392283439636, "step": 173, "step_time": 5.651538388010522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.8923940658569336, "epoch": 0.00174, "grad_norm": 0.0607294999063015, "kl": 1.1910716891288757, "learning_rate": 9.999991186446498e-06, "loss": -0.0022, "step": 174, "step_time": 3.065023601986468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.80071821808815, "epoch": 0.00175, "frac_reward_zero_std": 0.75, "grad_norm": 0.0142047805711627, "kl": 1.0754879489541054, "learning_rate": 9.99999105825109e-06, "loss": 0.0005, "num_tokens": 3563650.0, "reward": 0.10491235554218292, "reward_std": 0.0002869318414013833, "rewards/rollout_reward_func/mean": 0.10491235554218292, "rewards/rollout_reward_func/std": 0.0005648896913044155, "sampling/importance_sampling_ratio/max": 0.5579941272735596, "sampling/importance_sampling_ratio/mean": 0.486553430557251, "sampling/importance_sampling_ratio/min": 0.08260821551084518, "sampling/sampling_logp_difference/max": 1.7915232181549072, "sampling/sampling_logp_difference/mean": 0.4026690721511841, "step": 175, "step_time": 5.667827104000025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.7998538613319397, "epoch": 0.00176, "grad_norm": 0.03960075229406357, "kl": 1.152650646865368, "learning_rate": 9.999990929130086e-06, "loss": 0.0005, "step": 176, "step_time": 3.515853357006563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.35668683052063, "epoch": 0.00177, "frac_reward_zero_std": 0.5, "grad_norm": 0.03751267120242119, "kl": 0.9379284754395485, "learning_rate": 9.999990799083483e-06, "loss": -0.0094, "num_tokens": 3606106.0, "reward": 0.0981510654091835, "reward_std": 0.01846645027399063, "rewards/rollout_reward_func/mean": 0.0981510654091835, "rewards/rollout_reward_func/std": 0.025749722495675087, "sampling/importance_sampling_ratio/max": 0.7818470597267151, "sampling/importance_sampling_ratio/mean": 0.5020959377288818, "sampling/importance_sampling_ratio/min": 2.9064048289745715e-16, "sampling/sampling_logp_difference/max": 4.660819053649902, "sampling/sampling_logp_difference/mean": 0.8528721928596497, "step": 177, "step_time": 6.349083066997991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.357927292585373, "epoch": 0.00178, "grad_norm": 0.0248373132199049, "kl": 0.9389834105968475, "learning_rate": 9.999990668111284e-06, "loss": -0.0094, "step": 178, "step_time": 3.114700108009856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2827356159687042, "epoch": 0.00179, "frac_reward_zero_std": 0.5, "grad_norm": 0.031863220036029816, "kl": 1.0419833660125732, "learning_rate": 9.999990536213489e-06, "loss": -0.0024, "num_tokens": 3648190.0, "reward": 0.10190771520137787, "reward_std": 0.010095315054059029, "rewards/rollout_reward_func/mean": 0.10190771520137787, "rewards/rollout_reward_func/std": 0.018685033544898033, "sampling/importance_sampling_ratio/max": 0.5324880480766296, "sampling/importance_sampling_ratio/mean": 0.47539302706718445, "sampling/importance_sampling_ratio/min": 1.2442981757801452e-13, "sampling/sampling_logp_difference/max": 4.417169570922852, "sampling/sampling_logp_difference/mean": 0.621713399887085, "step": 179, "step_time": 5.854680160016869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2983651757240295, "epoch": 0.0018, "grad_norm": 0.03253919258713722, "kl": 1.039329893887043, "learning_rate": 9.999990403390095e-06, "loss": -0.0024, "step": 180, "step_time": 3.086985930996889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.075312703847885, "epoch": 0.00181, "frac_reward_zero_std": 0.0, "grad_norm": 0.08742506057024002, "kl": 1.0626276284456253, "learning_rate": 9.999990269641104e-06, "loss": -0.0013, "num_tokens": 3689332.0, "reward": 0.22649192810058594, "reward_std": 0.1368427872657776, "rewards/rollout_reward_func/mean": 0.22649192810058594, "rewards/rollout_reward_func/std": 0.3349002003669739, "sampling/importance_sampling_ratio/max": 0.6287684440612793, "sampling/importance_sampling_ratio/mean": 0.4783359169960022, "sampling/importance_sampling_ratio/min": 0.08426797389984131, "sampling/sampling_logp_difference/max": 1.7224879264831543, "sampling/sampling_logp_difference/mean": 0.43184757232666016, "step": 181, "step_time": 5.680436845992517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.104294538497925, "epoch": 0.00182, "grad_norm": 0.09242373704910278, "kl": 1.0521759241819382, "learning_rate": 9.999990134966518e-06, "loss": -0.0015, "step": 182, "step_time": 3.9492803850007476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.140883892774582, "epoch": 0.00183, "frac_reward_zero_std": 0.75, "grad_norm": 0.04976051673293114, "kl": 0.9197333976626396, "learning_rate": 9.999989999366333e-06, "loss": -0.0013, "num_tokens": 3725306.0, "reward": 0.7133400440216064, "reward_std": 0.2605983018875122, "rewards/rollout_reward_func/mean": 0.7133400440216064, "rewards/rollout_reward_func/std": 0.6536937952041626, "sampling/importance_sampling_ratio/max": 0.5335836410522461, "sampling/importance_sampling_ratio/mean": 0.4689674377441406, "sampling/importance_sampling_ratio/min": 0.2392517775297165, "sampling/sampling_logp_difference/max": 1.4556828737258911, "sampling/sampling_logp_difference/mean": 0.3905387222766876, "step": 183, "step_time": 5.4669754380011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.184466302394867, "epoch": 0.00184, "grad_norm": 0.03425725921988487, "kl": 0.9138208255171776, "learning_rate": 9.999989862840553e-06, "loss": -0.0015, "step": 184, "step_time": 3.031428288995812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.307426691055298, "epoch": 0.00185, "frac_reward_zero_std": 0.0, "grad_norm": 0.015808088704943657, "kl": 1.0711713954806328, "learning_rate": 9.999989725389174e-06, "loss": -0.0049, "num_tokens": 3768004.0, "reward": 0.09825509786605835, "reward_std": 0.012305784039199352, "rewards/rollout_reward_func/mean": 0.09825509786605835, "rewards/rollout_reward_func/std": 0.025777338072657585, "sampling/importance_sampling_ratio/max": 0.5722131729125977, "sampling/importance_sampling_ratio/mean": 0.4847007989883423, "sampling/importance_sampling_ratio/min": 3.631208804357708e-16, "sampling/sampling_logp_difference/max": 4.230863571166992, "sampling/sampling_logp_difference/mean": 0.6603448987007141, "step": 185, "step_time": 5.819966102011676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3535059988498688, "epoch": 0.00186, "grad_norm": 0.016078084707260132, "kl": 1.064581759274006, "learning_rate": 9.9999895870122e-06, "loss": -0.0049, "step": 186, "step_time": 3.094432375000906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 3.03125, "completions/mean_terminated_length": 2.1666667461395264, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.212633401155472, "epoch": 0.00187, "frac_reward_zero_std": 0.5, "grad_norm": 0.08338230848312378, "kl": 0.9381228722631931, "learning_rate": 9.999989447709628e-06, "loss": -0.0078, "num_tokens": 3808414.0, "reward": 0.2405119240283966, "reward_std": 0.1517173945903778, "rewards/rollout_reward_func/mean": 0.2405119240283966, "rewards/rollout_reward_func/std": 0.3686937391757965, "sampling/importance_sampling_ratio/max": 0.527626633644104, "sampling/importance_sampling_ratio/mean": 0.33936697244644165, "sampling/importance_sampling_ratio/min": 2.8206352184478867e-12, "sampling/sampling_logp_difference/max": 4.107314586639404, "sampling/sampling_logp_difference/mean": 0.9883315563201904, "step": 187, "step_time": 5.833929234999232 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.2318147122859955, "epoch": 0.00188, "grad_norm": 0.03932695463299751, "kl": 0.9462133981287479, "learning_rate": 9.99998930748146e-06, "loss": -0.0079, "step": 188, "step_time": 3.9323983890208183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4181185960769653, "epoch": 0.00189, "frac_reward_zero_std": 0.25, "grad_norm": 0.04768482223153114, "kl": 1.0852136835455894, "learning_rate": 9.999989166327695e-06, "loss": 0.001, "num_tokens": 3850568.0, "reward": 0.10414525866508484, "reward_std": 0.0006422754959203303, "rewards/rollout_reward_func/mean": 0.10414525866508484, "rewards/rollout_reward_func/std": 0.0012503702891990542, "sampling/importance_sampling_ratio/max": 0.5867984890937805, "sampling/importance_sampling_ratio/mean": 0.4314883351325989, "sampling/importance_sampling_ratio/min": 0.0719183012843132, "sampling/sampling_logp_difference/max": 2.230182647705078, "sampling/sampling_logp_difference/mean": 0.5168430209159851, "step": 189, "step_time": 5.731634771014797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.440379321575165, "epoch": 0.0019, "grad_norm": 0.049473874270915985, "kl": 1.0757825300097466, "learning_rate": 9.999989024248333e-06, "loss": 0.001, "step": 190, "step_time": 3.039653528991039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.1935482025146484, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7905180752277374, "epoch": 0.00191, "frac_reward_zero_std": 0.0, "grad_norm": 0.05591803044080734, "kl": 1.313558466732502, "learning_rate": 9.999988881243376e-06, "loss": -0.0075, "num_tokens": 3891838.0, "reward": 0.09106000512838364, "reward_std": 0.019212841987609863, "rewards/rollout_reward_func/mean": 0.09106000512838364, "rewards/rollout_reward_func/std": 0.03151549771428108, "sampling/importance_sampling_ratio/max": 0.5269871950149536, "sampling/importance_sampling_ratio/mean": 0.3762998580932617, "sampling/importance_sampling_ratio/min": 3.520064808526513e-07, "sampling/sampling_logp_difference/max": 3.4528965950012207, "sampling/sampling_logp_difference/mean": 0.6665354371070862, "step": 191, "step_time": 5.818886676999682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8096237182617188, "epoch": 0.00192, "grad_norm": 0.0643792375922203, "kl": 1.318204626441002, "learning_rate": 9.99998873731282e-06, "loss": -0.0075, "step": 192, "step_time": 3.0796325400006026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.5113094449043274, "epoch": 0.00193, "frac_reward_zero_std": 0.25, "grad_norm": 0.12636831402778625, "kl": 0.9427657425403595, "learning_rate": 9.99998859245667e-06, "loss": -0.0019, "num_tokens": 3933605.0, "reward": 0.31205272674560547, "reward_std": 0.11043389141559601, "rewards/rollout_reward_func/mean": 0.31205272674560547, "rewards/rollout_reward_func/std": 0.4202013313770294, "sampling/importance_sampling_ratio/max": 0.5263416767120361, "sampling/importance_sampling_ratio/mean": 0.44312649965286255, "sampling/importance_sampling_ratio/min": 2.656977285755979e-13, "sampling/sampling_logp_difference/max": 3.7979488372802734, "sampling/sampling_logp_difference/mean": 0.6642450094223022, "step": 193, "step_time": 5.8634724090006785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.4996842741966248, "epoch": 0.00194, "grad_norm": 0.047876615077257156, "kl": 0.9746604859828949, "learning_rate": 9.999988446674922e-06, "loss": -0.002, "step": 194, "step_time": 3.6142754290121957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 3.5625, "completions/mean_terminated_length": 2.7333335876464844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.056300908327103, "epoch": 0.00195, "frac_reward_zero_std": 0.0, "grad_norm": 0.08196251094341278, "kl": 0.9415072277188301, "learning_rate": 9.999988299967575e-06, "loss": -0.0092, "num_tokens": 3974719.0, "reward": 0.08687788248062134, "reward_std": 0.033235762268304825, "rewards/rollout_reward_func/mean": 0.08687788248062134, "rewards/rollout_reward_func/std": 0.038032416254282, "sampling/importance_sampling_ratio/max": 0.5310492515563965, "sampling/importance_sampling_ratio/mean": 0.3413558304309845, "sampling/importance_sampling_ratio/min": 3.134405102546249e-14, "sampling/sampling_logp_difference/max": 4.91648006439209, "sampling/sampling_logp_difference/mean": 0.9509263038635254, "step": 195, "step_time": 6.065785322993179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "entropy": 4.033932089805603, "epoch": 0.00196, "grad_norm": 0.07318440824747086, "kl": 0.9330675974488258, "learning_rate": 9.999988152334635e-06, "loss": -0.0094, "step": 196, "step_time": 3.0818552100026864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4730121195316315, "epoch": 0.00197, "frac_reward_zero_std": 0.0, "grad_norm": 0.06865391880273819, "kl": 0.8950194120407104, "learning_rate": 9.999988003776098e-06, "loss": -0.0071, "num_tokens": 4014767.0, "reward": 0.3478992283344269, "reward_std": 0.01861293613910675, "rewards/rollout_reward_func/mean": 0.3478992283344269, "rewards/rollout_reward_func/std": 0.43660980463027954, "sampling/importance_sampling_ratio/max": 0.5349845886230469, "sampling/importance_sampling_ratio/mean": 0.45049068331718445, "sampling/importance_sampling_ratio/min": 2.3987625774601895e-10, "sampling/sampling_logp_difference/max": 4.497707843780518, "sampling/sampling_logp_difference/mean": 0.7104134559631348, "step": 197, "step_time": 5.792226134013617 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.394730180501938, "epoch": 0.00198, "grad_norm": 0.03389181196689606, "kl": 0.8903237357735634, "learning_rate": 9.999987854291966e-06, "loss": -0.0075, "step": 198, "step_time": 3.0901418659923365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2352229356765747, "epoch": 0.00199, "frac_reward_zero_std": 0.75, "grad_norm": 0.026221705600619316, "kl": 1.2221079543232918, "learning_rate": 9.999987703882235e-06, "loss": 0.0011, "num_tokens": 4057199.0, "reward": -0.16853860020637512, "reward_std": 0.0002275537553941831, "rewards/rollout_reward_func/mean": -0.16853860020637512, "rewards/rollout_reward_func/std": 0.4811019003391266, "sampling/importance_sampling_ratio/max": 0.5865843892097473, "sampling/importance_sampling_ratio/mean": 0.46995362639427185, "sampling/importance_sampling_ratio/min": 0.1251952052116394, "sampling/sampling_logp_difference/max": 2.0189950466156006, "sampling/sampling_logp_difference/mean": 0.43730515241622925, "step": 199, "step_time": 6.403038424003171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1884956061840057, "epoch": 0.002, "grad_norm": 0.01956435851752758, "kl": 1.2282055467367172, "learning_rate": 9.999987552546909e-06, "loss": 0.0012, "step": 200, "step_time": 3.6340297870119684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.01334211230278, "epoch": 0.00201, "frac_reward_zero_std": 0.5, "grad_norm": 0.07650106400251389, "kl": 1.129251167178154, "learning_rate": 9.999987400285985e-06, "loss": 0.0073, "num_tokens": 4099525.0, "reward": -0.057673532515764236, "reward_std": 0.12961073219776154, "rewards/rollout_reward_func/mean": -0.057673532515764236, "rewards/rollout_reward_func/std": 0.3758700489997864, "sampling/importance_sampling_ratio/max": 0.5737296342849731, "sampling/importance_sampling_ratio/mean": 0.48970121145248413, "sampling/importance_sampling_ratio/min": 0.1675138622522354, "sampling/sampling_logp_difference/max": 1.5558722019195557, "sampling/sampling_logp_difference/mean": 0.38629350066185, "step": 201, "step_time": 5.595162739999068 }, { "clip_ratio/high_max": 0.125, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 3.0064628422260284, "epoch": 0.00202, "grad_norm": 0.020587949082255363, "kl": 1.1225366070866585, "learning_rate": 9.999987247099467e-06, "loss": 0.007, "step": 202, "step_time": 3.0522180030093295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0323283076286316, "epoch": 0.00203, "frac_reward_zero_std": 0.5, "grad_norm": 0.04398106038570404, "kl": 1.1479391157627106, "learning_rate": 9.999987092987352e-06, "loss": 0.0027, "num_tokens": 4139087.0, "reward": 0.35510963201522827, "reward_std": 0.0009730220190249383, "rewards/rollout_reward_func/mean": 0.35510963201522827, "rewards/rollout_reward_func/std": 0.43927597999572754, "sampling/importance_sampling_ratio/max": 0.5706567764282227, "sampling/importance_sampling_ratio/mean": 0.48784390091896057, "sampling/importance_sampling_ratio/min": 0.24014858901500702, "sampling/sampling_logp_difference/max": 1.1894651651382446, "sampling/sampling_logp_difference/mean": 0.37197446823120117, "step": 203, "step_time": 5.677023896001629 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.0169288218021393, "epoch": 0.00204, "grad_norm": 0.02346901036798954, "kl": 1.1376503184437752, "learning_rate": 9.999986937949641e-06, "loss": 0.0025, "step": 204, "step_time": 3.070843043991772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1219694316387177, "epoch": 0.00205, "frac_reward_zero_std": 0.0, "grad_norm": 0.16872908174991608, "kl": 1.3547747731208801, "learning_rate": 9.999986781986334e-06, "loss": 0.0029, "num_tokens": 4181077.0, "reward": 0.10579735040664673, "reward_std": 0.0016780896112322807, "rewards/rollout_reward_func/mean": 0.10579735040664673, "rewards/rollout_reward_func/std": 0.002675863681361079, "sampling/importance_sampling_ratio/max": 0.5779612064361572, "sampling/importance_sampling_ratio/mean": 0.46720463037490845, "sampling/importance_sampling_ratio/min": 0.11750894039869308, "sampling/sampling_logp_difference/max": 1.3703045845031738, "sampling/sampling_logp_difference/mean": 0.42310434579849243, "step": 205, "step_time": 6.669799314993725 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 3.2185517251491547, "epoch": 0.00206, "grad_norm": 0.0882721096277237, "kl": 1.2883388549089432, "learning_rate": 9.999986625097431e-06, "loss": 0.0023, "step": 206, "step_time": 3.0785389119992033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3229293823242188, "epoch": 0.00207, "frac_reward_zero_std": 0.0, "grad_norm": 0.1134374737739563, "kl": 0.9719199165701866, "learning_rate": 9.999986467282931e-06, "loss": 0.0053, "num_tokens": 4222144.0, "reward": -0.15626564621925354, "reward_std": 0.08119312673807144, "rewards/rollout_reward_func/mean": -0.15626564621925354, "rewards/rollout_reward_func/std": 0.4850826859474182, "sampling/importance_sampling_ratio/max": 0.5526522994041443, "sampling/importance_sampling_ratio/mean": 0.48423677682876587, "sampling/importance_sampling_ratio/min": 0.18179330229759216, "sampling/sampling_logp_difference/max": 1.3798019886016846, "sampling/sampling_logp_difference/mean": 0.3824259340763092, "step": 207, "step_time": 5.672306085005403 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.538396805524826, "epoch": 0.00208, "grad_norm": 0.10532759130001068, "kl": 0.951496034860611, "learning_rate": 9.999986308542834e-06, "loss": 0.0047, "step": 208, "step_time": 3.038938960984524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9349718391895294, "epoch": 0.00209, "frac_reward_zero_std": 0.25, "grad_norm": 0.1274743378162384, "kl": 1.1178549379110336, "learning_rate": 9.999986148877143e-06, "loss": 0.0007, "num_tokens": 4263609.0, "reward": 0.09666769206523895, "reward_std": 0.007862415164709091, "rewards/rollout_reward_func/mean": 0.09666769206523895, "rewards/rollout_reward_func/std": 0.020571541041135788, "sampling/importance_sampling_ratio/max": 0.5479804277420044, "sampling/importance_sampling_ratio/mean": 0.4341402053833008, "sampling/importance_sampling_ratio/min": 4.769947554450482e-05, "sampling/sampling_logp_difference/max": 4.468949317932129, "sampling/sampling_logp_difference/mean": 0.5288084745407104, "step": 209, "step_time": 6.00973936200171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9365717470645905, "epoch": 0.0021, "grad_norm": 0.12229771912097931, "kl": 1.1176573187112808, "learning_rate": 9.999985988285857e-06, "loss": 0.0007, "step": 210, "step_time": 3.150281561014708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7732940912246704, "epoch": 0.00211, "frac_reward_zero_std": 0.0, "grad_norm": 0.1511836051940918, "kl": 1.013918623328209, "learning_rate": 9.999985826768975e-06, "loss": -0.0085, "num_tokens": 4304699.0, "reward": 0.34872573614120483, "reward_std": 0.019251566380262375, "rewards/rollout_reward_func/mean": 0.34872573614120483, "rewards/rollout_reward_func/std": 0.43613797426223755, "sampling/importance_sampling_ratio/max": 0.5340574979782104, "sampling/importance_sampling_ratio/mean": 0.4492183327674866, "sampling/importance_sampling_ratio/min": 6.285296588748324e-11, "sampling/sampling_logp_difference/max": 3.7368485927581787, "sampling/sampling_logp_difference/mean": 0.7556399703025818, "step": 211, "step_time": 6.614602826019109 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.764702945947647, "epoch": 0.00212, "grad_norm": 0.07976053655147552, "kl": 1.0212063416838646, "learning_rate": 9.999985664326495e-06, "loss": -0.0088, "step": 212, "step_time": 3.094371262995992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.301596850156784, "epoch": 0.00213, "frac_reward_zero_std": 0.25, "grad_norm": 0.09150262176990509, "kl": 1.1500666588544846, "learning_rate": 9.99998550095842e-06, "loss": -0.0057, "num_tokens": 4344770.0, "reward": 0.3101494014263153, "reward_std": 0.1246422529220581, "rewards/rollout_reward_func/mean": 0.3101494014263153, "rewards/rollout_reward_func/std": 0.5270316004753113, "sampling/importance_sampling_ratio/max": 0.5314987897872925, "sampling/importance_sampling_ratio/mean": 0.47096309065818787, "sampling/importance_sampling_ratio/min": 0.09663449972867966, "sampling/sampling_logp_difference/max": 1.4566822052001953, "sampling/sampling_logp_difference/mean": 0.39800477027893066, "step": 213, "step_time": 5.85133547699661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.2723195552825928, "epoch": 0.00214, "grad_norm": 0.04962296038866043, "kl": 1.1633280515670776, "learning_rate": 9.999985336664749e-06, "loss": -0.0059, "step": 214, "step_time": 3.067458388992236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.687484085559845, "epoch": 0.00215, "frac_reward_zero_std": 0.25, "grad_norm": 0.16615843772888184, "kl": 0.9298897534608841, "learning_rate": 9.999985171445482e-06, "loss": -0.0011, "num_tokens": 4385705.0, "reward": -0.18701379001140594, "reward_std": 0.005726714618504047, "rewards/rollout_reward_func/mean": -0.18701379001140594, "rewards/rollout_reward_func/std": 0.512151300907135, "sampling/importance_sampling_ratio/max": 0.5357128977775574, "sampling/importance_sampling_ratio/mean": 0.44710949063301086, "sampling/importance_sampling_ratio/min": 0.23366554081439972, "sampling/sampling_logp_difference/max": 0.7602696418762207, "sampling/sampling_logp_difference/mean": 0.41456338763237, "step": 215, "step_time": 5.753249802000937 }, { "clip_ratio/high_max": 0.09375, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 3.757579505443573, "epoch": 0.00216, "grad_norm": 0.11860741674900055, "kl": 0.925678163766861, "learning_rate": 9.99998500530062e-06, "loss": -0.0018, "step": 216, "step_time": 3.07244190799247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8419592082500458, "epoch": 0.00217, "frac_reward_zero_std": 0.5, "grad_norm": 0.034585658460855484, "kl": 1.0210292339324951, "learning_rate": 9.999984838230163e-06, "loss": -0.0033, "num_tokens": 4427355.0, "reward": -0.17084823548793793, "reward_std": 0.00018171124975197017, "rewards/rollout_reward_func/mean": -0.17084823548793793, "rewards/rollout_reward_func/std": 0.48548567295074463, "sampling/importance_sampling_ratio/max": 0.5612608194351196, "sampling/importance_sampling_ratio/mean": 0.44698256254196167, "sampling/importance_sampling_ratio/min": 3.45093989191983e-13, "sampling/sampling_logp_difference/max": 3.429149627685547, "sampling/sampling_logp_difference/mean": 0.652337908744812, "step": 217, "step_time": 6.869651154993335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.96540305018425, "epoch": 0.00218, "grad_norm": 0.03025275468826294, "kl": 1.013981468975544, "learning_rate": 9.999984670234109e-06, "loss": -0.0034, "step": 218, "step_time": 3.0875530690100277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.028893679380417, "epoch": 0.00219, "frac_reward_zero_std": 0.0, "grad_norm": 0.22829869389533997, "kl": 0.8682743459939957, "learning_rate": 9.99998450131246e-06, "loss": -0.0013, "num_tokens": 4467323.0, "reward": 0.2657945156097412, "reward_std": 0.20264331996440887, "rewards/rollout_reward_func/mean": 0.2657945156097412, "rewards/rollout_reward_func/std": 0.47996053099632263, "sampling/importance_sampling_ratio/max": 0.5363833904266357, "sampling/importance_sampling_ratio/mean": 0.41862747073173523, "sampling/importance_sampling_ratio/min": 8.349894109471734e-09, "sampling/sampling_logp_difference/max": 3.609619617462158, "sampling/sampling_logp_difference/mean": 0.5638303160667419, "step": 219, "step_time": 5.913487627003633 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 4.010756075382233, "epoch": 0.0022, "grad_norm": 0.11434531211853027, "kl": 0.8748476356267929, "learning_rate": 9.999984331465216e-06, "loss": -0.0016, "step": 220, "step_time": 3.0692228019906906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.344943046569824, "epoch": 0.00221, "frac_reward_zero_std": 0.75, "grad_norm": 0.05820532515645027, "kl": 0.8804432451725006, "learning_rate": 9.999984160692378e-06, "loss": 0.004, "num_tokens": 4507343.0, "reward": 0.35496869683265686, "reward_std": 0.00035730033414438367, "rewards/rollout_reward_func/mean": 0.35496869683265686, "rewards/rollout_reward_func/std": 0.4389958679676056, "sampling/importance_sampling_ratio/max": 0.538268506526947, "sampling/importance_sampling_ratio/mean": 0.47968432307243347, "sampling/importance_sampling_ratio/min": 0.2892380654811859, "sampling/sampling_logp_difference/max": 0.7567209601402283, "sampling/sampling_logp_difference/mean": 0.3736250400543213, "step": 221, "step_time": 5.90626233000512 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.285503089427948, "epoch": 0.00222, "grad_norm": 0.04278058186173439, "kl": 0.8884100764989853, "learning_rate": 9.999983988993942e-06, "loss": 0.0038, "step": 222, "step_time": 3.5212722959913663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.096774101257324, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8244956731796265, "epoch": 0.00223, "frac_reward_zero_std": 0.0, "grad_norm": 0.05055146664381027, "kl": 0.7781495861709118, "learning_rate": 9.99998381636991e-06, "loss": -0.0081, "num_tokens": 4550508.0, "reward": 0.09855036437511444, "reward_std": 0.018824413418769836, "rewards/rollout_reward_func/mean": 0.09855036437511444, "rewards/rollout_reward_func/std": 0.025901876389980316, "sampling/importance_sampling_ratio/max": 0.5900764465332031, "sampling/importance_sampling_ratio/mean": 0.4548878073692322, "sampling/importance_sampling_ratio/min": 2.4932991435355767e-11, "sampling/sampling_logp_difference/max": 4.977603912353516, "sampling/sampling_logp_difference/mean": 0.6876980662345886, "step": 223, "step_time": 6.303814834980585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7239099740982056, "epoch": 0.00224, "grad_norm": 0.05007939040660858, "kl": 0.7903397865593433, "learning_rate": 9.999983642820286e-06, "loss": -0.0083, "step": 224, "step_time": 3.1326980529993307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4683939814567566, "epoch": 0.00225, "frac_reward_zero_std": 0.75, "grad_norm": 0.008336830884218216, "kl": 1.0281647071242332, "learning_rate": 9.999983468345063e-06, "loss": -0.0037, "num_tokens": 4593572.0, "reward": 0.10190531611442566, "reward_std": 0.009437021799385548, "rewards/rollout_reward_func/mean": 0.10190531611442566, "rewards/rollout_reward_func/std": 0.018620258197188377, "sampling/importance_sampling_ratio/max": 0.5419731736183167, "sampling/importance_sampling_ratio/mean": 0.4800655245780945, "sampling/importance_sampling_ratio/min": 3.5716611959668576e-12, "sampling/sampling_logp_difference/max": 3.197612762451172, "sampling/sampling_logp_difference/mean": 0.5817776322364807, "step": 225, "step_time": 6.10778537200531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.384273409843445, "epoch": 0.00226, "grad_norm": 0.0072293756529688835, "kl": 1.036849968135357, "learning_rate": 9.999983292944247e-06, "loss": -0.0037, "step": 226, "step_time": 3.1810197519953363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.070807695388794, "epoch": 0.00227, "frac_reward_zero_std": 0.5, "grad_norm": 0.0319557711482048, "kl": 0.9699734672904015, "learning_rate": 9.999983116617835e-06, "loss": 0.0015, "num_tokens": 4634831.0, "reward": 0.35298365354537964, "reward_std": 0.0006889895885251462, "rewards/rollout_reward_func/mean": 0.35298365354537964, "rewards/rollout_reward_func/std": 0.4362066388130188, "sampling/importance_sampling_ratio/max": 0.5395305156707764, "sampling/importance_sampling_ratio/mean": 0.502470076084137, "sampling/importance_sampling_ratio/min": 0.40259674191474915, "sampling/sampling_logp_difference/max": 0.7184443473815918, "sampling/sampling_logp_difference/mean": 0.3460143208503723, "step": 227, "step_time": 5.901703498013376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.011317938566208, "epoch": 0.00228, "grad_norm": 0.03411926329135895, "kl": 0.9751377105712891, "learning_rate": 9.999982939365828e-06, "loss": 0.0015, "step": 228, "step_time": 4.0468402309925295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.930236577987671, "epoch": 0.00229, "frac_reward_zero_std": 0.75, "grad_norm": 0.06445597857236862, "kl": 0.8648795112967491, "learning_rate": 9.999982761188226e-06, "loss": 0.001, "num_tokens": 4675933.0, "reward": 0.352865606546402, "reward_std": 0.00039271340938284993, "rewards/rollout_reward_func/mean": 0.352865606546402, "rewards/rollout_reward_func/std": 0.4402967393398285, "sampling/importance_sampling_ratio/max": 0.5411971807479858, "sampling/importance_sampling_ratio/mean": 0.5085799694061279, "sampling/importance_sampling_ratio/min": 0.39823856949806213, "sampling/sampling_logp_difference/max": 0.7327598929405212, "sampling/sampling_logp_difference/mean": 0.3393840491771698, "step": 229, "step_time": 5.6176345640124055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9003276228904724, "epoch": 0.0023, "grad_norm": 0.056783527135849, "kl": 0.8621901944279671, "learning_rate": 9.999982582085029e-06, "loss": 0.0008, "step": 230, "step_time": 3.0043900579985348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.908518075942993, "epoch": 0.00231, "frac_reward_zero_std": 0.25, "grad_norm": 0.07860822230577469, "kl": 0.8797250986099243, "learning_rate": 9.999982402056237e-06, "loss": 0.0034, "num_tokens": 4716532.0, "reward": 0.35270029306411743, "reward_std": 0.00033244348014704883, "rewards/rollout_reward_func/mean": 0.35270029306411743, "rewards/rollout_reward_func/std": 0.4363725185394287, "sampling/importance_sampling_ratio/max": 0.5991141200065613, "sampling/importance_sampling_ratio/mean": 0.5068061351776123, "sampling/importance_sampling_ratio/min": 0.10003629326820374, "sampling/sampling_logp_difference/max": 1.5748744010925293, "sampling/sampling_logp_difference/mean": 0.3654421269893646, "step": 231, "step_time": 5.720960710983491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8922443985939026, "epoch": 0.00232, "grad_norm": 0.07824774831533432, "kl": 0.8780804947018623, "learning_rate": 9.999982221101849e-06, "loss": 0.0032, "step": 232, "step_time": 3.0716037020174554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.913668006658554, "epoch": 0.00233, "frac_reward_zero_std": 1.0, "grad_norm": 0.001364217372611165, "kl": 1.1125427410006523, "learning_rate": 9.999982039221867e-06, "loss": 0.0014, "num_tokens": 4757152.0, "reward": 0.3542647063732147, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.3542647063732147, "rewards/rollout_reward_func/std": 0.4397609531879425, "sampling/importance_sampling_ratio/max": 0.6305478811264038, "sampling/importance_sampling_ratio/mean": 0.5248870849609375, "sampling/importance_sampling_ratio/min": 0.4736892879009247, "sampling/sampling_logp_difference/max": 0.742901086807251, "sampling/sampling_logp_difference/mean": 0.33867835998535156, "step": 233, "step_time": 6.004552255995804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.903167337179184, "epoch": 0.00234, "grad_norm": 0.0014363423688337207, "kl": 1.112814411520958, "learning_rate": 9.99998185641629e-06, "loss": 0.0014, "step": 234, "step_time": 4.124413916993944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.875055432319641, "epoch": 0.00235, "frac_reward_zero_std": 0.25, "grad_norm": 0.11083708703517914, "kl": 1.1022979840636253, "learning_rate": 9.999981672685119e-06, "loss": 0.0014, "num_tokens": 4796713.0, "reward": 0.35448789596557617, "reward_std": 0.0002665168431121856, "rewards/rollout_reward_func/mean": 0.35448789596557617, "rewards/rollout_reward_func/std": 0.43963006138801575, "sampling/importance_sampling_ratio/max": 0.5667515993118286, "sampling/importance_sampling_ratio/mean": 0.524036169052124, "sampling/importance_sampling_ratio/min": 0.4953070282936096, "sampling/sampling_logp_difference/max": 0.7007182836532593, "sampling/sampling_logp_difference/mean": 0.3346641957759857, "step": 235, "step_time": 5.686777784001606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.8624038994312286, "epoch": 0.00236, "grad_norm": 0.017689477652311325, "kl": 1.10901640355587, "learning_rate": 9.999981488028352e-06, "loss": 0.0013, "step": 236, "step_time": 3.069339071989816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9975451827049255, "epoch": 0.00237, "frac_reward_zero_std": 0.5, "grad_norm": 0.10727136582136154, "kl": 1.1399618238210678, "learning_rate": 9.99998130244599e-06, "loss": -0.0063, "num_tokens": 4837054.0, "reward": 0.05901163071393967, "reward_std": 0.009450226090848446, "rewards/rollout_reward_func/mean": 0.05901163071393967, "rewards/rollout_reward_func/std": 0.7745389938354492, "sampling/importance_sampling_ratio/max": 0.546413242816925, "sampling/importance_sampling_ratio/mean": 0.498068243265152, "sampling/importance_sampling_ratio/min": 0.00025696906959638, "sampling/sampling_logp_difference/max": 4.916529655456543, "sampling/sampling_logp_difference/mean": 0.441055566072464, "step": 237, "step_time": 5.693612243019743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 2.9855331480503082, "epoch": 0.00238, "grad_norm": 0.01252499595284462, "kl": 1.1465537324547768, "learning_rate": 9.999981115938033e-06, "loss": -0.0064, "step": 238, "step_time": 3.03204517600534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7528444826602936, "epoch": 0.00239, "frac_reward_zero_std": 1.0, "grad_norm": 0.00041618748218752444, "kl": 0.8719698637723923, "learning_rate": 9.999980928504482e-06, "loss": 0.0011, "num_tokens": 4873697.0, "reward": 1.0926213264465332, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0926213264465332, "rewards/rollout_reward_func/std": 0.01923803612589836, "sampling/importance_sampling_ratio/max": 0.5415135622024536, "sampling/importance_sampling_ratio/mean": 0.5236371755599976, "sampling/importance_sampling_ratio/min": 0.4970528185367584, "sampling/sampling_logp_difference/max": 0.697811484336853, "sampling/sampling_logp_difference/mean": 0.32362306118011475, "step": 239, "step_time": 5.102854053002375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7579506635665894, "epoch": 0.0024, "grad_norm": 0.000416004826547578, "kl": 0.8710389658808708, "learning_rate": 9.999980740145336e-06, "loss": 0.0011, "step": 240, "step_time": 3.735518785979366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.087509125471115, "epoch": 0.00241, "frac_reward_zero_std": 0.25, "grad_norm": 0.023360127583146095, "kl": 1.0799904838204384, "learning_rate": 9.999980550860597e-06, "loss": -0.0031, "num_tokens": 4915839.0, "reward": 0.09535069018602371, "reward_std": 0.009763619862496853, "rewards/rollout_reward_func/mean": 0.09535069018602371, "rewards/rollout_reward_func/std": 0.020053153857588768, "sampling/importance_sampling_ratio/max": 0.5872793197631836, "sampling/importance_sampling_ratio/mean": 0.5020751953125, "sampling/importance_sampling_ratio/min": 1.7428822274290212e-11, "sampling/sampling_logp_difference/max": 4.608532905578613, "sampling/sampling_logp_difference/mean": 0.5506347417831421, "step": 241, "step_time": 5.8289295400099945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0721403658390045, "epoch": 0.00242, "grad_norm": 0.021689431741833687, "kl": 1.0828538164496422, "learning_rate": 9.999980360650262e-06, "loss": -0.0032, "step": 242, "step_time": 3.0914951269951416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.108662933111191, "epoch": 0.00243, "frac_reward_zero_std": 0.25, "grad_norm": 0.10518987476825714, "kl": 0.7945884466171265, "learning_rate": 9.999980169514331e-06, "loss": -0.0055, "num_tokens": 4955148.0, "reward": 0.34263843297958374, "reward_std": 0.008983391337096691, "rewards/rollout_reward_func/mean": 0.34263843297958374, "rewards/rollout_reward_func/std": 0.4473282992839813, "sampling/importance_sampling_ratio/max": 0.5412325263023376, "sampling/importance_sampling_ratio/mean": 0.48479604721069336, "sampling/importance_sampling_ratio/min": 4.832455768877253e-09, "sampling/sampling_logp_difference/max": 3.605928421020508, "sampling/sampling_logp_difference/mean": 0.5301432013511658, "step": 243, "step_time": 5.838160598992545 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.100603610277176, "epoch": 0.00244, "grad_norm": 0.013229583390057087, "kl": 0.8032283633947372, "learning_rate": 9.999979977452809e-06, "loss": -0.0057, "step": 244, "step_time": 3.0734621299925493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0017272233963013, "epoch": 0.00245, "frac_reward_zero_std": 0.5, "grad_norm": 0.0250027347356081, "kl": 0.9993690624833107, "learning_rate": 9.999979784465691e-06, "loss": -0.0032, "num_tokens": 4992708.0, "reward": 0.6009950637817383, "reward_std": 0.009242101572453976, "rewards/rollout_reward_func/mean": 0.6009950637817383, "rewards/rollout_reward_func/std": 0.5113335251808167, "sampling/importance_sampling_ratio/max": 0.5419101119041443, "sampling/importance_sampling_ratio/mean": 0.496663898229599, "sampling/importance_sampling_ratio/min": 3.659067840389163e-12, "sampling/sampling_logp_difference/max": 3.472930908203125, "sampling/sampling_logp_difference/mean": 0.5697230696678162, "step": 245, "step_time": 6.189133913008845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.998829334974289, "epoch": 0.00246, "grad_norm": 0.02587025612592697, "kl": 1.0150217190384865, "learning_rate": 9.999979590552979e-06, "loss": -0.0032, "step": 246, "step_time": 3.526479265994567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.068217784166336, "epoch": 0.00247, "frac_reward_zero_std": 0.25, "grad_norm": 0.014431940391659737, "kl": 1.1364342123270035, "learning_rate": 9.999979395714672e-06, "loss": -0.0036, "num_tokens": 5034242.0, "reward": 0.34586021304130554, "reward_std": 0.007208445109426975, "rewards/rollout_reward_func/mean": 0.34586021304130554, "rewards/rollout_reward_func/std": 0.4451257884502411, "sampling/importance_sampling_ratio/max": 0.5398816466331482, "sampling/importance_sampling_ratio/mean": 0.5060387849807739, "sampling/importance_sampling_ratio/min": 4.943579170064472e-18, "sampling/sampling_logp_difference/max": 4.125819206237793, "sampling/sampling_logp_difference/mean": 0.7042371034622192, "step": 247, "step_time": 6.061298897991946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.066601574420929, "epoch": 0.00248, "grad_norm": 0.014098106883466244, "kl": 1.1360977590084076, "learning_rate": 9.99997919995077e-06, "loss": -0.0037, "step": 248, "step_time": 3.1685301100005745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.740210920572281, "epoch": 0.00249, "frac_reward_zero_std": 0.5, "grad_norm": 0.00964316539466381, "kl": 1.0989487320184708, "learning_rate": 9.999979003261275e-06, "loss": 0.0015, "num_tokens": 5075353.0, "reward": 0.3552860617637634, "reward_std": 0.0007686839671805501, "rewards/rollout_reward_func/mean": 0.3552860617637634, "rewards/rollout_reward_func/std": 0.4391622245311737, "sampling/importance_sampling_ratio/max": 0.5491593480110168, "sampling/importance_sampling_ratio/mean": 0.5245932340621948, "sampling/importance_sampling_ratio/min": 0.5111541152000427, "sampling/sampling_logp_difference/max": 0.670894980430603, "sampling/sampling_logp_difference/mean": 0.3276829123497009, "step": 249, "step_time": 5.675966454007721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7379478812217712, "epoch": 0.0025, "grad_norm": 0.008941994048655033, "kl": 1.098896473646164, "learning_rate": 9.999978805646186e-06, "loss": 0.0016, "step": 250, "step_time": 3.0780666540013044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7641094028949738, "epoch": 0.00251, "frac_reward_zero_std": 0.25, "grad_norm": 0.07680047303438187, "kl": 1.4994065910577774, "learning_rate": 9.999978607105502e-06, "loss": 0.0034, "num_tokens": 5116973.0, "reward": 0.10577338933944702, "reward_std": 0.0015080757439136505, "rewards/rollout_reward_func/mean": 0.10577338933944702, "rewards/rollout_reward_func/std": 0.003039788454771042, "sampling/importance_sampling_ratio/max": 0.5394718647003174, "sampling/importance_sampling_ratio/mean": 0.512673020362854, "sampling/importance_sampling_ratio/min": 0.2183951884508133, "sampling/sampling_logp_difference/max": 0.8738141059875488, "sampling/sampling_logp_difference/mean": 0.3494493365287781, "step": 251, "step_time": 6.247139054990839 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.767514705657959, "epoch": 0.00252, "grad_norm": 0.033075712621212006, "kl": 1.4566107392311096, "learning_rate": 9.999978407639225e-06, "loss": 0.0033, "step": 252, "step_time": 3.5260750749876024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.740439146757126, "epoch": 0.00253, "frac_reward_zero_std": 0.75, "grad_norm": 0.06382151693105698, "kl": 1.6625724136829376, "learning_rate": 9.999978207247353e-06, "loss": 0.0015, "num_tokens": 5154593.0, "reward": 0.6041310429573059, "reward_std": 4.844910654355772e-05, "rewards/rollout_reward_func/mean": 0.6041310429573059, "rewards/rollout_reward_func/std": 0.5078237652778625, "sampling/importance_sampling_ratio/max": 0.5445574522018433, "sampling/importance_sampling_ratio/mean": 0.5063398480415344, "sampling/importance_sampling_ratio/min": 0.24303308129310608, "sampling/sampling_logp_difference/max": 0.7480592727661133, "sampling/sampling_logp_difference/mean": 0.34943094849586487, "step": 253, "step_time": 5.570525845003431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7428268790245056, "epoch": 0.00254, "grad_norm": 0.023551415652036667, "kl": 1.321341134607792, "learning_rate": 9.999978005929887e-06, "loss": 0.0012, "step": 254, "step_time": 3.0110095819982234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7201876640319824, "epoch": 0.00255, "frac_reward_zero_std": 0.75, "grad_norm": 0.011906675063073635, "kl": 1.0794601738452911, "learning_rate": 9.999977803686829e-06, "loss": 0.0014, "num_tokens": 5192626.0, "reward": 0.8534371852874756, "reward_std": 0.000552260025870055, "rewards/rollout_reward_func/mean": 0.8534371852874756, "rewards/rollout_reward_func/std": 0.43690064549446106, "sampling/importance_sampling_ratio/max": 0.5387653112411499, "sampling/importance_sampling_ratio/mean": 0.5254790782928467, "sampling/importance_sampling_ratio/min": 0.516064465045929, "sampling/sampling_logp_difference/max": 0.6614106297492981, "sampling/sampling_logp_difference/mean": 0.3217570185661316, "step": 255, "step_time": 5.568710609986738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.718384563922882, "epoch": 0.00256, "grad_norm": 0.009991972707211971, "kl": 1.0801302939653397, "learning_rate": 9.999977600518175e-06, "loss": 0.0014, "step": 256, "step_time": 3.0505521630038857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.015547215938568, "epoch": 0.00257, "frac_reward_zero_std": 0.5, "grad_norm": 0.03417826443910599, "kl": 0.9569173902273178, "learning_rate": 9.999977396423928e-06, "loss": -0.005, "num_tokens": 5229299.0, "reward": 0.8486625552177429, "reward_std": 0.008970970287919044, "rewards/rollout_reward_func/mean": 0.8486625552177429, "rewards/rollout_reward_func/std": 0.4379535913467407, "sampling/importance_sampling_ratio/max": 0.5425843596458435, "sampling/importance_sampling_ratio/mean": 0.5001343488693237, "sampling/importance_sampling_ratio/min": 2.1033531860670251e-10, "sampling/sampling_logp_difference/max": 3.9487550258636475, "sampling/sampling_logp_difference/mean": 0.5235016942024231, "step": 257, "step_time": 6.420301817008294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0123012959957123, "epoch": 0.00258, "grad_norm": 0.03672194108366966, "kl": 0.9557066410779953, "learning_rate": 9.999977191404087e-06, "loss": -0.005, "step": 258, "step_time": 3.0078598570034956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9640954732894897, "epoch": 0.00259, "frac_reward_zero_std": 0.0, "grad_norm": 0.5154951810836792, "kl": 1.1060486063361168, "learning_rate": 9.999976985458653e-06, "loss": -0.0051, "num_tokens": 5270804.0, "reward": -0.1745671033859253, "reward_std": 0.014987872913479805, "rewards/rollout_reward_func/mean": -0.1745671033859253, "rewards/rollout_reward_func/std": 0.4877704679965973, "sampling/importance_sampling_ratio/max": 0.5583314895629883, "sampling/importance_sampling_ratio/mean": 0.5034091472625732, "sampling/importance_sampling_ratio/min": 0.00018996128346771002, "sampling/sampling_logp_difference/max": 5.030144214630127, "sampling/sampling_logp_difference/mean": 0.4335639774799347, "step": 259, "step_time": 5.630062348987849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.9545449018478394, "epoch": 0.0026, "grad_norm": 0.02500380575656891, "kl": 1.1093428134918213, "learning_rate": 9.999976778587625e-06, "loss": -0.0056, "step": 260, "step_time": 3.0641696239908924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.790497064590454, "epoch": 0.00261, "frac_reward_zero_std": 0.25, "grad_norm": 0.009068870916962624, "kl": 1.1717452704906464, "learning_rate": 9.999976570791002e-06, "loss": 0.0014, "num_tokens": 5310853.0, "reward": 0.35311126708984375, "reward_std": 0.0009265471016988158, "rewards/rollout_reward_func/mean": 0.35311126708984375, "rewards/rollout_reward_func/std": 0.44008421897888184, "sampling/importance_sampling_ratio/max": 0.5612053871154785, "sampling/importance_sampling_ratio/mean": 0.5211116671562195, "sampling/importance_sampling_ratio/min": 0.5026047229766846, "sampling/sampling_logp_difference/max": 0.6877753734588623, "sampling/sampling_logp_difference/mean": 0.32950595021247864, "step": 261, "step_time": 5.670858692006732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7856441736221313, "epoch": 0.00262, "grad_norm": 0.009174504317343235, "kl": 1.171541452407837, "learning_rate": 9.999976362068785e-06, "loss": 0.0013, "step": 262, "step_time": 3.055373127994244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0233793556690216, "epoch": 0.00263, "frac_reward_zero_std": 0.5, "grad_norm": 0.028298020362854004, "kl": 0.9923721253871918, "learning_rate": 9.999976152420979e-06, "loss": -0.0029, "num_tokens": 5351123.0, "reward": 0.3430582284927368, "reward_std": 0.009225773625075817, "rewards/rollout_reward_func/mean": 0.3430582284927368, "rewards/rollout_reward_func/std": 0.4424566328525543, "sampling/importance_sampling_ratio/max": 0.5654680132865906, "sampling/importance_sampling_ratio/mean": 0.4958740472793579, "sampling/importance_sampling_ratio/min": 1.2467892170076045e-11, "sampling/sampling_logp_difference/max": 4.024127960205078, "sampling/sampling_logp_difference/mean": 0.5516954660415649, "step": 263, "step_time": 6.272927743004402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0267254412174225, "epoch": 0.00264, "grad_norm": 0.028096135705709457, "kl": 0.9913851618766785, "learning_rate": 9.999975941847575e-06, "loss": -0.0029, "step": 264, "step_time": 3.0771163450044696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.066666841506958, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3335796296596527, "epoch": 0.00265, "frac_reward_zero_std": 0.25, "grad_norm": 0.035345375537872314, "kl": 0.9420528300106525, "learning_rate": 9.99997573034858e-06, "loss": -0.0128, "num_tokens": 5391819.0, "reward": 0.32851606607437134, "reward_std": 0.025682024657726288, "rewards/rollout_reward_func/mean": 0.32851606607437134, "rewards/rollout_reward_func/std": 0.43000826239585876, "sampling/importance_sampling_ratio/max": 0.5503427386283875, "sampling/importance_sampling_ratio/mean": 0.46245521306991577, "sampling/importance_sampling_ratio/min": 1.8334042966395908e-15, "sampling/sampling_logp_difference/max": 4.686193466186523, "sampling/sampling_logp_difference/mean": 0.8157042264938354, "step": 265, "step_time": 5.766326706994732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.3432831466197968, "epoch": 0.00266, "grad_norm": 0.024096054956316948, "kl": 0.9394459910690784, "learning_rate": 9.99997551792399e-06, "loss": -0.0129, "step": 266, "step_time": 3.0815879920191946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.028009206056595, "epoch": 0.00267, "frac_reward_zero_std": 0.25, "grad_norm": 0.013185127638280392, "kl": 0.9461282268166542, "learning_rate": 9.999975304573807e-06, "loss": -0.0034, "num_tokens": 5433923.0, "reward": 0.1010926216840744, "reward_std": 0.010032303631305695, "rewards/rollout_reward_func/mean": 0.1010926216840744, "rewards/rollout_reward_func/std": 0.01864674501121044, "sampling/importance_sampling_ratio/max": 0.5472308397293091, "sampling/importance_sampling_ratio/mean": 0.5020283460617065, "sampling/importance_sampling_ratio/min": 1.78657103000468e-10, "sampling/sampling_logp_difference/max": 3.0905373096466064, "sampling/sampling_logp_difference/mean": 0.5180369019508362, "step": 267, "step_time": 5.9612498359929305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0321907997131348, "epoch": 0.00268, "grad_norm": 0.013765589334070683, "kl": 0.9474191665649414, "learning_rate": 9.999975090298031e-06, "loss": -0.0034, "step": 268, "step_time": 3.547017083001265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7262396216392517, "epoch": 0.00269, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003730055468622595, "kl": 0.8954339250922203, "learning_rate": 9.999974875096663e-06, "loss": 0.0011, "num_tokens": 5468998.0, "reward": 0.8465388417243958, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8465388417243958, "rewards/rollout_reward_func/std": 0.44834157824516296, "sampling/importance_sampling_ratio/max": 0.5387517213821411, "sampling/importance_sampling_ratio/mean": 0.528420090675354, "sampling/importance_sampling_ratio/min": 0.507607102394104, "sampling/sampling_logp_difference/max": 0.6766976118087769, "sampling/sampling_logp_difference/mean": 0.31897541880607605, "step": 269, "step_time": 5.748420616990188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.739704728126526, "epoch": 0.0027, "grad_norm": 0.00034833475365303457, "kl": 0.8947665020823479, "learning_rate": 9.999974658969701e-06, "loss": 0.0011, "step": 270, "step_time": 2.826480111005367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.83858984708786, "epoch": 0.00271, "frac_reward_zero_std": 0.5, "grad_norm": 0.018259022384881973, "kl": 1.1946524530649185, "learning_rate": 9.999974441917146e-06, "loss": 0.0017, "num_tokens": 5511456.0, "reward": 0.10497283935546875, "reward_std": 4.364216874819249e-05, "rewards/rollout_reward_func/mean": 0.10497283935546875, "rewards/rollout_reward_func/std": 0.0005679095629602671, "sampling/importance_sampling_ratio/max": 0.7479435205459595, "sampling/importance_sampling_ratio/mean": 0.5443019270896912, "sampling/importance_sampling_ratio/min": 0.5022339224815369, "sampling/sampling_logp_difference/max": 0.6825327277183533, "sampling/sampling_logp_difference/mean": 0.33265256881713867, "step": 271, "step_time": 5.721598141994036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.860979974269867, "epoch": 0.00272, "grad_norm": 0.020220458507537842, "kl": 1.1934433728456497, "learning_rate": 9.999974223938997e-06, "loss": 0.0017, "step": 272, "step_time": 3.0749447389971465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.992329031229019, "epoch": 0.00273, "frac_reward_zero_std": 0.5, "grad_norm": 0.01655915006995201, "kl": 1.011431708931923, "learning_rate": 9.999974005035256e-06, "loss": -0.0038, "num_tokens": 5550742.0, "reward": 0.3449254035949707, "reward_std": 0.009291532449424267, "rewards/rollout_reward_func/mean": 0.3449254035949707, "rewards/rollout_reward_func/std": 0.4456731677055359, "sampling/importance_sampling_ratio/max": 0.5611110925674438, "sampling/importance_sampling_ratio/mean": 0.5084449648857117, "sampling/importance_sampling_ratio/min": 5.313231099535187e-08, "sampling/sampling_logp_difference/max": 3.1179323196411133, "sampling/sampling_logp_difference/mean": 0.45069608092308044, "step": 273, "step_time": 5.81207063599868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.008851081132889, "epoch": 0.00274, "grad_norm": 0.01724385656416416, "kl": 1.010471671819687, "learning_rate": 9.999973785205922e-06, "loss": -0.0038, "step": 274, "step_time": 3.4985182200034615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.83827668428421, "epoch": 0.00275, "frac_reward_zero_std": 0.75, "grad_norm": 0.010838210582733154, "kl": 1.3292706906795502, "learning_rate": 9.999973564450996e-06, "loss": 0.0018, "num_tokens": 5590947.0, "reward": 0.34875980019569397, "reward_std": 9.699590009404346e-05, "rewards/rollout_reward_func/mean": 0.34875980019569397, "rewards/rollout_reward_func/std": 0.4430944621562958, "sampling/importance_sampling_ratio/max": 0.6181166768074036, "sampling/importance_sampling_ratio/mean": 0.5297725200653076, "sampling/importance_sampling_ratio/min": 0.5005250573158264, "sampling/sampling_logp_difference/max": 0.6839037537574768, "sampling/sampling_logp_difference/mean": 0.32245126366615295, "step": 275, "step_time": 6.361663733005116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8499106764793396, "epoch": 0.00276, "grad_norm": 0.011018035002052784, "kl": 1.3283061981201172, "learning_rate": 9.999973342770475e-06, "loss": 0.0018, "step": 276, "step_time": 3.1454330010019476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1695716977119446, "epoch": 0.00277, "frac_reward_zero_std": 0.0, "grad_norm": 0.035891976207494736, "kl": 1.1518970355391502, "learning_rate": 9.999973120164363e-06, "loss": -0.0048, "num_tokens": 5631581.0, "reward": 0.28425779938697815, "reward_std": 0.19509118795394897, "rewards/rollout_reward_func/mean": 0.28425779938697815, "rewards/rollout_reward_func/std": 0.47879427671432495, "sampling/importance_sampling_ratio/max": 0.5523512959480286, "sampling/importance_sampling_ratio/mean": 0.4988151490688324, "sampling/importance_sampling_ratio/min": 6.079921899271312e-09, "sampling/sampling_logp_difference/max": 3.09608793258667, "sampling/sampling_logp_difference/mean": 0.4874398410320282, "step": 277, "step_time": 5.815189936001843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1723535656929016, "epoch": 0.00278, "grad_norm": 0.7879698872566223, "kl": 1.151903610676527, "learning_rate": 9.999972896632658e-06, "loss": -0.0044, "step": 278, "step_time": 3.0845267080003396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.265122562646866, "epoch": 0.00279, "frac_reward_zero_std": 0.25, "grad_norm": 0.03683782368898392, "kl": 0.8702132999897003, "learning_rate": 9.99997267217536e-06, "loss": -0.0094, "num_tokens": 5669243.0, "reward": 0.0585692822933197, "reward_std": 0.019255392253398895, "rewards/rollout_reward_func/mean": 0.0585692822933197, "rewards/rollout_reward_func/std": 0.7728469967842102, "sampling/importance_sampling_ratio/max": 0.5671262145042419, "sampling/importance_sampling_ratio/mean": 0.4929661154747009, "sampling/importance_sampling_ratio/min": 6.1635492716594076e-12, "sampling/sampling_logp_difference/max": 3.0949745178222656, "sampling/sampling_logp_difference/mean": 0.6457443237304688, "step": 279, "step_time": 5.7357032369982335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2604923844337463, "epoch": 0.0028, "grad_norm": 0.0383562333881855, "kl": 0.8718214258551598, "learning_rate": 9.999972446792469e-06, "loss": -0.0094, "step": 280, "step_time": 3.9268349230042077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.4193546772003174, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.295897454023361, "epoch": 0.00281, "frac_reward_zero_std": 0.25, "grad_norm": 0.026330864056944847, "kl": 0.9964546337723732, "learning_rate": 9.999972220483987e-06, "loss": -0.01, "num_tokens": 5707335.0, "reward": 0.586214542388916, "reward_std": 0.01623380184173584, "rewards/rollout_reward_func/mean": 0.586214542388916, "rewards/rollout_reward_func/std": 0.4969720244407654, "sampling/importance_sampling_ratio/max": 0.5423341393470764, "sampling/importance_sampling_ratio/mean": 0.4707656502723694, "sampling/importance_sampling_ratio/min": 8.580472998787059e-10, "sampling/sampling_logp_difference/max": 5.2982635498046875, "sampling/sampling_logp_difference/mean": 0.6358560919761658, "step": 281, "step_time": 6.086655713013897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.294405072927475, "epoch": 0.00282, "grad_norm": 0.027700157836079597, "kl": 0.9948722422122955, "learning_rate": 9.99997199324991e-06, "loss": -0.0101, "step": 282, "step_time": 3.2197482280025724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8104158341884613, "epoch": 0.00283, "frac_reward_zero_std": 0.75, "grad_norm": 0.020320050418376923, "kl": 0.9943118691444397, "learning_rate": 9.999971765090241e-06, "loss": 0.0012, "num_tokens": 5743812.0, "reward": 0.8522621989250183, "reward_std": 0.00011003986583091319, "rewards/rollout_reward_func/mean": 0.8522621989250183, "rewards/rollout_reward_func/std": 0.43862688541412354, "sampling/importance_sampling_ratio/max": 0.5454959869384766, "sampling/importance_sampling_ratio/mean": 0.5281776189804077, "sampling/importance_sampling_ratio/min": 0.5117464065551758, "sampling/sampling_logp_difference/max": 0.6680207848548889, "sampling/sampling_logp_difference/mean": 0.32054418325424194, "step": 283, "step_time": 5.57299516499188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8093622028827667, "epoch": 0.00284, "grad_norm": 0.02185937948524952, "kl": 0.9943063706159592, "learning_rate": 9.999971536004981e-06, "loss": 0.0013, "step": 284, "step_time": 2.9752595930040115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9608814418315887, "epoch": 0.00285, "frac_reward_zero_std": 0.5, "grad_norm": 0.14925315976142883, "kl": 0.9766102060675621, "learning_rate": 9.99997130599413e-06, "loss": -0.0013, "num_tokens": 5783196.0, "reward": 0.5596189498901367, "reward_std": 0.0027009842451661825, "rewards/rollout_reward_func/mean": 0.5596189498901367, "rewards/rollout_reward_func/std": 0.46873557567596436, "sampling/importance_sampling_ratio/max": 0.600827693939209, "sampling/importance_sampling_ratio/mean": 0.5239743590354919, "sampling/importance_sampling_ratio/min": 0.3568241000175476, "sampling/sampling_logp_difference/max": 0.7268756628036499, "sampling/sampling_logp_difference/mean": 0.3328152894973755, "step": 285, "step_time": 5.587177349007106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 2.9500973224639893, "epoch": 0.00286, "grad_norm": 0.049009475857019424, "kl": 0.9811014384031296, "learning_rate": 9.999971075057683e-06, "loss": -0.0016, "step": 286, "step_time": 3.9491348519877647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9158317148685455, "epoch": 0.00287, "frac_reward_zero_std": 0.25, "grad_norm": 0.08641960471868515, "kl": 1.0322148948907852, "learning_rate": 9.999970843195648e-06, "loss": 0.0001, "num_tokens": 5824684.0, "reward": 0.10471857339143753, "reward_std": 0.0026183538138866425, "rewards/rollout_reward_func/mean": 0.10471857339143753, "rewards/rollout_reward_func/std": 0.004517199005931616, "sampling/importance_sampling_ratio/max": 0.5915094017982483, "sampling/importance_sampling_ratio/mean": 0.5023317337036133, "sampling/importance_sampling_ratio/min": 0.24949905276298523, "sampling/sampling_logp_difference/max": 0.8450815081596375, "sampling/sampling_logp_difference/mean": 0.35809022188186646, "step": 287, "step_time": 5.7546142369974405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9275115728378296, "epoch": 0.00288, "grad_norm": 0.08306718617677689, "kl": 1.0378914400935173, "learning_rate": 9.999970610408019e-06, "loss": -0.0002, "step": 288, "step_time": 3.070421191987407 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.202357143163681, "epoch": 0.00289, "frac_reward_zero_std": 0.0, "grad_norm": 0.04870449751615524, "kl": 0.9070092588663101, "learning_rate": 9.999970376694797e-06, "loss": 0.0002, "num_tokens": 5865794.0, "reward": -0.1209263950586319, "reward_std": 0.16004624962806702, "rewards/rollout_reward_func/mean": -0.1209263950586319, "rewards/rollout_reward_func/std": 0.48227357864379883, "sampling/importance_sampling_ratio/max": 0.5867475867271423, "sampling/importance_sampling_ratio/mean": 0.49132949113845825, "sampling/importance_sampling_ratio/min": 7.405776686342236e-11, "sampling/sampling_logp_difference/max": 2.814530849456787, "sampling/sampling_logp_difference/mean": 0.5582941174507141, "step": 289, "step_time": 5.7982684210001025 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.246769964694977, "epoch": 0.0029, "grad_norm": 0.07182884961366653, "kl": 0.8899594470858574, "learning_rate": 9.999970142055984e-06, "loss": -0.0002, "step": 290, "step_time": 3.089006537993555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1971293687820435, "epoch": 0.00291, "frac_reward_zero_std": 0.5, "grad_norm": 0.06036125868558884, "kl": 1.0501237139105797, "learning_rate": 9.999969906491578e-06, "loss": -0.003, "num_tokens": 5906614.0, "reward": 0.34123337268829346, "reward_std": 0.010211551561951637, "rewards/rollout_reward_func/mean": 0.34123337268829346, "rewards/rollout_reward_func/std": 0.4219474792480469, "sampling/importance_sampling_ratio/max": 0.5503221750259399, "sampling/importance_sampling_ratio/mean": 0.5065489411354065, "sampling/importance_sampling_ratio/min": 0.0034970412962138653, "sampling/sampling_logp_difference/max": 3.1146459579467773, "sampling/sampling_logp_difference/mean": 0.3870203197002411, "step": 291, "step_time": 6.119134445012605 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.258518397808075, "epoch": 0.00292, "grad_norm": 0.05040156841278076, "kl": 1.043013833463192, "learning_rate": 9.99996967000158e-06, "loss": -0.0034, "step": 292, "step_time": 3.4922302420091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0961835980415344, "epoch": 0.00293, "frac_reward_zero_std": 0.5, "grad_norm": 0.06663449108600616, "kl": 0.8692983239889145, "learning_rate": 9.999969432585992e-06, "loss": 0.0003, "num_tokens": 5943941.0, "reward": 0.6056066155433655, "reward_std": 0.0017117602983489633, "rewards/rollout_reward_func/mean": 0.6056066155433655, "rewards/rollout_reward_func/std": 0.5063433051109314, "sampling/importance_sampling_ratio/max": 0.5499798059463501, "sampling/importance_sampling_ratio/mean": 0.5040630102157593, "sampling/importance_sampling_ratio/min": 0.3670312166213989, "sampling/sampling_logp_difference/max": 0.7262954115867615, "sampling/sampling_logp_difference/mean": 0.3459891676902771, "step": 293, "step_time": 5.587379612996301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1460440158843994, "epoch": 0.00294, "grad_norm": 0.0670437216758728, "kl": 0.8688431307673454, "learning_rate": 9.99996919424481e-06, "loss": 0.0004, "step": 294, "step_time": 3.0335949860091205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3181312680244446, "epoch": 0.00295, "frac_reward_zero_std": 0.5, "grad_norm": 0.08817745745182037, "kl": 0.976163238286972, "learning_rate": 9.999968954978038e-06, "loss": -0.0068, "num_tokens": 5983833.0, "reward": 0.3443855345249176, "reward_std": 0.011059368960559368, "rewards/rollout_reward_func/mean": 0.3443855345249176, "rewards/rollout_reward_func/std": 0.43840450048446655, "sampling/importance_sampling_ratio/max": 0.5463346242904663, "sampling/importance_sampling_ratio/mean": 0.4856630265712738, "sampling/importance_sampling_ratio/min": 2.162977913097719e-12, "sampling/sampling_logp_difference/max": 3.066990852355957, "sampling/sampling_logp_difference/mean": 0.5868902206420898, "step": 295, "step_time": 5.84122894200118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.328244239091873, "epoch": 0.00296, "grad_norm": 0.06576968729496002, "kl": 0.9852109923958778, "learning_rate": 9.999968714785673e-06, "loss": -0.007, "step": 296, "step_time": 3.076749303007091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7672199606895447, "epoch": 0.00297, "frac_reward_zero_std": 0.0, "grad_norm": 0.07999575883150101, "kl": 0.8391242027282715, "learning_rate": 9.999968473667719e-06, "loss": -0.0024, "num_tokens": 6025587.0, "reward": 0.10113810747861862, "reward_std": 0.009904159232974052, "rewards/rollout_reward_func/mean": 0.10113810747861862, "rewards/rollout_reward_func/std": 0.01852547749876976, "sampling/importance_sampling_ratio/max": 0.5508183240890503, "sampling/importance_sampling_ratio/mean": 0.455846905708313, "sampling/importance_sampling_ratio/min": 1.198176224903591e-09, "sampling/sampling_logp_difference/max": 3.7893753051757812, "sampling/sampling_logp_difference/mean": 0.5426442623138428, "step": 297, "step_time": 6.7784610009985045 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.8100645840168, "epoch": 0.00298, "grad_norm": 0.05405411869287491, "kl": 0.836361937224865, "learning_rate": 9.99996823162417e-06, "loss": -0.0027, "step": 298, "step_time": 3.0859685840041493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1152824759483337, "epoch": 0.00299, "frac_reward_zero_std": 0.75, "grad_norm": 0.05984298139810562, "kl": 1.0297783613204956, "learning_rate": 9.99996798865503e-06, "loss": 0.0037, "num_tokens": 6064749.0, "reward": 0.5936838984489441, "reward_std": 0.0006174458540044725, "rewards/rollout_reward_func/mean": 0.5936838984489441, "rewards/rollout_reward_func/std": 0.49627578258514404, "sampling/importance_sampling_ratio/max": 0.5483219027519226, "sampling/importance_sampling_ratio/mean": 0.5051056146621704, "sampling/importance_sampling_ratio/min": 0.3440453112125397, "sampling/sampling_logp_difference/max": 0.7318605184555054, "sampling/sampling_logp_difference/mean": 0.34434789419174194, "step": 299, "step_time": 5.806278213007317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.104206472635269, "epoch": 0.003, "grad_norm": 0.05745675042271614, "kl": 1.0293362140655518, "learning_rate": 9.9999677447603e-06, "loss": 0.0037, "step": 300, "step_time": 3.105207194996183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6860962510108948, "epoch": 0.00301, "frac_reward_zero_std": 0.25, "grad_norm": 0.05760904401540756, "kl": 0.8659589663147926, "learning_rate": 9.99996749993998e-06, "loss": -0.0071, "num_tokens": 6105884.0, "reward": 0.33701640367507935, "reward_std": 0.018663672730326653, "rewards/rollout_reward_func/mean": 0.33701640367507935, "rewards/rollout_reward_func/std": 0.4247589409351349, "sampling/importance_sampling_ratio/max": 0.5479761362075806, "sampling/importance_sampling_ratio/mean": 0.46287035942077637, "sampling/importance_sampling_ratio/min": 1.0609337713907152e-10, "sampling/sampling_logp_difference/max": 3.232490062713623, "sampling/sampling_logp_difference/mean": 0.5943999290466309, "step": 301, "step_time": 5.584201663994463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 3.623733788728714, "epoch": 0.00302, "grad_norm": 0.05397392064332962, "kl": 0.8722112253308296, "learning_rate": 9.999967254194065e-06, "loss": -0.0073, "step": 302, "step_time": 3.0740936439979123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.256304442882538, "epoch": 0.00303, "frac_reward_zero_std": 0.5, "grad_norm": 0.02502983808517456, "kl": 0.8972697891294956, "learning_rate": 9.999967007522561e-06, "loss": -0.0038, "num_tokens": 6147856.0, "reward": 0.09798169136047363, "reward_std": 0.009289983659982681, "rewards/rollout_reward_func/mean": 0.09798169136047363, "rewards/rollout_reward_func/std": 0.018974436447024345, "sampling/importance_sampling_ratio/max": 0.5532810091972351, "sampling/importance_sampling_ratio/mean": 0.49004632234573364, "sampling/importance_sampling_ratio/min": 1.8293432049176772e-06, "sampling/sampling_logp_difference/max": 3.028945207595825, "sampling/sampling_logp_difference/mean": 0.42807120084762573, "step": 303, "step_time": 6.9047572939962265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2007584869861603, "epoch": 0.00304, "grad_norm": 0.0254017636179924, "kl": 0.9030896425247192, "learning_rate": 9.999966759925464e-06, "loss": -0.0037, "step": 304, "step_time": 3.0812296510121087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.119245707988739, "epoch": 0.00305, "frac_reward_zero_std": 0.25, "grad_norm": 0.14707089960575104, "kl": 0.7758923843502998, "learning_rate": 9.999966511402779e-06, "loss": 0.0005, "num_tokens": 6188898.0, "reward": 0.35222217440605164, "reward_std": 0.002472050255164504, "rewards/rollout_reward_func/mean": 0.35222217440605164, "rewards/rollout_reward_func/std": 0.4409790337085724, "sampling/importance_sampling_ratio/max": 0.5511083602905273, "sampling/importance_sampling_ratio/mean": 0.5113102793693542, "sampling/importance_sampling_ratio/min": 0.3264574706554413, "sampling/sampling_logp_difference/max": 0.8081561923027039, "sampling/sampling_logp_difference/mean": 0.33912062644958496, "step": 305, "step_time": 5.918656635003572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.0575003623962402, "epoch": 0.00306, "grad_norm": 0.13388493657112122, "kl": 0.7887115329504013, "learning_rate": 9.9999662619545e-06, "loss": -0.0004, "step": 306, "step_time": 3.1295301740028663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1926293075084686, "epoch": 0.00307, "frac_reward_zero_std": 0.0, "grad_norm": 0.21830129623413086, "kl": 0.9950210228562355, "learning_rate": 9.999966011580632e-06, "loss": -0.0071, "num_tokens": 6231688.0, "reward": 0.10163908451795578, "reward_std": 0.010628527961671352, "rewards/rollout_reward_func/mean": 0.10163908451795578, "rewards/rollout_reward_func/std": 0.01867382600903511, "sampling/importance_sampling_ratio/max": 0.5533400774002075, "sampling/importance_sampling_ratio/mean": 0.4988005757331848, "sampling/importance_sampling_ratio/min": 7.64161267596819e-09, "sampling/sampling_logp_difference/max": 3.7204337120056152, "sampling/sampling_logp_difference/mean": 0.4864311218261719, "step": 307, "step_time": 6.015831387005164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 3.1032323241233826, "epoch": 0.00308, "grad_norm": 0.056293416768312454, "kl": 1.0350559651851654, "learning_rate": 9.999965760281171e-06, "loss": -0.0083, "step": 308, "step_time": 3.561840436006605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 2.34375, "completions/mean_terminated_length": 2.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.981288343667984, "epoch": 0.00309, "frac_reward_zero_std": 0.5, "grad_norm": 0.2386217564344406, "kl": 1.0090742334723473, "learning_rate": 9.999965508056122e-06, "loss": -0.0043, "num_tokens": 6270439.0, "reward": 0.5899965167045593, "reward_std": 0.00932967197149992, "rewards/rollout_reward_func/mean": 0.5899965167045593, "rewards/rollout_reward_func/std": 0.5000361800193787, "sampling/importance_sampling_ratio/max": 0.5503522157669067, "sampling/importance_sampling_ratio/mean": 0.5026695728302002, "sampling/importance_sampling_ratio/min": 5.997933261170374e-10, "sampling/sampling_logp_difference/max": 3.9083175659179688, "sampling/sampling_logp_difference/mean": 0.5305061936378479, "step": 309, "step_time": 6.137895430998469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 2.9683812260627747, "epoch": 0.0031, "grad_norm": 0.5967473387718201, "kl": 1.6317791044712067, "learning_rate": 9.999965254905479e-06, "loss": -0.0038, "step": 310, "step_time": 3.0509331240027677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.982439339160919, "epoch": 0.00311, "frac_reward_zero_std": 0.25, "grad_norm": 0.07616487890481949, "kl": 1.0867563039064407, "learning_rate": 9.999965000829247e-06, "loss": -0.0033, "num_tokens": 6313435.0, "reward": 0.10155211389064789, "reward_std": 0.009837916120886803, "rewards/rollout_reward_func/mean": 0.10155211389064789, "rewards/rollout_reward_func/std": 0.018558602780103683, "sampling/importance_sampling_ratio/max": 0.6274287700653076, "sampling/importance_sampling_ratio/mean": 0.5241721272468567, "sampling/importance_sampling_ratio/min": 5.413693039864331e-13, "sampling/sampling_logp_difference/max": 4.363093376159668, "sampling/sampling_logp_difference/mean": 0.5687124729156494, "step": 311, "step_time": 5.854605627995625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.964189291000366, "epoch": 0.00312, "grad_norm": 0.08124776929616928, "kl": 1.0896142199635506, "learning_rate": 9.999964745827424e-06, "loss": -0.0032, "step": 312, "step_time": 3.098410030004743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.15625, "completions/mean_terminated_length": 2.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.985982984304428, "epoch": 0.00313, "frac_reward_zero_std": 0.75, "grad_norm": 0.006122630555182695, "kl": 1.1359106078743935, "learning_rate": 9.99996448990001e-06, "loss": -0.0035, "num_tokens": 6354072.0, "reward": 0.0935431718826294, "reward_std": 0.009197309613227844, "rewards/rollout_reward_func/mean": 0.0935431718826294, "rewards/rollout_reward_func/std": 0.7253190279006958, "sampling/importance_sampling_ratio/max": 0.548755943775177, "sampling/importance_sampling_ratio/mean": 0.5059186220169067, "sampling/importance_sampling_ratio/min": 5.094491280033253e-05, "sampling/sampling_logp_difference/max": 3.9027647972106934, "sampling/sampling_logp_difference/mean": 0.4283684194087982, "step": 313, "step_time": 5.758521415009454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9804631173610687, "epoch": 0.00314, "grad_norm": 0.006147774867713451, "kl": 1.136872611939907, "learning_rate": 9.999964233047006e-06, "loss": -0.0035, "step": 314, "step_time": 3.484042418982426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.19235360622406, "epoch": 0.00315, "frac_reward_zero_std": 0.0, "grad_norm": 0.017657563090324402, "kl": 1.0498998314142227, "learning_rate": 9.999963975268412e-06, "loss": -0.0083, "num_tokens": 6395834.0, "reward": 0.09193301200866699, "reward_std": 0.016033969819545746, "rewards/rollout_reward_func/mean": 0.09193301200866699, "rewards/rollout_reward_func/std": 0.026854848489165306, "sampling/importance_sampling_ratio/max": 0.6540788412094116, "sampling/importance_sampling_ratio/mean": 0.5039482712745667, "sampling/importance_sampling_ratio/min": 2.3457147682592883e-12, "sampling/sampling_logp_difference/max": 3.498971939086914, "sampling/sampling_logp_difference/mean": 0.6184600591659546, "step": 315, "step_time": 6.264227156010747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 3.188777893781662, "epoch": 0.00316, "grad_norm": 0.02727506496012211, "kl": 1.0480819791555405, "learning_rate": 9.999963716564226e-06, "loss": -0.0084, "step": 316, "step_time": 3.0563378690058016 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.986633151769638, "epoch": 0.00317, "frac_reward_zero_std": 0.75, "grad_norm": 0.009791991673409939, "kl": 1.0467952117323875, "learning_rate": 9.99996345693445e-06, "loss": -0.0036, "num_tokens": 6434444.0, "reward": 0.35138943791389465, "reward_std": 0.00927620567381382, "rewards/rollout_reward_func/mean": 0.35138943791389465, "rewards/rollout_reward_func/std": 0.44183269143104553, "sampling/importance_sampling_ratio/max": 0.5674819946289062, "sampling/importance_sampling_ratio/mean": 0.5092076659202576, "sampling/importance_sampling_ratio/min": 8.945420515438229e-10, "sampling/sampling_logp_difference/max": 3.759033679962158, "sampling/sampling_logp_difference/mean": 0.49552154541015625, "step": 317, "step_time": 5.791271464986494 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.982309252023697, "epoch": 0.00318, "grad_norm": 0.012284097261726856, "kl": 1.0506343469023705, "learning_rate": 9.999963196379084e-06, "loss": -0.0036, "step": 318, "step_time": 3.0554704669848434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.762875586748123, "epoch": 0.00319, "frac_reward_zero_std": 0.5, "grad_norm": 0.2564242482185364, "kl": 0.9941031485795975, "learning_rate": 9.999962934898128e-06, "loss": 0.0007, "num_tokens": 6476225.0, "reward": 0.3437140882015228, "reward_std": 0.0007994677871465683, "rewards/rollout_reward_func/mean": 0.3437140882015228, "rewards/rollout_reward_func/std": 0.4200662076473236, "sampling/importance_sampling_ratio/max": 0.551784336566925, "sampling/importance_sampling_ratio/mean": 0.519234299659729, "sampling/importance_sampling_ratio/min": 0.27351969480514526, "sampling/sampling_logp_difference/max": 0.6948071718215942, "sampling/sampling_logp_difference/mean": 0.33356308937072754, "step": 319, "step_time": 5.690876297019713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 2.7497818768024445, "epoch": 0.0032, "grad_norm": 0.06834273040294647, "kl": 0.9579518139362335, "learning_rate": 9.999962672491582e-06, "loss": 0.0003, "step": 320, "step_time": 3.977372555003967 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9171096980571747, "epoch": 0.00321, "frac_reward_zero_std": 0.25, "grad_norm": 0.007830891758203506, "kl": 1.0758580565452576, "learning_rate": 9.999962409159445e-06, "loss": -0.0038, "num_tokens": 6518285.0, "reward": 0.10135509073734283, "reward_std": 0.00933779962360859, "rewards/rollout_reward_func/mean": 0.10135509073734283, "rewards/rollout_reward_func/std": 0.0185615923255682, "sampling/importance_sampling_ratio/max": 0.5531873106956482, "sampling/importance_sampling_ratio/mean": 0.518477737903595, "sampling/importance_sampling_ratio/min": 2.291457440461997e-10, "sampling/sampling_logp_difference/max": 3.9036684036254883, "sampling/sampling_logp_difference/mean": 0.5035097599029541, "step": 321, "step_time": 5.805161377007607 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.9283141493797302, "epoch": 0.00322, "grad_norm": 0.007800721563398838, "kl": 1.0759969651699066, "learning_rate": 9.999962144901718e-06, "loss": -0.0038, "step": 322, "step_time": 3.085053387003427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.702649235725403, "epoch": 0.00323, "frac_reward_zero_std": 0.5, "grad_norm": 0.036845214664936066, "kl": 1.1564905866980553, "learning_rate": 9.999961879718401e-06, "loss": 0.0008, "num_tokens": 6560339.0, "reward": -0.18590009212493896, "reward_std": 0.0006136721931397915, "rewards/rollout_reward_func/mean": -0.18590009212493896, "rewards/rollout_reward_func/std": 0.5105222463607788, "sampling/importance_sampling_ratio/max": 0.6780114769935608, "sampling/importance_sampling_ratio/mean": 0.542724072933197, "sampling/importance_sampling_ratio/min": 0.511175274848938, "sampling/sampling_logp_difference/max": 0.6708965301513672, "sampling/sampling_logp_difference/mean": 0.3151528835296631, "step": 323, "step_time": 5.9223337059956975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7014201283454895, "epoch": 0.00324, "grad_norm": 0.02908303402364254, "kl": 1.1578230112791061, "learning_rate": 9.999961613609494e-06, "loss": 0.0007, "step": 324, "step_time": 3.1550441620129277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.794700026512146, "epoch": 0.00325, "frac_reward_zero_std": 0.75, "grad_norm": 0.012745519168674946, "kl": 1.0621542409062386, "learning_rate": 9.999961346574998e-06, "loss": 0.0013, "num_tokens": 6598731.0, "reward": 0.6026591658592224, "reward_std": 1.1667218132060952e-05, "rewards/rollout_reward_func/mean": 0.6026591658592224, "rewards/rollout_reward_func/std": 0.505595862865448, "sampling/importance_sampling_ratio/max": 0.648481547832489, "sampling/importance_sampling_ratio/mean": 0.5268490314483643, "sampling/importance_sampling_ratio/min": 0.12379276752471924, "sampling/sampling_logp_difference/max": 1.4300806522369385, "sampling/sampling_logp_difference/mean": 0.34825700521469116, "step": 325, "step_time": 5.685501934000058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.793960839509964, "epoch": 0.00326, "grad_norm": 0.011911332607269287, "kl": 1.0632172226905823, "learning_rate": 9.999961078614912e-06, "loss": 0.0013, "step": 326, "step_time": 3.9514637230095104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1802074909210205, "epoch": 0.00327, "frac_reward_zero_std": 0.25, "grad_norm": 0.008293929509818554, "kl": 0.9603535011410713, "learning_rate": 9.999960809729237e-06, "loss": -0.0091, "num_tokens": 6638925.0, "reward": 0.056656286120414734, "reward_std": 0.0178748220205307, "rewards/rollout_reward_func/mean": 0.056656286120414734, "rewards/rollout_reward_func/std": 0.7699928879737854, "sampling/importance_sampling_ratio/max": 0.655485212802887, "sampling/importance_sampling_ratio/mean": 0.5098145604133606, "sampling/importance_sampling_ratio/min": 3.850535613015432e-13, "sampling/sampling_logp_difference/max": 4.467103004455566, "sampling/sampling_logp_difference/mean": 0.7296096086502075, "step": 327, "step_time": 5.775637409009505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.181100994348526, "epoch": 0.00328, "grad_norm": 0.00820213183760643, "kl": 0.9603519141674042, "learning_rate": 9.99996053991797e-06, "loss": -0.009, "step": 328, "step_time": 3.0861211240116972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2969835698604584, "epoch": 0.00329, "frac_reward_zero_std": 0.25, "grad_norm": 0.13116136193275452, "kl": 0.8397955894470215, "learning_rate": 9.999960269181116e-06, "loss": -0.005, "num_tokens": 6679206.0, "reward": 0.348633348941803, "reward_std": 0.012600407935678959, "rewards/rollout_reward_func/mean": 0.348633348941803, "rewards/rollout_reward_func/std": 0.44380947947502136, "sampling/importance_sampling_ratio/max": 0.6161144375801086, "sampling/importance_sampling_ratio/mean": 0.5032378435134888, "sampling/importance_sampling_ratio/min": 3.320749325446387e-17, "sampling/sampling_logp_difference/max": 4.549830436706543, "sampling/sampling_logp_difference/mean": 0.7551888823509216, "step": 329, "step_time": 6.1602775050123455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.298086017370224, "epoch": 0.0033, "grad_norm": 0.010502017103135586, "kl": 0.8353239297866821, "learning_rate": 9.999959997518671e-06, "loss": -0.0054, "step": 330, "step_time": 3.1851333990198327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7030363380908966, "epoch": 0.00331, "frac_reward_zero_std": 0.25, "grad_norm": 0.08008477836847305, "kl": 1.0304204821586609, "learning_rate": 9.999959724930638e-06, "loss": 0.0017, "num_tokens": 6720664.0, "reward": 0.10665886849164963, "reward_std": 0.0013307735789567232, "rewards/rollout_reward_func/mean": 0.10665886849164963, "rewards/rollout_reward_func/std": 0.004088005516678095, "sampling/importance_sampling_ratio/max": 0.6727132201194763, "sampling/importance_sampling_ratio/mean": 0.5497747659683228, "sampling/importance_sampling_ratio/min": 0.5261968374252319, "sampling/sampling_logp_difference/max": 0.6451361179351807, "sampling/sampling_logp_difference/mean": 0.31848371028900146, "step": 331, "step_time": 5.693282957014162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7115975320339203, "epoch": 0.00332, "grad_norm": 0.06120472401380539, "kl": 1.0319889411330223, "learning_rate": 9.999959451417012e-06, "loss": 0.0018, "step": 332, "step_time": 3.9664878280091216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.757014721632004, "epoch": 0.00333, "frac_reward_zero_std": 0.5, "grad_norm": 0.20782040059566498, "kl": 0.9710577800869942, "learning_rate": 9.9999591769778e-06, "loss": 0.0019, "num_tokens": 6760010.0, "reward": 0.5608695149421692, "reward_std": 0.001169261522591114, "rewards/rollout_reward_func/mean": 0.5608695149421692, "rewards/rollout_reward_func/std": 0.46712082624435425, "sampling/importance_sampling_ratio/max": 0.6542908549308777, "sampling/importance_sampling_ratio/mean": 0.5392788052558899, "sampling/importance_sampling_ratio/min": 0.4865926504135132, "sampling/sampling_logp_difference/max": 0.7128278017044067, "sampling/sampling_logp_difference/mean": 0.31937089562416077, "step": 333, "step_time": 5.635250547988107 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 2.7573891282081604, "epoch": 0.00334, "grad_norm": 0.013499067164957523, "kl": 0.9758808314800262, "learning_rate": 9.999958901612997e-06, "loss": 0.0013, "step": 334, "step_time": 3.0208352929985267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.750638723373413, "epoch": 0.00335, "frac_reward_zero_std": 0.25, "grad_norm": 0.03457660228013992, "kl": 1.1863691881299019, "learning_rate": 9.999958625322606e-06, "loss": 0.0018, "num_tokens": 6802404.0, "reward": 0.10422185063362122, "reward_std": 0.0007976024644449353, "rewards/rollout_reward_func/mean": 0.10422185063362122, "rewards/rollout_reward_func/std": 0.002016264945268631, "sampling/importance_sampling_ratio/max": 0.5775055885314941, "sampling/importance_sampling_ratio/mean": 0.5292109251022339, "sampling/importance_sampling_ratio/min": 0.2296164631843567, "sampling/sampling_logp_difference/max": 0.7990639209747314, "sampling/sampling_logp_difference/mean": 0.3273868262767792, "step": 335, "step_time": 5.666458398991381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.760325700044632, "epoch": 0.00336, "grad_norm": 0.03870951011776924, "kl": 1.183845892548561, "learning_rate": 9.999958348106625e-06, "loss": 0.0017, "step": 336, "step_time": 3.077703360002488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 2.21875, "completions/mean_terminated_length": 2.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.028902769088745, "epoch": 0.00337, "frac_reward_zero_std": 0.25, "grad_norm": 0.2226838320493698, "kl": 0.9293961152434349, "learning_rate": 9.999958069965056e-06, "loss": -0.0066, "num_tokens": 6844580.0, "reward": 0.3502488136291504, "reward_std": 0.00980678666383028, "rewards/rollout_reward_func/mean": 0.3502488136291504, "rewards/rollout_reward_func/std": 0.43820685148239136, "sampling/importance_sampling_ratio/max": 0.6045396327972412, "sampling/importance_sampling_ratio/mean": 0.4978286921977997, "sampling/importance_sampling_ratio/min": 2.2225228804018116e-06, "sampling/sampling_logp_difference/max": 3.639810562133789, "sampling/sampling_logp_difference/mean": 0.5014159679412842, "step": 337, "step_time": 6.31773350401636 }, { "clip_ratio/high_max": 0.09375, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 3.017491489648819, "epoch": 0.00338, "grad_norm": 0.22782652080059052, "kl": 1.4273724257946014, "learning_rate": 9.999957790897897e-06, "loss": -0.007, "step": 338, "step_time": 3.6153101270028856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.873435527086258, "epoch": 0.00339, "frac_reward_zero_std": 0.75, "grad_norm": 0.010897542349994183, "kl": 1.0515243634581566, "learning_rate": 9.999957510905149e-06, "loss": 0.0011, "num_tokens": 6884779.0, "reward": 0.34189069271087646, "reward_std": 0.00035730007220990956, "rewards/rollout_reward_func/mean": 0.34189069271087646, "rewards/rollout_reward_func/std": 0.44712334871292114, "sampling/importance_sampling_ratio/max": 0.615838348865509, "sampling/importance_sampling_ratio/mean": 0.5299073457717896, "sampling/importance_sampling_ratio/min": 0.44862619042396545, "sampling/sampling_logp_difference/max": 0.8572133779525757, "sampling/sampling_logp_difference/mean": 0.33875638246536255, "step": 339, "step_time": 5.8675375129969325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.886341631412506, "epoch": 0.0034, "grad_norm": 0.009202172048389912, "kl": 1.065614253282547, "learning_rate": 9.999957229986813e-06, "loss": 0.0011, "step": 340, "step_time": 3.022844745995826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7944147884845734, "epoch": 0.00341, "frac_reward_zero_std": 0.25, "grad_norm": 0.010857141576707363, "kl": 1.1396991983056068, "learning_rate": 9.999956948142888e-06, "loss": 0.0016, "num_tokens": 6927089.0, "reward": 0.10528045147657394, "reward_std": 0.0013816789723932743, "rewards/rollout_reward_func/mean": 0.10528045147657394, "rewards/rollout_reward_func/std": 0.0033621301408857107, "sampling/importance_sampling_ratio/max": 0.5553457140922546, "sampling/importance_sampling_ratio/mean": 0.5320407152175903, "sampling/importance_sampling_ratio/min": 0.48910489678382874, "sampling/sampling_logp_difference/max": 0.7215526700019836, "sampling/sampling_logp_difference/mean": 0.321378231048584, "step": 341, "step_time": 5.754511411003477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8105552196502686, "epoch": 0.00342, "grad_norm": 0.013493618927896023, "kl": 1.1376714780926704, "learning_rate": 9.999956665373374e-06, "loss": 0.0017, "step": 342, "step_time": 3.044299100009084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9408008754253387, "epoch": 0.00343, "frac_reward_zero_std": 0.5, "grad_norm": 0.14678384363651276, "kl": 1.0458603128790855, "learning_rate": 9.999956381678271e-06, "loss": -0.0004, "num_tokens": 6967275.0, "reward": 0.3476759195327759, "reward_std": 0.0006439252756536007, "rewards/rollout_reward_func/mean": 0.3476759195327759, "rewards/rollout_reward_func/std": 0.4437747001647949, "sampling/importance_sampling_ratio/max": 0.5508406758308411, "sampling/importance_sampling_ratio/mean": 0.508334755897522, "sampling/importance_sampling_ratio/min": 0.3402721881866455, "sampling/sampling_logp_difference/max": 0.8270300030708313, "sampling/sampling_logp_difference/mean": 0.34628015756607056, "step": 343, "step_time": 6.308720604996779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9642234444618225, "epoch": 0.00344, "grad_norm": 0.1504475474357605, "kl": 1.0431530997157097, "learning_rate": 9.99995609705758e-06, "loss": -0.0004, "step": 344, "step_time": 3.4788607579976087 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 3.1875, "completions/mean_terminated_length": 2.3333334922790527, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.684504806995392, "epoch": 0.00345, "frac_reward_zero_std": 0.0, "grad_norm": 0.07360611110925674, "kl": 0.8940838128328323, "learning_rate": 9.999955811511302e-06, "loss": -0.0116, "num_tokens": 7008237.0, "reward": 0.3435920476913452, "reward_std": 0.02769605442881584, "rewards/rollout_reward_func/mean": 0.3435920476913452, "rewards/rollout_reward_func/std": 0.43915417790412903, "sampling/importance_sampling_ratio/max": 0.5454539656639099, "sampling/importance_sampling_ratio/mean": 0.4343684911727905, "sampling/importance_sampling_ratio/min": 7.32500201445184e-14, "sampling/sampling_logp_difference/max": 3.8083689212799072, "sampling/sampling_logp_difference/mean": 0.8507159948348999, "step": 345, "step_time": 6.049790259014117 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.698573648929596, "epoch": 0.00346, "grad_norm": 0.07194488495588303, "kl": 0.8905919082462788, "learning_rate": 9.999955525039433e-06, "loss": -0.0118, "step": 346, "step_time": 3.094537129996752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.968942940235138, "epoch": 0.00347, "frac_reward_zero_std": 0.75, "grad_norm": 0.009119466878473759, "kl": 1.0328731760382652, "learning_rate": 9.999955237641976e-06, "loss": 0.0007, "num_tokens": 7046753.0, "reward": 0.6022200584411621, "reward_std": 0.00023501619580201805, "rewards/rollout_reward_func/mean": 0.6022200584411621, "rewards/rollout_reward_func/std": 0.5057325959205627, "sampling/importance_sampling_ratio/max": 0.5484819412231445, "sampling/importance_sampling_ratio/mean": 0.5014855861663818, "sampling/importance_sampling_ratio/min": 0.41386663913726807, "sampling/sampling_logp_difference/max": 0.8793177604675293, "sampling/sampling_logp_difference/mean": 0.34709179401397705, "step": 347, "step_time": 5.744434462001664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9819316267967224, "epoch": 0.00348, "grad_norm": 0.009611646644771099, "kl": 1.0312913581728935, "learning_rate": 9.999954949318932e-06, "loss": 0.0006, "step": 348, "step_time": 3.0334937979932874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.956917703151703, "epoch": 0.00349, "frac_reward_zero_std": 0.75, "grad_norm": 0.020919889211654663, "kl": 0.8911553174257278, "learning_rate": 9.999954660070299e-06, "loss": 0.0019, "num_tokens": 7084477.0, "reward": 0.5688897371292114, "reward_std": 0.0013980362564325333, "rewards/rollout_reward_func/mean": 0.5688897371292114, "rewards/rollout_reward_func/std": 0.5228142738342285, "sampling/importance_sampling_ratio/max": 0.5449256300926208, "sampling/importance_sampling_ratio/mean": 0.5054171681404114, "sampling/importance_sampling_ratio/min": 0.3965950906276703, "sampling/sampling_logp_difference/max": 0.9210527539253235, "sampling/sampling_logp_difference/mean": 0.34634703397750854, "step": 349, "step_time": 6.248468197998591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.962557852268219, "epoch": 0.0035, "grad_norm": 0.020816434174776077, "kl": 0.8904831632971764, "learning_rate": 9.999954369896076e-06, "loss": 0.0019, "step": 350, "step_time": 3.4969974790292326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1035397946834564, "epoch": 0.00351, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013722231378778815, "kl": 1.0085569992661476, "learning_rate": 9.999954078796268e-06, "loss": 0.0013, "num_tokens": 7123250.0, "reward": 0.3358842134475708, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.3358842134475708, "rewards/rollout_reward_func/std": 0.4501870274543762, "sampling/importance_sampling_ratio/max": 0.5393965840339661, "sampling/importance_sampling_ratio/mean": 0.48176437616348267, "sampling/importance_sampling_ratio/min": 0.3839019536972046, "sampling/sampling_logp_difference/max": 0.9525705575942993, "sampling/sampling_logp_difference/mean": 0.367412269115448, "step": 351, "step_time": 5.7014114479970885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.116129368543625, "epoch": 0.00352, "grad_norm": 0.0013827007496729493, "kl": 1.0067291483283043, "learning_rate": 9.99995378677087e-06, "loss": 0.0013, "step": 352, "step_time": 2.912339612012147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0382410883903503, "epoch": 0.00353, "frac_reward_zero_std": 0.0, "grad_norm": 0.1254388391971588, "kl": 1.2018917351961136, "learning_rate": 9.999953493819885e-06, "loss": 0.002, "num_tokens": 7165822.0, "reward": 0.10443936288356781, "reward_std": 0.0010657160310074687, "rewards/rollout_reward_func/mean": 0.10443936288356781, "rewards/rollout_reward_func/std": 0.0016952859004959464, "sampling/importance_sampling_ratio/max": 0.5509746670722961, "sampling/importance_sampling_ratio/mean": 0.5062421560287476, "sampling/importance_sampling_ratio/min": 0.41338324546813965, "sampling/sampling_logp_difference/max": 0.9959031343460083, "sampling/sampling_logp_difference/mean": 0.3573015630245209, "step": 353, "step_time": 6.071354824998707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.0644567012786865, "epoch": 0.00354, "grad_norm": 0.10265849530696869, "kl": 1.178892582654953, "learning_rate": 9.999953199943314e-06, "loss": 0.0015, "step": 354, "step_time": 3.1442513589863665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.376656651496887, "epoch": 0.00355, "frac_reward_zero_std": 0.0, "grad_norm": 0.10074718296527863, "kl": 0.9723216071724892, "learning_rate": 9.999952905141152e-06, "loss": -0.0019, "num_tokens": 7207294.0, "reward": 0.10234974324703217, "reward_std": 0.010145512409508228, "rewards/rollout_reward_func/mean": 0.10234974324703217, "rewards/rollout_reward_func/std": 0.018748946487903595, "sampling/importance_sampling_ratio/max": 0.5667394995689392, "sampling/importance_sampling_ratio/mean": 0.4812660217285156, "sampling/importance_sampling_ratio/min": 3.4216471886741595e-14, "sampling/sampling_logp_difference/max": 3.4289958477020264, "sampling/sampling_logp_difference/mean": 0.6431632041931152, "step": 355, "step_time": 6.84914141799527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.359410434961319, "epoch": 0.00356, "grad_norm": 0.11123201996088028, "kl": 0.9768740013241768, "learning_rate": 9.999952609413403e-06, "loss": -0.0019, "step": 356, "step_time": 3.054059263988165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2069675028324127, "epoch": 0.00357, "frac_reward_zero_std": 0.5, "grad_norm": 0.06562845408916473, "kl": 0.9388005584478378, "learning_rate": 9.999952312760068e-06, "loss": -0.0029, "num_tokens": 7248069.0, "reward": 0.3342539668083191, "reward_std": 0.009697731584310532, "rewards/rollout_reward_func/mean": 0.3342539668083191, "rewards/rollout_reward_func/std": 0.4260777235031128, "sampling/importance_sampling_ratio/max": 0.5522638559341431, "sampling/importance_sampling_ratio/mean": 0.4816593825817108, "sampling/importance_sampling_ratio/min": 3.652228386386014e-08, "sampling/sampling_logp_difference/max": 2.89306640625, "sampling/sampling_logp_difference/mean": 0.4802705645561218, "step": 357, "step_time": 5.9043499659892404 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.1727271378040314, "epoch": 0.00358, "grad_norm": 0.041793402284383774, "kl": 0.9448869004845619, "learning_rate": 9.999952015181144e-06, "loss": -0.0029, "step": 358, "step_time": 3.070436948997667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.99104243516922, "epoch": 0.00359, "frac_reward_zero_std": 0.75, "grad_norm": 0.22918923199176788, "kl": 1.0686639323830605, "learning_rate": 9.999951716676632e-06, "loss": 0.002, "num_tokens": 7289759.0, "reward": 0.09896648675203323, "reward_std": 0.00023168828920461237, "rewards/rollout_reward_func/mean": 0.09896648675203323, "rewards/rollout_reward_func/std": 0.010826101526618004, "sampling/importance_sampling_ratio/max": 0.550812840461731, "sampling/importance_sampling_ratio/mean": 0.5109913945198059, "sampling/importance_sampling_ratio/min": 0.4398786127567291, "sampling/sampling_logp_difference/max": 0.8196622133255005, "sampling/sampling_logp_difference/mean": 0.34386277198791504, "step": 359, "step_time": 6.040064568005619 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 2.967257261276245, "epoch": 0.0036, "grad_norm": 0.023567037656903267, "kl": 1.071425437927246, "learning_rate": 9.999951417246534e-06, "loss": 0.0015, "step": 360, "step_time": 3.076845401992614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2566710114479065, "epoch": 0.00361, "frac_reward_zero_std": 0.75, "grad_norm": 0.1750289648771286, "kl": 1.132862612605095, "learning_rate": 9.999951116890847e-06, "loss": -0.0011, "num_tokens": 7330040.0, "reward": 0.0692020058631897, "reward_std": 0.017349783331155777, "rewards/rollout_reward_func/mean": 0.0692020058631897, "rewards/rollout_reward_func/std": 0.7700861096382141, "sampling/importance_sampling_ratio/max": 0.5455353856086731, "sampling/importance_sampling_ratio/mean": 0.4605376124382019, "sampling/importance_sampling_ratio/min": 1.55944113178208e-09, "sampling/sampling_logp_difference/max": 3.065110921859741, "sampling/sampling_logp_difference/mean": 0.5347534418106079, "step": 361, "step_time": 6.800474680996558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.2267525494098663, "epoch": 0.00362, "grad_norm": 0.04995701462030411, "kl": 1.1612886041402817, "learning_rate": 9.999950815609574e-06, "loss": -0.0012, "step": 362, "step_time": 3.12863658999413 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.36887788772583, "epoch": 0.00363, "frac_reward_zero_std": 0.0, "grad_norm": 0.04703135788440704, "kl": 1.132267139852047, "learning_rate": 9.999950513402715e-06, "loss": -0.0088, "num_tokens": 7371388.0, "reward": 0.09836290776729584, "reward_std": 0.01862216554582119, "rewards/rollout_reward_func/mean": 0.09836290776729584, "rewards/rollout_reward_func/std": 0.025877695530653, "sampling/importance_sampling_ratio/max": 0.779991626739502, "sampling/importance_sampling_ratio/mean": 0.49269789457321167, "sampling/importance_sampling_ratio/min": 1.903818032042426e-12, "sampling/sampling_logp_difference/max": 3.5042366981506348, "sampling/sampling_logp_difference/mean": 0.7317321300506592, "step": 363, "step_time": 5.8196948570112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3467807471752167, "epoch": 0.00364, "grad_norm": 0.033229630440473557, "kl": 1.1278311386704445, "learning_rate": 9.999950210270267e-06, "loss": -0.0088, "step": 364, "step_time": 3.086561800999334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8413954973220825, "epoch": 0.00365, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017812044825404882, "kl": 0.9707506000995636, "learning_rate": 9.999949906212232e-06, "loss": 0.0012, "num_tokens": 7408761.0, "reward": 0.8505592346191406, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8505592346191406, "rewards/rollout_reward_func/std": 0.4373161196708679, "sampling/importance_sampling_ratio/max": 0.6791704297065735, "sampling/importance_sampling_ratio/mean": 0.5299949049949646, "sampling/importance_sampling_ratio/min": 0.49057576060295105, "sampling/sampling_logp_difference/max": 0.7120166420936584, "sampling/sampling_logp_difference/mean": 0.3293605148792267, "step": 365, "step_time": 5.4892275700040045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8173536360263824, "epoch": 0.00366, "grad_norm": 0.0016565759433433414, "kl": 0.9740576446056366, "learning_rate": 9.999949601228609e-06, "loss": 0.0012, "step": 366, "step_time": 2.992165677009325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8127278685569763, "epoch": 0.00367, "frac_reward_zero_std": 0.25, "grad_norm": 0.012405509129166603, "kl": 1.1423841640353203, "learning_rate": 9.9999492953194e-06, "loss": 0.0015, "num_tokens": 7448516.0, "reward": 0.35358864068984985, "reward_std": 0.0011182207381352782, "rewards/rollout_reward_func/mean": 0.35358864068984985, "rewards/rollout_reward_func/std": 0.4358569085597992, "sampling/importance_sampling_ratio/max": 0.5539470911026001, "sampling/importance_sampling_ratio/mean": 0.527838408946991, "sampling/importance_sampling_ratio/min": 0.4966011941432953, "sampling/sampling_logp_difference/max": 0.7028566598892212, "sampling/sampling_logp_difference/mean": 0.32356399297714233, "step": 367, "step_time": 6.582398739999917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.8065935373306274, "epoch": 0.00368, "grad_norm": 0.011162263341248035, "kl": 1.141952358186245, "learning_rate": 9.999948988484605e-06, "loss": 0.0014, "step": 368, "step_time": 3.026853450006456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8122291564941406, "epoch": 0.00369, "frac_reward_zero_std": 0.25, "grad_norm": 0.02088358625769615, "kl": 1.260048471391201, "learning_rate": 9.999948680724223e-06, "loss": 0.0018, "num_tokens": 7491172.0, "reward": 0.10547229647636414, "reward_std": 0.0009681266965344548, "rewards/rollout_reward_func/mean": 0.10547229647636414, "rewards/rollout_reward_func/std": 0.001705438015051186, "sampling/importance_sampling_ratio/max": 0.5618352293968201, "sampling/importance_sampling_ratio/mean": 0.530832827091217, "sampling/importance_sampling_ratio/min": 0.3500404953956604, "sampling/sampling_logp_difference/max": 0.7377172708511353, "sampling/sampling_logp_difference/mean": 0.33282777667045593, "step": 369, "step_time": 5.873542453991831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.79980406165123, "epoch": 0.0037, "grad_norm": 0.018173040822148323, "kl": 1.2782800421118736, "learning_rate": 9.999948372038253e-06, "loss": 0.0018, "step": 370, "step_time": 3.116635172984388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7963118851184845, "epoch": 0.00371, "frac_reward_zero_std": 0.75, "grad_norm": 0.0112152686342597, "kl": 1.0853168591856956, "learning_rate": 9.9999480624267e-06, "loss": 0.0011, "num_tokens": 7530905.0, "reward": 0.348290354013443, "reward_std": 0.00036901794373989105, "rewards/rollout_reward_func/mean": 0.348290354013443, "rewards/rollout_reward_func/std": 0.443377822637558, "sampling/importance_sampling_ratio/max": 0.5519425272941589, "sampling/importance_sampling_ratio/mean": 0.526742160320282, "sampling/importance_sampling_ratio/min": 0.4632178544998169, "sampling/sampling_logp_difference/max": 0.7742924094200134, "sampling/sampling_logp_difference/mean": 0.3235486149787903, "step": 371, "step_time": 5.744090530984977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.777267277240753, "epoch": 0.00372, "grad_norm": 0.00977717712521553, "kl": 1.0875214487314224, "learning_rate": 9.999947751889557e-06, "loss": 0.0011, "step": 372, "step_time": 3.571941351990972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.763719230890274, "epoch": 0.00373, "frac_reward_zero_std": 0.75, "grad_norm": 0.00580716272816062, "kl": 1.0574836283922195, "learning_rate": 9.99994744042683e-06, "loss": 0.001, "num_tokens": 7570243.0, "reward": 0.32846760749816895, "reward_std": 0.0004651603230740875, "rewards/rollout_reward_func/mean": 0.32846760749816895, "rewards/rollout_reward_func/std": 0.4568646550178528, "sampling/importance_sampling_ratio/max": 0.5565162897109985, "sampling/importance_sampling_ratio/mean": 0.5307185649871826, "sampling/importance_sampling_ratio/min": 0.4978537857532501, "sampling/sampling_logp_difference/max": 0.6959719657897949, "sampling/sampling_logp_difference/mean": 0.317726731300354, "step": 373, "step_time": 6.332172431000799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.761884957551956, "epoch": 0.00374, "grad_norm": 0.00583596620708704, "kl": 1.0574418678879738, "learning_rate": 9.999947128038514e-06, "loss": 0.0009, "step": 374, "step_time": 3.0919990020047408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0495359897613525, "epoch": 0.00375, "frac_reward_zero_std": 0.5, "grad_norm": 0.005248270463198423, "kl": 1.0119263902306557, "learning_rate": 9.999946814724613e-06, "loss": -0.0036, "num_tokens": 7609223.0, "reward": 0.32510632276535034, "reward_std": 0.009296650998294353, "rewards/rollout_reward_func/mean": 0.32510632276535034, "rewards/rollout_reward_func/std": 0.4590388536453247, "sampling/importance_sampling_ratio/max": 0.553766131401062, "sampling/importance_sampling_ratio/mean": 0.510669469833374, "sampling/importance_sampling_ratio/min": 3.091821232630715e-13, "sampling/sampling_logp_difference/max": 3.6967477798461914, "sampling/sampling_logp_difference/mean": 0.5800716280937195, "step": 375, "step_time": 5.910428728995612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0509733855724335, "epoch": 0.00376, "grad_norm": 0.005204714834690094, "kl": 1.0112500041723251, "learning_rate": 9.999946500485126e-06, "loss": -0.0036, "step": 376, "step_time": 3.0945308999944245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7495884895324707, "epoch": 0.00377, "frac_reward_zero_std": 0.75, "grad_norm": 0.00530738802626729, "kl": 1.0589147880673409, "learning_rate": 9.999946185320051e-06, "loss": 0.0011, "num_tokens": 7650742.0, "reward": 0.3540690243244171, "reward_std": 0.00040341675048694015, "rewards/rollout_reward_func/mean": 0.3540690243244171, "rewards/rollout_reward_func/std": 0.4395214915275574, "sampling/importance_sampling_ratio/max": 0.555817723274231, "sampling/importance_sampling_ratio/mean": 0.5278332829475403, "sampling/importance_sampling_ratio/min": 0.5077018141746521, "sampling/sampling_logp_difference/max": 0.6781985759735107, "sampling/sampling_logp_difference/mean": 0.32022756338119507, "step": 377, "step_time": 6.152837781999551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7484697103500366, "epoch": 0.00378, "grad_norm": 0.005210863426327705, "kl": 1.0589198023080826, "learning_rate": 9.999945869229393e-06, "loss": 0.0011, "step": 378, "step_time": 4.113548583998636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 3.09375, "completions/mean_terminated_length": 2.2333333492279053, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4416747391223907, "epoch": 0.00379, "frac_reward_zero_std": 0.0, "grad_norm": 0.008442397229373455, "kl": 0.9403651058673859, "learning_rate": 9.999945552213145e-06, "loss": -0.014, "num_tokens": 7692451.0, "reward": 0.09037919342517853, "reward_std": 0.025934459641575813, "rewards/rollout_reward_func/mean": 0.09037919342517853, "rewards/rollout_reward_func/std": 0.03099244460463524, "sampling/importance_sampling_ratio/max": 0.5583080053329468, "sampling/importance_sampling_ratio/mean": 0.48531270027160645, "sampling/importance_sampling_ratio/min": 1.0065259042901786e-10, "sampling/sampling_logp_difference/max": 3.8809714317321777, "sampling/sampling_logp_difference/mean": 0.8072496652603149, "step": 379, "step_time": 6.071769906004192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4476400017738342, "epoch": 0.0038, "grad_norm": 0.00838474091142416, "kl": 0.9403519928455353, "learning_rate": 9.999945234271316e-06, "loss": -0.014, "step": 380, "step_time": 3.0865582230107975 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.762263238430023, "epoch": 0.00381, "frac_reward_zero_std": 0.25, "grad_norm": 0.048572856932878494, "kl": 1.0510911121964455, "learning_rate": 9.9999449154039e-06, "loss": 0.0023, "num_tokens": 7733633.0, "reward": 0.35397207736968994, "reward_std": 0.0007295984541997313, "rewards/rollout_reward_func/mean": 0.35397207736968994, "rewards/rollout_reward_func/std": 0.4395783543586731, "sampling/importance_sampling_ratio/max": 0.5555952191352844, "sampling/importance_sampling_ratio/mean": 0.5212432742118835, "sampling/importance_sampling_ratio/min": 0.23369713127613068, "sampling/sampling_logp_difference/max": 0.7741672992706299, "sampling/sampling_logp_difference/mean": 0.3321303725242615, "step": 381, "step_time": 6.004493178988923 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.7555155754089355, "epoch": 0.00382, "grad_norm": 0.04755531623959541, "kl": 1.0565636977553368, "learning_rate": 9.999944595610896e-06, "loss": 0.0022, "step": 382, "step_time": 3.129315836005844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7573567032814026, "epoch": 0.00383, "frac_reward_zero_std": 0.5, "grad_norm": 0.007573384791612625, "kl": 1.3680932745337486, "learning_rate": 9.999944274892308e-06, "loss": 0.0022, "num_tokens": 7775567.0, "reward": 0.09820742905139923, "reward_std": 0.00034131755819544196, "rewards/rollout_reward_func/mean": 0.09820742905139923, "rewards/rollout_reward_func/std": 0.011834964156150818, "sampling/importance_sampling_ratio/max": 0.5538134574890137, "sampling/importance_sampling_ratio/mean": 0.5296226739883423, "sampling/importance_sampling_ratio/min": 0.5042413473129272, "sampling/sampling_logp_difference/max": 0.6821393966674805, "sampling/sampling_logp_difference/mean": 0.3183897137641907, "step": 383, "step_time": 5.894449580999208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.751048058271408, "epoch": 0.00384, "grad_norm": 0.007568292785435915, "kl": 1.3689441457390785, "learning_rate": 9.999943953248133e-06, "loss": 0.0022, "step": 384, "step_time": 4.018032337007753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7830534279346466, "epoch": 0.00385, "frac_reward_zero_std": 0.25, "grad_norm": 0.162947878241539, "kl": 1.2022706642746925, "learning_rate": 9.999943630678372e-06, "loss": 0.0013, "num_tokens": 7814093.0, "reward": 0.26661670207977295, "reward_std": 0.11575892567634583, "rewards/rollout_reward_func/mean": 0.26661670207977295, "rewards/rollout_reward_func/std": 0.5572050213813782, "sampling/importance_sampling_ratio/max": 0.8549620509147644, "sampling/importance_sampling_ratio/mean": 0.5367958545684814, "sampling/importance_sampling_ratio/min": 0.48898813128471375, "sampling/sampling_logp_difference/max": 0.7104974985122681, "sampling/sampling_logp_difference/mean": 0.3302394449710846, "step": 385, "step_time": 5.8942389320000075 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.7706953287124634, "epoch": 0.00386, "grad_norm": 0.017427289858460426, "kl": 1.210419923067093, "learning_rate": 9.999943307183029e-06, "loss": 0.0011, "step": 386, "step_time": 3.0543350119842216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.15625, "completions/mean_terminated_length": 2.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9199803471565247, "epoch": 0.00387, "frac_reward_zero_std": 0.25, "grad_norm": 0.08283384889364243, "kl": 1.0234687700867653, "learning_rate": 9.999942982762097e-06, "loss": -0.0036, "num_tokens": 7853166.0, "reward": 0.34395602345466614, "reward_std": 0.009126418270170689, "rewards/rollout_reward_func/mean": 0.34395602345466614, "rewards/rollout_reward_func/std": 0.4348003566265106, "sampling/importance_sampling_ratio/max": 0.6025300025939941, "sampling/importance_sampling_ratio/mean": 0.5191149115562439, "sampling/importance_sampling_ratio/min": 1.8297909264219925e-05, "sampling/sampling_logp_difference/max": 3.8010706901550293, "sampling/sampling_logp_difference/mean": 0.4348241090774536, "step": 387, "step_time": 5.82828039501328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.912853628396988, "epoch": 0.00388, "grad_norm": 0.026537414640188217, "kl": 1.0247583761811256, "learning_rate": 9.999942657415583e-06, "loss": -0.0038, "step": 388, "step_time": 3.058303881996835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7299148440361023, "epoch": 0.00389, "frac_reward_zero_std": 0.25, "grad_norm": 0.3119162321090698, "kl": 0.8751125782728195, "learning_rate": 9.99994233114348e-06, "loss": 0.0052, "num_tokens": 7896546.0, "reward": 0.10557147115468979, "reward_std": 0.0010305516188964248, "rewards/rollout_reward_func/mean": 0.10557147115468979, "rewards/rollout_reward_func/std": 0.0016220130492001772, "sampling/importance_sampling_ratio/max": 0.5793737769126892, "sampling/importance_sampling_ratio/mean": 0.5260404348373413, "sampling/importance_sampling_ratio/min": 0.17219386994838715, "sampling/sampling_logp_difference/max": 1.1155426502227783, "sampling/sampling_logp_difference/mean": 0.34133705496788025, "step": 389, "step_time": 6.043070727006125 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 2.7587615847587585, "epoch": 0.0039, "grad_norm": 0.08826188743114471, "kl": 0.8765812143683434, "learning_rate": 9.999942003945793e-06, "loss": 0.0044, "step": 390, "step_time": 3.559237340996333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7278419137001038, "epoch": 0.00391, "frac_reward_zero_std": 0.75, "grad_norm": 0.026397524401545525, "kl": 1.0681867003440857, "learning_rate": 9.999941675822523e-06, "loss": 0.0012, "num_tokens": 7936117.0, "reward": 0.6006664037704468, "reward_std": 5.447559033200378e-06, "rewards/rollout_reward_func/mean": 0.6006664037704468, "rewards/rollout_reward_func/std": 0.503882884979248, "sampling/importance_sampling_ratio/max": 0.6162264943122864, "sampling/importance_sampling_ratio/mean": 0.5427309274673462, "sampling/importance_sampling_ratio/min": 0.5243656635284424, "sampling/sampling_logp_difference/max": 0.6483714580535889, "sampling/sampling_logp_difference/mean": 0.31844189763069153, "step": 391, "step_time": 5.894702656994923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7255356907844543, "epoch": 0.00392, "grad_norm": 0.005636134184896946, "kl": 1.0700905099511147, "learning_rate": 9.999941346773667e-06, "loss": 0.0011, "step": 392, "step_time": 3.0606962789897807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8511748909950256, "epoch": 0.00393, "frac_reward_zero_std": 0.0, "grad_norm": 0.17795687913894653, "kl": 1.0120862796902657, "learning_rate": 9.999941016799226e-06, "loss": -0.0046, "num_tokens": 7977938.0, "reward": 0.10068999230861664, "reward_std": 0.01036738883703947, "rewards/rollout_reward_func/mean": 0.10068999230861664, "rewards/rollout_reward_func/std": 0.01845724880695343, "sampling/importance_sampling_ratio/max": 0.5980015993118286, "sampling/importance_sampling_ratio/mean": 0.5170769691467285, "sampling/importance_sampling_ratio/min": 0.00014368667325470597, "sampling/sampling_logp_difference/max": 4.5406999588012695, "sampling/sampling_logp_difference/mean": 0.4232558012008667, "step": 393, "step_time": 5.768965221992403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.85821595788002, "epoch": 0.00394, "grad_norm": 0.017893048003315926, "kl": 0.9984462186694145, "learning_rate": 9.9999406858992e-06, "loss": -0.0048, "step": 394, "step_time": 3.0668748209936894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9823193550109863, "epoch": 0.00395, "frac_reward_zero_std": 0.25, "grad_norm": 0.009136185981333256, "kl": 0.9698525667190552, "learning_rate": 9.999940354073589e-06, "loss": -0.004, "num_tokens": 8016247.0, "reward": 0.6009671688079834, "reward_std": 0.00943584181368351, "rewards/rollout_reward_func/mean": 0.6009671688079834, "rewards/rollout_reward_func/std": 0.5044509172439575, "sampling/importance_sampling_ratio/max": 0.5589505434036255, "sampling/importance_sampling_ratio/mean": 0.5175223350524902, "sampling/importance_sampling_ratio/min": 4.431912194007015e-14, "sampling/sampling_logp_difference/max": 3.4868972301483154, "sampling/sampling_logp_difference/mean": 0.5941332578659058, "step": 395, "step_time": 6.376316107009188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.984519600868225, "epoch": 0.00396, "grad_norm": 0.00927049946039915, "kl": 0.9693247526884079, "learning_rate": 9.999940021322394e-06, "loss": -0.004, "step": 396, "step_time": 3.5105428810056765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9233427941799164, "epoch": 0.00397, "frac_reward_zero_std": 0.25, "grad_norm": 0.010654771700501442, "kl": 0.9886147901415825, "learning_rate": 9.999939687645615e-06, "loss": -0.0038, "num_tokens": 8057715.0, "reward": 0.0763990730047226, "reward_std": 0.010350153781473637, "rewards/rollout_reward_func/mean": 0.0763990730047226, "rewards/rollout_reward_func/std": 0.04355117678642273, "sampling/importance_sampling_ratio/max": 0.5457513928413391, "sampling/importance_sampling_ratio/mean": 0.5155763626098633, "sampling/importance_sampling_ratio/min": 1.5202080705378762e-09, "sampling/sampling_logp_difference/max": 3.1324827671051025, "sampling/sampling_logp_difference/mean": 0.4817761778831482, "step": 397, "step_time": 5.849835499007895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.929961234331131, "epoch": 0.00398, "grad_norm": 0.01066440436989069, "kl": 0.9869697168469429, "learning_rate": 9.999939353043252e-06, "loss": -0.0038, "step": 398, "step_time": 3.049564446999284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9230317771434784, "epoch": 0.00399, "frac_reward_zero_std": 0.0, "grad_norm": 0.6220861077308655, "kl": 1.2463876605033875, "learning_rate": 9.999939017515304e-06, "loss": -0.0085, "num_tokens": 8098253.0, "reward": -0.142816960811615, "reward_std": 0.24586054682731628, "rewards/rollout_reward_func/mean": -0.142816960811615, "rewards/rollout_reward_func/std": 0.4379126727581024, "sampling/importance_sampling_ratio/max": 0.7053084969520569, "sampling/importance_sampling_ratio/mean": 0.5129601955413818, "sampling/importance_sampling_ratio/min": 3.2291455909216893e-07, "sampling/sampling_logp_difference/max": 3.938126564025879, "sampling/sampling_logp_difference/mean": 0.45339441299438477, "step": 399, "step_time": 6.018506749001972 }, { "clip_ratio/high_max": 0.125, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 2.9183793663978577, "epoch": 0.004, "grad_norm": 0.057947684079408646, "kl": 1.370852991938591, "learning_rate": 9.99993868106177e-06, "loss": -0.0089, "step": 400, "step_time": 3.074388115994225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 2.133333444595337, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3527203798294067, "epoch": 0.00401, "frac_reward_zero_std": 0.25, "grad_norm": 0.008474616333842278, "kl": 0.8707108348608017, "learning_rate": 9.999938343682654e-06, "loss": -0.0142, "num_tokens": 8133821.0, "reward": 0.844040036201477, "reward_std": 0.027589101344347, "rewards/rollout_reward_func/mean": 0.844040036201477, "rewards/rollout_reward_func/std": 0.44033870100975037, "sampling/importance_sampling_ratio/max": 0.5461845397949219, "sampling/importance_sampling_ratio/mean": 0.4834981858730316, "sampling/importance_sampling_ratio/min": 2.778643200945785e-12, "sampling/sampling_logp_difference/max": 4.135563373565674, "sampling/sampling_logp_difference/mean": 0.7190974950790405, "step": 401, "step_time": 6.710151320003206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3550875186920166, "epoch": 0.00402, "grad_norm": 0.008114269934594631, "kl": 0.8699392229318619, "learning_rate": 9.999938005377952e-06, "loss": -0.0142, "step": 402, "step_time": 2.9940271440063952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.729668229818344, "epoch": 0.00403, "frac_reward_zero_std": 0.5, "grad_norm": 0.022877061739563942, "kl": 1.1564392521977425, "learning_rate": 9.999937666147667e-06, "loss": 0.0013, "num_tokens": 8173931.0, "reward": 0.09264299273490906, "reward_std": 0.0004730539512820542, "rewards/rollout_reward_func/mean": 0.09264299273490906, "rewards/rollout_reward_func/std": 0.7271558046340942, "sampling/importance_sampling_ratio/max": 0.5831620097160339, "sampling/importance_sampling_ratio/mean": 0.5382751822471619, "sampling/importance_sampling_ratio/min": 0.5219911336898804, "sampling/sampling_logp_difference/max": 0.6484642624855042, "sampling/sampling_logp_difference/mean": 0.3150380551815033, "step": 403, "step_time": 5.828803764983604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7199412286281586, "epoch": 0.00404, "grad_norm": 0.02916199155151844, "kl": 1.1574437767267227, "learning_rate": 9.999937325991797e-06, "loss": 0.0013, "step": 404, "step_time": 3.047678119990451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1714641749858856, "epoch": 0.00405, "frac_reward_zero_std": 0.25, "grad_norm": 0.008110077120363712, "kl": 0.9458668678998947, "learning_rate": 9.999936984910345e-06, "loss": -0.0088, "num_tokens": 8215820.0, "reward": 0.3462100327014923, "reward_std": 0.01908387616276741, "rewards/rollout_reward_func/mean": 0.3462100327014923, "rewards/rollout_reward_func/std": 0.44189921021461487, "sampling/importance_sampling_ratio/max": 0.5502868294715881, "sampling/importance_sampling_ratio/mean": 0.5014488697052002, "sampling/importance_sampling_ratio/min": 3.925376690361304e-11, "sampling/sampling_logp_difference/max": 3.2254507541656494, "sampling/sampling_logp_difference/mean": 0.7123967409133911, "step": 405, "step_time": 5.919091584008129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.161713480949402, "epoch": 0.00406, "grad_norm": 0.0077964807860553265, "kl": 0.9474156238138676, "learning_rate": 9.999936642903308e-06, "loss": -0.0087, "step": 406, "step_time": 3.0956052820110926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7081401646137238, "epoch": 0.00407, "frac_reward_zero_std": 0.75, "grad_norm": 0.0030335565097630024, "kl": 0.9662773087620735, "learning_rate": 9.999936299970686e-06, "loss": 0.0011, "num_tokens": 8255033.0, "reward": 0.3485729992389679, "reward_std": 4.282777081243694e-05, "rewards/rollout_reward_func/mean": 0.3485729992389679, "rewards/rollout_reward_func/std": 0.443217396736145, "sampling/importance_sampling_ratio/max": 0.5526825189590454, "sampling/importance_sampling_ratio/mean": 0.536029577255249, "sampling/importance_sampling_ratio/min": 0.5257537961006165, "sampling/sampling_logp_difference/max": 0.642853319644928, "sampling/sampling_logp_difference/mean": 0.31186428666114807, "step": 407, "step_time": 6.553565403009998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.699419289827347, "epoch": 0.00408, "grad_norm": 0.0029069443698972464, "kl": 0.9676682576537132, "learning_rate": 9.999935956112484e-06, "loss": 0.0011, "step": 408, "step_time": 3.0167133230061154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.6963637471199036, "epoch": 0.00409, "frac_reward_zero_std": 0.25, "grad_norm": 0.18474549055099487, "kl": 1.192208580672741, "learning_rate": 9.999935611328696e-06, "loss": 0.002, "num_tokens": 8296345.0, "reward": 0.0992918461561203, "reward_std": 0.00029778131283819675, "rewards/rollout_reward_func/mean": 0.0992918461561203, "rewards/rollout_reward_func/std": 0.009595943614840508, "sampling/importance_sampling_ratio/max": 0.5577840209007263, "sampling/importance_sampling_ratio/mean": 0.5372985601425171, "sampling/importance_sampling_ratio/min": 0.429993599653244, "sampling/sampling_logp_difference/max": 0.6385952234268188, "sampling/sampling_logp_difference/mean": 0.31340301036834717, "step": 409, "step_time": 6.081780126987724 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.7082982063293457, "epoch": 0.0041, "grad_norm": 0.023951590061187744, "kl": 1.2153182178735733, "learning_rate": 9.999935265619325e-06, "loss": 0.0017, "step": 410, "step_time": 3.0730060420028167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 4.1875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.04902458190918, "epoch": 0.00411, "frac_reward_zero_std": 0.0, "grad_norm": 0.21952448785305023, "kl": 0.908589094877243, "learning_rate": 9.999934918984371e-06, "loss": -0.0195, "num_tokens": 8336505.0, "reward": 0.33125540614128113, "reward_std": 0.03709753602743149, "rewards/rollout_reward_func/mean": 0.33125540614128113, "rewards/rollout_reward_func/std": 0.4434131681919098, "sampling/importance_sampling_ratio/max": 0.7303194999694824, "sampling/importance_sampling_ratio/mean": 0.4306254982948303, "sampling/importance_sampling_ratio/min": 3.2647937333807266e-16, "sampling/sampling_logp_difference/max": 4.575409412384033, "sampling/sampling_logp_difference/mean": 1.1663563251495361, "step": 411, "step_time": 6.22927525301202 }, { "clip_ratio/high_max": 0.07638888899236917, "clip_ratio/high_mean": 0.03819444449618459, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03819444449618459, "entropy": 4.025368541479111, "epoch": 0.00412, "grad_norm": 0.05125235393643379, "kl": 0.9347952380776405, "learning_rate": 9.999934571423834e-06, "loss": -0.0198, "step": 412, "step_time": 3.5161352130089654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.68949955701828, "epoch": 0.00413, "frac_reward_zero_std": 0.75, "grad_norm": 0.0019265913870185614, "kl": 1.0463660582900047, "learning_rate": 9.999934222937713e-06, "loss": 0.0013, "num_tokens": 8375610.0, "reward": 0.5435304641723633, "reward_std": 4.2828287405427545e-05, "rewards/rollout_reward_func/mean": 0.5435304641723633, "rewards/rollout_reward_func/std": 0.4609009325504303, "sampling/importance_sampling_ratio/max": 0.5536104440689087, "sampling/importance_sampling_ratio/mean": 0.5383789539337158, "sampling/importance_sampling_ratio/min": 0.527043342590332, "sampling/sampling_logp_difference/max": 0.6399010419845581, "sampling/sampling_logp_difference/mean": 0.3097202479839325, "step": 413, "step_time": 6.068807856012427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6946024298667908, "epoch": 0.00414, "grad_norm": 0.0018899354618042707, "kl": 1.0457541793584824, "learning_rate": 9.99993387352601e-06, "loss": 0.0013, "step": 414, "step_time": 2.903656131995376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.92290335893631, "epoch": 0.00415, "frac_reward_zero_std": 0.0, "grad_norm": 0.006323328707367182, "kl": 1.1178065612912178, "learning_rate": 9.999933523188722e-06, "loss": -0.004, "num_tokens": 8418164.0, "reward": 0.10216911137104034, "reward_std": 0.010800126940011978, "rewards/rollout_reward_func/mean": 0.10216911137104034, "rewards/rollout_reward_func/std": 0.018741479143500328, "sampling/importance_sampling_ratio/max": 0.5516605377197266, "sampling/importance_sampling_ratio/mean": 0.5227677822113037, "sampling/importance_sampling_ratio/min": 1.3563339340549874e-10, "sampling/sampling_logp_difference/max": 4.596334457397461, "sampling/sampling_logp_difference/mean": 0.5022093057632446, "step": 415, "step_time": 6.1405702809934155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9293308556079865, "epoch": 0.00416, "grad_norm": 0.006119181402027607, "kl": 1.1172736659646034, "learning_rate": 9.999933171925851e-06, "loss": -0.0039, "step": 416, "step_time": 3.1671774329952314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.712521404027939, "epoch": 0.00417, "frac_reward_zero_std": 0.5, "grad_norm": 0.06278253346681595, "kl": 1.1936735063791275, "learning_rate": 9.999932819737398e-06, "loss": 0.0015, "num_tokens": 8457465.0, "reward": 0.07620444893836975, "reward_std": 0.003140647429972887, "rewards/rollout_reward_func/mean": 0.07620444893836975, "rewards/rollout_reward_func/std": 0.7555463314056396, "sampling/importance_sampling_ratio/max": 0.5537720918655396, "sampling/importance_sampling_ratio/mean": 0.5414073467254639, "sampling/importance_sampling_ratio/min": 0.5300549268722534, "sampling/sampling_logp_difference/max": 0.6284019351005554, "sampling/sampling_logp_difference/mean": 0.3082062900066376, "step": 417, "step_time": 5.683685566007625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7130208611488342, "epoch": 0.00418, "grad_norm": 0.12819916009902954, "kl": 1.1924516558647156, "learning_rate": 9.999932466623362e-06, "loss": 0.0013, "step": 418, "step_time": 3.4245517149975058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7012226581573486, "epoch": 0.00419, "frac_reward_zero_std": 0.75, "grad_norm": 0.001785580418072641, "kl": 1.0694089755415916, "learning_rate": 9.999932112583745e-06, "loss": 0.0014, "num_tokens": 8496826.0, "reward": 0.3482673764228821, "reward_std": 1.347156194242416e-05, "rewards/rollout_reward_func/mean": 0.3482673764228821, "rewards/rollout_reward_func/std": 0.4430370032787323, "sampling/importance_sampling_ratio/max": 0.5566336512565613, "sampling/importance_sampling_ratio/mean": 0.5397079586982727, "sampling/importance_sampling_ratio/min": 0.5314926505088806, "sampling/sampling_logp_difference/max": 0.6319528222084045, "sampling/sampling_logp_difference/mean": 0.3084278106689453, "step": 419, "step_time": 6.3406281450006645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7020663022994995, "epoch": 0.0042, "grad_norm": 0.002487346762791276, "kl": 1.0692637860774994, "learning_rate": 9.999931757618544e-06, "loss": 0.0014, "step": 420, "step_time": 3.0727596009892295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9564856588840485, "epoch": 0.00421, "frac_reward_zero_std": 0.25, "grad_norm": 0.0564020574092865, "kl": 0.9390511810779572, "learning_rate": 9.999931401727761e-06, "loss": -0.0038, "num_tokens": 8539262.0, "reward": 0.34107789397239685, "reward_std": 0.01057212334126234, "rewards/rollout_reward_func/mean": 0.34107789397239685, "rewards/rollout_reward_func/std": 0.4220257103443146, "sampling/importance_sampling_ratio/max": 0.5515941381454468, "sampling/importance_sampling_ratio/mean": 0.5238224267959595, "sampling/importance_sampling_ratio/min": 2.1244957182364743e-11, "sampling/sampling_logp_difference/max": 3.5050621032714844, "sampling/sampling_logp_difference/mean": 0.5224830508232117, "step": 421, "step_time": 6.032975552006974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 2.9520371556282043, "epoch": 0.00422, "grad_norm": 0.017479807138442993, "kl": 0.9394653812050819, "learning_rate": 9.999931044911395e-06, "loss": -0.0039, "step": 422, "step_time": 3.1526614079994033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.6866710484027863, "epoch": 0.00423, "frac_reward_zero_std": 0.25, "grad_norm": 0.23287221789360046, "kl": 1.133057877421379, "learning_rate": 9.999930687169449e-06, "loss": 0.0016, "num_tokens": 8579880.0, "reward": 0.3526644706726074, "reward_std": 0.0016748560592532158, "rewards/rollout_reward_func/mean": 0.3526644706726074, "rewards/rollout_reward_func/std": 0.4363982379436493, "sampling/importance_sampling_ratio/max": 0.6528569459915161, "sampling/importance_sampling_ratio/mean": 0.5466225147247314, "sampling/importance_sampling_ratio/min": 0.5321844220161438, "sampling/sampling_logp_difference/max": 0.6306653618812561, "sampling/sampling_logp_difference/mean": 0.30856484174728394, "step": 423, "step_time": 5.77307716300129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.678740471601486, "epoch": 0.00424, "grad_norm": 0.008878440596163273, "kl": 1.1134870424866676, "learning_rate": 9.999930328501917e-06, "loss": 0.0015, "step": 424, "step_time": 4.0016808380169095 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7315665185451508, "epoch": 0.00425, "frac_reward_zero_std": 0.0, "grad_norm": 0.1677035540342331, "kl": 1.3569226115942001, "learning_rate": 9.999929968908805e-06, "loss": 0.002, "num_tokens": 8622678.0, "reward": 0.10453857481479645, "reward_std": 0.0009047042112797499, "rewards/rollout_reward_func/mean": 0.10453857481479645, "rewards/rollout_reward_func/std": 0.0020399694330990314, "sampling/importance_sampling_ratio/max": 0.573111355304718, "sampling/importance_sampling_ratio/mean": 0.5388957262039185, "sampling/importance_sampling_ratio/min": 0.47858884930610657, "sampling/sampling_logp_difference/max": 0.6376197338104248, "sampling/sampling_logp_difference/mean": 0.31196314096450806, "step": 425, "step_time": 6.177756983997824 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 2.7192490696907043, "epoch": 0.00426, "grad_norm": 0.025150200352072716, "kl": 1.3611610978841782, "learning_rate": 9.99992960839011e-06, "loss": 0.0014, "step": 426, "step_time": 3.2094475089979824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.689169615507126, "epoch": 0.00427, "frac_reward_zero_std": 0.75, "grad_norm": 0.006060889456421137, "kl": 1.0454642474651337, "learning_rate": 9.999929246945834e-06, "loss": 0.0013, "num_tokens": 8661699.0, "reward": 0.6020733118057251, "reward_std": 0.000455753062851727, "rewards/rollout_reward_func/mean": 0.6020733118057251, "rewards/rollout_reward_func/std": 0.5058825016021729, "sampling/importance_sampling_ratio/max": 0.5510045289993286, "sampling/importance_sampling_ratio/mean": 0.5426139831542969, "sampling/importance_sampling_ratio/min": 0.5297391414642334, "sampling/sampling_logp_difference/max": 0.635175883769989, "sampling/sampling_logp_difference/mean": 0.3057263493537903, "step": 427, "step_time": 5.6168967580015305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6897929310798645, "epoch": 0.00428, "grad_norm": 0.006136023439466953, "kl": 1.045371949672699, "learning_rate": 9.999928884575976e-06, "loss": 0.0013, "step": 428, "step_time": 3.0513231139993877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.6835611760616302, "epoch": 0.00429, "frac_reward_zero_std": 0.0, "grad_norm": 0.07187142223119736, "kl": 1.1301250979304314, "learning_rate": 9.999928521280536e-06, "loss": 0.001, "num_tokens": 8704073.0, "reward": 0.10525877773761749, "reward_std": 0.0007617705268785357, "rewards/rollout_reward_func/mean": 0.10525877773761749, "rewards/rollout_reward_func/std": 0.0022673988714814186, "sampling/importance_sampling_ratio/max": 0.5549001693725586, "sampling/importance_sampling_ratio/mean": 0.538770318031311, "sampling/importance_sampling_ratio/min": 0.4152451157569885, "sampling/sampling_logp_difference/max": 0.6373658180236816, "sampling/sampling_logp_difference/mean": 0.3097766041755676, "step": 429, "step_time": 6.58192044399766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.6907349228858948, "epoch": 0.0043, "grad_norm": 0.006957888137549162, "kl": 1.122695803642273, "learning_rate": 9.999928157059513e-06, "loss": 0.0009, "step": 430, "step_time": 3.754892712997389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.930550068616867, "epoch": 0.00431, "frac_reward_zero_std": 0.0, "grad_norm": 0.02355450764298439, "kl": 1.211604230105877, "learning_rate": 9.99992779191291e-06, "loss": -0.0036, "num_tokens": 8746307.0, "reward": 0.10170510411262512, "reward_std": 0.010590854100883007, "rewards/rollout_reward_func/mean": 0.10170510411262512, "rewards/rollout_reward_func/std": 0.018668873235583305, "sampling/importance_sampling_ratio/max": 0.552848219871521, "sampling/importance_sampling_ratio/mean": 0.524114727973938, "sampling/importance_sampling_ratio/min": 6.808364982902049e-11, "sampling/sampling_logp_difference/max": 3.0836143493652344, "sampling/sampling_logp_difference/mean": 0.5082923173904419, "step": 431, "step_time": 5.727189257006103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9296302795410156, "epoch": 0.00432, "grad_norm": 0.02960243634879589, "kl": 1.2113270685076714, "learning_rate": 9.999927425840725e-06, "loss": -0.0036, "step": 432, "step_time": 3.0854539899955853 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8842803835868835, "epoch": 0.00433, "frac_reward_zero_std": 0.0, "grad_norm": 0.13376402854919434, "kl": 1.1748857498168945, "learning_rate": 9.999927058842958e-06, "loss": -0.0025, "num_tokens": 8788488.0, "reward": 0.0965985506772995, "reward_std": 0.010480819270014763, "rewards/rollout_reward_func/mean": 0.0965985506772995, "rewards/rollout_reward_func/std": 0.020270125940442085, "sampling/importance_sampling_ratio/max": 0.5520986914634705, "sampling/importance_sampling_ratio/mean": 0.5100770592689514, "sampling/importance_sampling_ratio/min": 0.00010261138231726363, "sampling/sampling_logp_difference/max": 4.835245132446289, "sampling/sampling_logp_difference/mean": 0.43661442399024963, "step": 433, "step_time": 5.893072790982842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.8908298015594482, "epoch": 0.00434, "grad_norm": 0.0906207412481308, "kl": 1.1652782335877419, "learning_rate": 9.999926690919612e-06, "loss": -0.0027, "step": 434, "step_time": 3.0792049109950312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.1000001430511475, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.308382272720337, "epoch": 0.00435, "frac_reward_zero_std": 0.0, "grad_norm": 0.08150126785039902, "kl": 0.9193316251039505, "learning_rate": 9.999926322070682e-06, "loss": -0.0144, "num_tokens": 8829875.0, "reward": 0.09514087438583374, "reward_std": 0.027883773669600487, "rewards/rollout_reward_func/mean": 0.09514087438583374, "rewards/rollout_reward_func/std": 0.03109660930931568, "sampling/importance_sampling_ratio/max": 0.5545209646224976, "sampling/importance_sampling_ratio/mean": 0.48661333322525024, "sampling/importance_sampling_ratio/min": 5.470319191386037e-15, "sampling/sampling_logp_difference/max": 5.0744147300720215, "sampling/sampling_logp_difference/mean": 0.747878909111023, "step": 435, "step_time": 6.453212299005827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.304262161254883, "epoch": 0.00436, "grad_norm": 0.16772998869419098, "kl": 0.9185117334127426, "learning_rate": 9.999925952296172e-06, "loss": -0.0147, "step": 436, "step_time": 3.55816607199813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.734295040369034, "epoch": 0.00437, "frac_reward_zero_std": 0.5, "grad_norm": 0.018815012648701668, "kl": 1.1601218357682228, "learning_rate": 9.99992558159608e-06, "loss": 0.0016, "num_tokens": 8870971.0, "reward": 0.3369315266609192, "reward_std": 0.000492552004288882, "rewards/rollout_reward_func/mean": 0.3369315266609192, "rewards/rollout_reward_func/std": 0.42415472865104675, "sampling/importance_sampling_ratio/max": 0.5509318113327026, "sampling/importance_sampling_ratio/mean": 0.5345597267150879, "sampling/importance_sampling_ratio/min": 0.4903438091278076, "sampling/sampling_logp_difference/max": 0.6758821606636047, "sampling/sampling_logp_difference/mean": 0.31386566162109375, "step": 437, "step_time": 5.878999984000984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7433095276355743, "epoch": 0.00438, "grad_norm": 0.0426999107003212, "kl": 1.1571246683597565, "learning_rate": 9.999925209970408e-06, "loss": 0.0015, "step": 438, "step_time": 3.107405872011441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.4193546772003174, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1812087297439575, "epoch": 0.00439, "frac_reward_zero_std": 0.25, "grad_norm": 0.045081570744514465, "kl": 0.9653512164950371, "learning_rate": 9.999924837419155e-06, "loss": -0.0106, "num_tokens": 8908915.0, "reward": 0.5945838093757629, "reward_std": 0.02146821655333042, "rewards/rollout_reward_func/mean": 0.5945838093757629, "rewards/rollout_reward_func/std": 0.5049446821212769, "sampling/importance_sampling_ratio/max": 0.6475657224655151, "sampling/importance_sampling_ratio/mean": 0.4945265054702759, "sampling/importance_sampling_ratio/min": 1.1787316225309041e-06, "sampling/sampling_logp_difference/max": 5.021643161773682, "sampling/sampling_logp_difference/mean": 0.5425955057144165, "step": 439, "step_time": 5.792061946995091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1788491308689117, "epoch": 0.0044, "grad_norm": 0.04368813335895538, "kl": 0.9647703468799591, "learning_rate": 9.999924463942322e-06, "loss": -0.0107, "step": 440, "step_time": 3.0487328030067147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7181134819984436, "epoch": 0.00441, "frac_reward_zero_std": 0.5, "grad_norm": 0.007758176885545254, "kl": 1.1325199827551842, "learning_rate": 9.999924089539907e-06, "loss": 0.0013, "num_tokens": 8949001.0, "reward": 0.35546717047691345, "reward_std": 0.00021613524586427957, "rewards/rollout_reward_func/mean": 0.35546717047691345, "rewards/rollout_reward_func/std": 0.4390576183795929, "sampling/importance_sampling_ratio/max": 0.5524361729621887, "sampling/importance_sampling_ratio/mean": 0.5361099243164062, "sampling/importance_sampling_ratio/min": 0.5150182843208313, "sampling/sampling_logp_difference/max": 0.661779522895813, "sampling/sampling_logp_difference/mean": 0.3119322955608368, "step": 441, "step_time": 6.395494129988947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7171662151813507, "epoch": 0.00442, "grad_norm": 0.008344382978975773, "kl": 1.1325877532362938, "learning_rate": 9.999923714211912e-06, "loss": 0.0013, "step": 442, "step_time": 3.5609445799927926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 2.125, "completions/mean_terminated_length": 2.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.877919316291809, "epoch": 0.00443, "frac_reward_zero_std": 0.5, "grad_norm": 0.06161070615053177, "kl": 1.0097579583525658, "learning_rate": 9.999923337958336e-06, "loss": -0.0038, "num_tokens": 8989247.0, "reward": 0.35039111971855164, "reward_std": 0.009577627293765545, "rewards/rollout_reward_func/mean": 0.35039111971855164, "rewards/rollout_reward_func/std": 0.44241026043891907, "sampling/importance_sampling_ratio/max": 0.8192674517631531, "sampling/importance_sampling_ratio/mean": 0.5258864164352417, "sampling/importance_sampling_ratio/min": 0.00016126171976793557, "sampling/sampling_logp_difference/max": 4.420963287353516, "sampling/sampling_logp_difference/mean": 0.41757825016975403, "step": 443, "step_time": 5.838284781006223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8866722881793976, "epoch": 0.00444, "grad_norm": 0.14290612936019897, "kl": 1.0144018605351448, "learning_rate": 9.99992296077918e-06, "loss": -0.0043, "step": 444, "step_time": 3.093900820000272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.743689000606537, "epoch": 0.00445, "frac_reward_zero_std": 0.5, "grad_norm": 0.1949336677789688, "kl": 1.2524998784065247, "learning_rate": 9.999922582674445e-06, "loss": 0.002, "num_tokens": 9030575.0, "reward": -0.15759995579719543, "reward_std": 0.00046386668691411614, "rewards/rollout_reward_func/mean": -0.15759995579719543, "rewards/rollout_reward_func/std": 0.4474571943283081, "sampling/importance_sampling_ratio/max": 0.5602570176124573, "sampling/importance_sampling_ratio/mean": 0.5402560234069824, "sampling/importance_sampling_ratio/min": 0.4965556859970093, "sampling/sampling_logp_difference/max": 0.6753796339035034, "sampling/sampling_logp_difference/mean": 0.31320029497146606, "step": 445, "step_time": 5.90837532900332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7629294991493225, "epoch": 0.00446, "grad_norm": 0.14090707898139954, "kl": 1.248983919620514, "learning_rate": 9.999922203644126e-06, "loss": 0.0014, "step": 446, "step_time": 3.0690266270103166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1318198442459106, "epoch": 0.00447, "frac_reward_zero_std": 0.0, "grad_norm": 0.14261695742607117, "kl": 1.0193668082356453, "learning_rate": 9.99992182368823e-06, "loss": -0.0084, "num_tokens": 9072151.0, "reward": 0.09321726858615875, "reward_std": 0.01745021715760231, "rewards/rollout_reward_func/mean": 0.09321726858615875, "rewards/rollout_reward_func/std": 0.026682652533054352, "sampling/importance_sampling_ratio/max": 0.5556139349937439, "sampling/importance_sampling_ratio/mean": 0.5026503801345825, "sampling/importance_sampling_ratio/min": 1.0125966905250028e-11, "sampling/sampling_logp_difference/max": 4.505624294281006, "sampling/sampling_logp_difference/mean": 0.6571238040924072, "step": 447, "step_time": 6.4473275109994574 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.1287147402763367, "epoch": 0.00448, "grad_norm": 0.042559485882520676, "kl": 1.023899957537651, "learning_rate": 9.999921442806754e-06, "loss": -0.0085, "step": 448, "step_time": 3.5804524940031115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7095624208450317, "epoch": 0.00449, "frac_reward_zero_std": 0.5, "grad_norm": 0.02973693609237671, "kl": 1.2340107858181, "learning_rate": 9.999921060999696e-06, "loss": 0.0011, "num_tokens": 9111464.0, "reward": 0.3571970760822296, "reward_std": 0.0002021922409767285, "rewards/rollout_reward_func/mean": 0.3571970760822296, "rewards/rollout_reward_func/std": 0.4380638301372528, "sampling/importance_sampling_ratio/max": 0.5565949082374573, "sampling/importance_sampling_ratio/mean": 0.5375528335571289, "sampling/importance_sampling_ratio/min": 0.45937126874923706, "sampling/sampling_logp_difference/max": 0.6518743634223938, "sampling/sampling_logp_difference/mean": 0.31068602204322815, "step": 449, "step_time": 5.696552839996002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.708295911550522, "epoch": 0.0045, "grad_norm": 0.19239631295204163, "kl": 1.2210954502224922, "learning_rate": 9.99992067826706e-06, "loss": 0.0014, "step": 450, "step_time": 3.055624339991482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.896091639995575, "epoch": 0.00451, "frac_reward_zero_std": 0.25, "grad_norm": 0.008821312338113785, "kl": 0.9796069264411926, "learning_rate": 9.999920294608844e-06, "loss": -0.0039, "num_tokens": 9152971.0, "reward": 0.3494406044483185, "reward_std": 0.00893965270370245, "rewards/rollout_reward_func/mean": 0.3494406044483185, "rewards/rollout_reward_func/std": 0.4315055012702942, "sampling/importance_sampling_ratio/max": 0.5581957101821899, "sampling/importance_sampling_ratio/mean": 0.5245254635810852, "sampling/importance_sampling_ratio/min": 1.4688904537152325e-09, "sampling/sampling_logp_difference/max": 2.937666654586792, "sampling/sampling_logp_difference/mean": 0.4744534492492676, "step": 451, "step_time": 5.736738471015997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8947673439979553, "epoch": 0.00452, "grad_norm": 0.008754800073802471, "kl": 0.9797859117388725, "learning_rate": 9.999919910025047e-06, "loss": -0.004, "step": 452, "step_time": 3.078919137005869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 3.3125, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.422116309404373, "epoch": 0.00453, "frac_reward_zero_std": 0.0, "grad_norm": 0.021231329068541527, "kl": 1.077489361166954, "learning_rate": 9.999919524515672e-06, "loss": -0.0141, "num_tokens": 9193946.0, "reward": 0.3449193835258484, "reward_std": 0.02778114750981331, "rewards/rollout_reward_func/mean": 0.3449193835258484, "rewards/rollout_reward_func/std": 0.4387126564979553, "sampling/importance_sampling_ratio/max": 0.5558863878250122, "sampling/importance_sampling_ratio/mean": 0.48779526352882385, "sampling/importance_sampling_ratio/min": 1.2984136708242121e-14, "sampling/sampling_logp_difference/max": 4.517063140869141, "sampling/sampling_logp_difference/mean": 0.8613619804382324, "step": 453, "step_time": 6.711300229013432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4198447167873383, "epoch": 0.00454, "grad_norm": 0.03543883562088013, "kl": 1.0769778415560722, "learning_rate": 9.999919138080717e-06, "loss": -0.0141, "step": 454, "step_time": 3.0941112610016717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.961470514535904, "epoch": 0.00455, "frac_reward_zero_std": 0.25, "grad_norm": 0.04734553396701813, "kl": 0.9307474568486214, "learning_rate": 9.999918750720182e-06, "loss": -0.0038, "num_tokens": 9233016.0, "reward": 0.0611567348241806, "reward_std": 0.009611873887479305, "rewards/rollout_reward_func/mean": 0.0611567348241806, "rewards/rollout_reward_func/std": 0.7727709412574768, "sampling/importance_sampling_ratio/max": 0.5524157881736755, "sampling/importance_sampling_ratio/mean": 0.5225967168807983, "sampling/importance_sampling_ratio/min": 1.9738126877302115e-13, "sampling/sampling_logp_difference/max": 3.8511829376220703, "sampling/sampling_logp_difference/mean": 0.5732860565185547, "step": 455, "step_time": 5.77700033900328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9525896310806274, "epoch": 0.00456, "grad_norm": 0.024526406079530716, "kl": 0.9323331788182259, "learning_rate": 9.99991836243407e-06, "loss": -0.0037, "step": 456, "step_time": 3.0714843070163624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.749163508415222, "epoch": 0.00457, "frac_reward_zero_std": 0.25, "grad_norm": 0.16673530638217926, "kl": 1.0703470408916473, "learning_rate": 9.999917973222375e-06, "loss": -0.0006, "num_tokens": 9274466.0, "reward": -0.16849292814731598, "reward_std": 0.0004664571606554091, "rewards/rollout_reward_func/mean": -0.16849292814731598, "rewards/rollout_reward_func/std": 0.4811290204524994, "sampling/importance_sampling_ratio/max": 0.6929132342338562, "sampling/importance_sampling_ratio/mean": 0.535216748714447, "sampling/importance_sampling_ratio/min": 0.2163926661014557, "sampling/sampling_logp_difference/max": 0.8983720541000366, "sampling/sampling_logp_difference/mean": 0.3262324929237366, "step": 457, "step_time": 5.972061799991934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.73340567946434, "epoch": 0.00458, "grad_norm": 0.10521449148654938, "kl": 1.0679423958063126, "learning_rate": 9.999917583085104e-06, "loss": -0.0006, "step": 458, "step_time": 3.5983961829988402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.6877855360507965, "epoch": 0.00459, "frac_reward_zero_std": 0.75, "grad_norm": 0.002841962967067957, "kl": 0.9765288755297661, "learning_rate": 9.999917192022251e-06, "loss": 0.0013, "num_tokens": 9310862.0, "reward": 0.3141860365867615, "reward_std": 4.868111864197999e-05, "rewards/rollout_reward_func/mean": 0.3141860365867615, "rewards/rollout_reward_func/std": 0.9043595194816589, "sampling/importance_sampling_ratio/max": 0.5505468249320984, "sampling/importance_sampling_ratio/mean": 0.5413336753845215, "sampling/importance_sampling_ratio/min": 0.5334103107452393, "sampling/sampling_logp_difference/max": 0.6282782554626465, "sampling/sampling_logp_difference/mean": 0.3069126009941101, "step": 459, "step_time": 6.14748296700418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6851991415023804, "epoch": 0.0046, "grad_norm": 0.0027534456457942724, "kl": 0.9763061851263046, "learning_rate": 9.999916800033823e-06, "loss": 0.0013, "step": 460, "step_time": 2.9414823460028856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.660596549510956, "epoch": 0.00461, "frac_reward_zero_std": 0.25, "grad_norm": 0.08066925406455994, "kl": 1.318151980638504, "learning_rate": 9.999916407119812e-06, "loss": 0.0018, "num_tokens": 9352896.0, "reward": 0.10801655054092407, "reward_std": 0.003147308249026537, "rewards/rollout_reward_func/mean": 0.10801655054092407, "rewards/rollout_reward_func/std": 0.0043716575019061565, "sampling/importance_sampling_ratio/max": 0.5602051019668579, "sampling/importance_sampling_ratio/mean": 0.5451946258544922, "sampling/importance_sampling_ratio/min": 0.5233088731765747, "sampling/sampling_logp_difference/max": 0.6196065545082092, "sampling/sampling_logp_difference/mean": 0.30345094203948975, "step": 461, "step_time": 5.648663913001656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.66501846909523, "epoch": 0.00462, "grad_norm": 0.11379145085811615, "kl": 1.3136860728263855, "learning_rate": 9.999916013280226e-06, "loss": 0.0015, "step": 462, "step_time": 3.0400655060147983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.6447930335998535, "epoch": 0.00463, "frac_reward_zero_std": 0.0, "grad_norm": 0.04842361435294151, "kl": 1.0901550129055977, "learning_rate": 9.999915618515059e-06, "loss": 0.0008, "num_tokens": 9395194.0, "reward": 0.10449589788913727, "reward_std": 0.0004907778347842395, "rewards/rollout_reward_func/mean": 0.10449589788913727, "rewards/rollout_reward_func/std": 0.0007726186886429787, "sampling/importance_sampling_ratio/max": 0.558030366897583, "sampling/importance_sampling_ratio/mean": 0.5407710671424866, "sampling/importance_sampling_ratio/min": 0.471336305141449, "sampling/sampling_logp_difference/max": 0.6335321068763733, "sampling/sampling_logp_difference/mean": 0.30832725763320923, "step": 463, "step_time": 5.761668926010316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.6462598741054535, "epoch": 0.00464, "grad_norm": 0.03338288143277168, "kl": 1.087962567806244, "learning_rate": 9.999915222824314e-06, "loss": 0.0007, "step": 464, "step_time": 3.5085811969911447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.647010177373886, "epoch": 0.00465, "frac_reward_zero_std": 0.5, "grad_norm": 0.005422234069555998, "kl": 1.1719671189785004, "learning_rate": 9.999914826207992e-06, "loss": 0.001, "num_tokens": 9434148.0, "reward": 0.6020398736000061, "reward_std": 0.0005678617162629962, "rewards/rollout_reward_func/mean": 0.6020398736000061, "rewards/rollout_reward_func/std": 0.5059764981269836, "sampling/importance_sampling_ratio/max": 0.9884330034255981, "sampling/importance_sampling_ratio/mean": 0.5569239854812622, "sampling/importance_sampling_ratio/min": 0.5324875712394714, "sampling/sampling_logp_difference/max": 0.6298899054527283, "sampling/sampling_logp_difference/mean": 0.3148520886898041, "step": 465, "step_time": 6.169592013990041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6522029042243958, "epoch": 0.00466, "grad_norm": 0.012538041919469833, "kl": 1.1711322963237762, "learning_rate": 9.99991442866609e-06, "loss": 0.001, "step": 466, "step_time": 3.069545349011605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 3.3125, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2995337545871735, "epoch": 0.00467, "frac_reward_zero_std": 0.0, "grad_norm": 0.005546664819121361, "kl": 0.9911949411034584, "learning_rate": 9.999914030198609e-06, "loss": -0.0144, "num_tokens": 9470511.0, "reward": 0.5894797444343567, "reward_std": 0.025651250034570694, "rewards/rollout_reward_func/mean": 0.5894797444343567, "rewards/rollout_reward_func/std": 0.5104472041130066, "sampling/importance_sampling_ratio/max": 0.5521209836006165, "sampling/importance_sampling_ratio/mean": 0.48860475420951843, "sampling/importance_sampling_ratio/min": 6.495149436466698e-15, "sampling/sampling_logp_difference/max": 4.621286869049072, "sampling/sampling_logp_difference/mean": 1.006779432296753, "step": 467, "step_time": 5.818309629001305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3081510961055756, "epoch": 0.00468, "grad_norm": 0.00550453644245863, "kl": 0.9896981474012136, "learning_rate": 9.999913630805554e-06, "loss": -0.0144, "step": 468, "step_time": 3.0409024650070933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7931310832500458, "epoch": 0.00469, "frac_reward_zero_std": 0.5, "grad_norm": 0.04087354987859726, "kl": 0.9596055373549461, "learning_rate": 9.999913230486916e-06, "loss": -0.004, "num_tokens": 9510103.0, "reward": 0.5833223462104797, "reward_std": 0.005738573148846626, "rewards/rollout_reward_func/mean": 0.5833223462104797, "rewards/rollout_reward_func/std": 0.49945905804634094, "sampling/importance_sampling_ratio/max": 0.557659387588501, "sampling/importance_sampling_ratio/mean": 0.509528636932373, "sampling/importance_sampling_ratio/min": 2.953791636173264e-06, "sampling/sampling_logp_difference/max": 2.5759854316711426, "sampling/sampling_logp_difference/mean": 0.41752859950065613, "step": 469, "step_time": 5.716695035000157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8025714457035065, "epoch": 0.0047, "grad_norm": 0.032887183129787445, "kl": 0.9527268707752228, "learning_rate": 9.999912829242704e-06, "loss": -0.004, "step": 470, "step_time": 3.5224401319937897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9277610778808594, "epoch": 0.00471, "frac_reward_zero_std": 0.75, "grad_norm": 0.004006526432931423, "kl": 0.8843130469322205, "learning_rate": 9.999912427072911e-06, "loss": -0.004, "num_tokens": 9547477.0, "reward": 0.05603373050689697, "reward_std": 0.007317809853702784, "rewards/rollout_reward_func/mean": 0.05603373050689697, "rewards/rollout_reward_func/std": 1.0657845735549927, "sampling/importance_sampling_ratio/max": 0.5557239055633545, "sampling/importance_sampling_ratio/mean": 0.5240480899810791, "sampling/importance_sampling_ratio/min": 1.9429446387279198e-13, "sampling/sampling_logp_difference/max": 4.0564470291137695, "sampling/sampling_logp_difference/mean": 0.571541965007782, "step": 471, "step_time": 5.987312760022178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9223932325839996, "epoch": 0.00472, "grad_norm": 0.004040502943098545, "kl": 0.8851438611745834, "learning_rate": 9.999912023977543e-06, "loss": -0.004, "step": 472, "step_time": 2.9066303090075962 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9670132100582123, "epoch": 0.00473, "frac_reward_zero_std": 0.0, "grad_norm": 0.16038697957992554, "kl": 1.1969030275940895, "learning_rate": 9.999911619956595e-06, "loss": -0.0069, "num_tokens": 9589673.0, "reward": 0.09560089558362961, "reward_std": 0.01107706967741251, "rewards/rollout_reward_func/mean": 0.09560089558362961, "rewards/rollout_reward_func/std": 0.020523136481642723, "sampling/importance_sampling_ratio/max": 0.970779538154602, "sampling/importance_sampling_ratio/mean": 0.5252382755279541, "sampling/importance_sampling_ratio/min": 1.4472556833511341e-11, "sampling/sampling_logp_difference/max": 4.050342559814453, "sampling/sampling_logp_difference/mean": 0.5643775463104248, "step": 473, "step_time": 5.8612524250056595 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 2.987207591533661, "epoch": 0.00474, "grad_norm": 0.05307908356189728, "kl": 1.0673877000808716, "learning_rate": 9.999911215010072e-06, "loss": -0.0074, "step": 474, "step_time": 3.09597721001046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.876398116350174, "epoch": 0.00475, "frac_reward_zero_std": 0.25, "grad_norm": 0.06296765804290771, "kl": 0.9788105934858322, "learning_rate": 9.99991080913797e-06, "loss": -0.0045, "num_tokens": 9628165.0, "reward": 0.6025270223617554, "reward_std": 0.010603957809507847, "rewards/rollout_reward_func/mean": 0.6025270223617554, "rewards/rollout_reward_func/std": 0.5029216408729553, "sampling/importance_sampling_ratio/max": 0.7855824828147888, "sampling/importance_sampling_ratio/mean": 0.5294749140739441, "sampling/importance_sampling_ratio/min": 2.8265867868526584e-08, "sampling/sampling_logp_difference/max": 2.68607759475708, "sampling/sampling_logp_difference/mean": 0.4493069648742676, "step": 475, "step_time": 5.917479548006668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8801376819610596, "epoch": 0.00476, "grad_norm": 0.06454192847013474, "kl": 0.9792990386486053, "learning_rate": 9.999910402340289e-06, "loss": -0.0045, "step": 476, "step_time": 3.5076979880032013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.999976873397827, "epoch": 0.00477, "frac_reward_zero_std": 0.25, "grad_norm": 0.3056122958660126, "kl": 1.143636092543602, "learning_rate": 9.999909994617032e-06, "loss": -0.0053, "num_tokens": 9668987.0, "reward": 0.10166998207569122, "reward_std": 0.009171811863780022, "rewards/rollout_reward_func/mean": 0.10166998207569122, "rewards/rollout_reward_func/std": 0.01872175559401512, "sampling/importance_sampling_ratio/max": 1.1166965961456299, "sampling/importance_sampling_ratio/mean": 0.5188345909118652, "sampling/importance_sampling_ratio/min": 2.6191345641780883e-13, "sampling/sampling_logp_difference/max": 4.580442428588867, "sampling/sampling_logp_difference/mean": 0.6363483667373657, "step": 477, "step_time": 6.295597207994433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.0031436383724213, "epoch": 0.00478, "grad_norm": 0.14710178971290588, "kl": 0.9742936342954636, "learning_rate": 9.9999095859682e-06, "loss": -0.0029, "step": 478, "step_time": 3.0709809370018775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9535921216011047, "epoch": 0.00479, "frac_reward_zero_std": 0.75, "grad_norm": 0.7165324091911316, "kl": 0.9907315373420715, "learning_rate": 9.99990917639379e-06, "loss": 0.0043, "num_tokens": 9706263.0, "reward": 0.47077253460884094, "reward_std": 0.13119207322597504, "rewards/rollout_reward_func/mean": 0.47077253460884094, "rewards/rollout_reward_func/std": 0.7160126566886902, "sampling/importance_sampling_ratio/max": 0.7382158041000366, "sampling/importance_sampling_ratio/mean": 0.5229501724243164, "sampling/importance_sampling_ratio/min": 3.3510329927599614e-11, "sampling/sampling_logp_difference/max": 3.8995471000671387, "sampling/sampling_logp_difference/mean": 0.5262254476547241, "step": 479, "step_time": 5.569471561997489 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.034722222248092294, "entropy": 2.919057250022888, "epoch": 0.0048, "grad_norm": 0.7439448237419128, "kl": 2.4015181809663773, "learning_rate": 9.999908765893802e-06, "loss": 0.0058, "step": 480, "step_time": 2.918934793997323 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7884040474891663, "epoch": 0.00481, "frac_reward_zero_std": 0.5, "grad_norm": 0.6425349712371826, "kl": 1.1533272564411163, "learning_rate": 9.999908354468237e-06, "loss": -0.003, "num_tokens": 9748013.0, "reward": 0.060738492757081985, "reward_std": 0.1156291514635086, "rewards/rollout_reward_func/mean": 0.060738492757081985, "rewards/rollout_reward_func/std": 0.23100529611110687, "sampling/importance_sampling_ratio/max": 0.6532464623451233, "sampling/importance_sampling_ratio/mean": 0.5239980220794678, "sampling/importance_sampling_ratio/min": 0.20038093626499176, "sampling/sampling_logp_difference/max": 0.9598023891448975, "sampling/sampling_logp_difference/mean": 0.3525767922401428, "step": 481, "step_time": 6.007366552003077 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 2.7646283507347107, "epoch": 0.00482, "grad_norm": 0.2952437102794647, "kl": 1.9175776839256287, "learning_rate": 9.999907942117095e-06, "loss": -0.0034, "step": 482, "step_time": 3.5431507369939936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.322580575942993, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2464130520820618, "epoch": 0.00483, "frac_reward_zero_std": 0.0, "grad_norm": 0.013485653325915337, "kl": 1.0659892708063126, "learning_rate": 9.999907528840379e-06, "loss": -0.0092, "num_tokens": 9791069.0, "reward": 0.09781157970428467, "reward_std": 0.018528103828430176, "rewards/rollout_reward_func/mean": 0.09781157970428467, "rewards/rollout_reward_func/std": 0.025700338184833527, "sampling/importance_sampling_ratio/max": 0.5682226419448853, "sampling/importance_sampling_ratio/mean": 0.5087196826934814, "sampling/importance_sampling_ratio/min": 3.270833473859036e-12, "sampling/sampling_logp_difference/max": 4.343385219573975, "sampling/sampling_logp_difference/mean": 0.7923517823219299, "step": 483, "step_time": 6.687501880995114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.240082412958145, "epoch": 0.00484, "grad_norm": 0.013662425801157951, "kl": 1.067462146282196, "learning_rate": 9.999907114638084e-06, "loss": -0.0093, "step": 484, "step_time": 3.213321442999586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8673077523708344, "epoch": 0.00485, "frac_reward_zero_std": 0.25, "grad_norm": 0.1645040214061737, "kl": 1.0000761970877647, "learning_rate": 9.999906699510213e-06, "loss": -0.0031, "num_tokens": 9831583.0, "reward": 0.3512289226055145, "reward_std": 0.009537361562252045, "rewards/rollout_reward_func/mean": 0.3512289226055145, "rewards/rollout_reward_func/std": 0.43427085876464844, "sampling/importance_sampling_ratio/max": 0.6285489797592163, "sampling/importance_sampling_ratio/mean": 0.5275900363922119, "sampling/importance_sampling_ratio/min": 2.66933248838086e-08, "sampling/sampling_logp_difference/max": 3.210526466369629, "sampling/sampling_logp_difference/mean": 0.44869160652160645, "step": 485, "step_time": 5.7710697679940495 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.871576577425003, "epoch": 0.00486, "grad_norm": 0.009449208155274391, "kl": 1.0057553127408028, "learning_rate": 9.999906283456766e-06, "loss": -0.0034, "step": 486, "step_time": 3.078282703005243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.694427102804184, "epoch": 0.00487, "frac_reward_zero_std": 0.25, "grad_norm": 0.007759067229926586, "kl": 1.0113678574562073, "learning_rate": 9.999905866477743e-06, "loss": 0.0009, "num_tokens": 9874713.0, "reward": 0.10498730093240738, "reward_std": 0.0002978151896968484, "rewards/rollout_reward_func/mean": 0.10498730093240738, "rewards/rollout_reward_func/std": 0.0006212832522578537, "sampling/importance_sampling_ratio/max": 0.5542179942131042, "sampling/importance_sampling_ratio/mean": 0.537543773651123, "sampling/importance_sampling_ratio/min": 0.5245906114578247, "sampling/sampling_logp_difference/max": 0.6448960900306702, "sampling/sampling_logp_difference/mean": 0.31044191122055054, "step": 487, "step_time": 6.5372335630017915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6949787735939026, "epoch": 0.00488, "grad_norm": 0.007755948230624199, "kl": 1.0114487558603287, "learning_rate": 9.999905448573144e-06, "loss": 0.0009, "step": 488, "step_time": 3.170108133010217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8055759370326996, "epoch": 0.00489, "frac_reward_zero_std": 0.75, "grad_norm": 0.004924205597490072, "kl": 1.2360328361392021, "learning_rate": 9.999905029742968e-06, "loss": 0.0018, "num_tokens": 9915189.0, "reward": 0.02923479676246643, "reward_std": 0.0002675300929695368, "rewards/rollout_reward_func/mean": 0.02923479676246643, "rewards/rollout_reward_func/std": 0.6956003308296204, "sampling/importance_sampling_ratio/max": 0.5543280243873596, "sampling/importance_sampling_ratio/mean": 0.5336862206459045, "sampling/importance_sampling_ratio/min": 0.47337785363197327, "sampling/sampling_logp_difference/max": 0.6654193997383118, "sampling/sampling_logp_difference/mean": 0.3141803443431854, "step": 489, "step_time": 6.175441724997654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8139052391052246, "epoch": 0.0049, "grad_norm": 0.004935828968882561, "kl": 1.2350043132901192, "learning_rate": 9.99990460998722e-06, "loss": 0.0017, "step": 490, "step_time": 3.0520327520061983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.6903190314769745, "epoch": 0.00491, "frac_reward_zero_std": 0.5, "grad_norm": 0.011650137603282928, "kl": 1.142210640013218, "learning_rate": 9.999904189305892e-06, "loss": 0.0016, "num_tokens": 9954915.0, "reward": 0.6022869944572449, "reward_std": 0.0004891662974841893, "rewards/rollout_reward_func/mean": 0.6022869944572449, "rewards/rollout_reward_func/std": 0.5059731006622314, "sampling/importance_sampling_ratio/max": 0.5547549724578857, "sampling/importance_sampling_ratio/mean": 0.5420206785202026, "sampling/importance_sampling_ratio/min": 0.5303077101707458, "sampling/sampling_logp_difference/max": 0.6339887380599976, "sampling/sampling_logp_difference/mean": 0.3068014085292816, "step": 491, "step_time": 5.825483778986381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6993240118026733, "epoch": 0.00492, "grad_norm": 0.013608118519186974, "kl": 1.1413128674030304, "learning_rate": 9.999903767698988e-06, "loss": 0.0017, "step": 492, "step_time": 3.065422330997535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 2.21875, "completions/mean_terminated_length": 2.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1976709067821503, "epoch": 0.00493, "frac_reward_zero_std": 0.0, "grad_norm": 0.22274678945541382, "kl": 1.0953970775008202, "learning_rate": 9.999903345166511e-06, "loss": 0.0019, "num_tokens": 9997264.0, "reward": 0.044681914150714874, "reward_std": 0.09423120319843292, "rewards/rollout_reward_func/mean": 0.044681914150714874, "rewards/rollout_reward_func/std": 0.20536279678344727, "sampling/importance_sampling_ratio/max": 0.577857494354248, "sampling/importance_sampling_ratio/mean": 0.4966244101524353, "sampling/importance_sampling_ratio/min": 5.040499786446162e-07, "sampling/sampling_logp_difference/max": 3.776716709136963, "sampling/sampling_logp_difference/mean": 0.4897999167442322, "step": 493, "step_time": 6.568301665989566 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.2227938771247864, "epoch": 0.00494, "grad_norm": 0.12340746074914932, "kl": 0.9870791807770729, "learning_rate": 9.999902921708457e-06, "loss": 0.0007, "step": 494, "step_time": 3.663899361003132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.707371324300766, "epoch": 0.00495, "frac_reward_zero_std": 0.25, "grad_norm": 0.010833553969860077, "kl": 1.1219364926218987, "learning_rate": 9.999902497324827e-06, "loss": 0.0012, "num_tokens": 10036808.0, "reward": 0.35342180728912354, "reward_std": 0.0007837978191673756, "rewards/rollout_reward_func/mean": 0.35342180728912354, "rewards/rollout_reward_func/std": 0.43990081548690796, "sampling/importance_sampling_ratio/max": 0.554828405380249, "sampling/importance_sampling_ratio/mean": 0.539043664932251, "sampling/importance_sampling_ratio/min": 0.5268253684043884, "sampling/sampling_logp_difference/max": 0.6405839323997498, "sampling/sampling_logp_difference/mean": 0.30901965498924255, "step": 495, "step_time": 5.879934859003697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7134980857372284, "epoch": 0.00496, "grad_norm": 0.01070617139339447, "kl": 1.1215835437178612, "learning_rate": 9.999902072015623e-06, "loss": 0.0012, "step": 496, "step_time": 3.071812668997154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7082372903823853, "epoch": 0.00497, "frac_reward_zero_std": 0.75, "grad_norm": 0.00440718699246645, "kl": 1.0161895155906677, "learning_rate": 9.999901645780843e-06, "loss": 0.0011, "num_tokens": 10075681.0, "reward": 0.6038825511932373, "reward_std": 0.00011364868259988725, "rewards/rollout_reward_func/mean": 0.6038825511932373, "rewards/rollout_reward_func/std": 0.5074596405029297, "sampling/importance_sampling_ratio/max": 0.5589516162872314, "sampling/importance_sampling_ratio/mean": 0.5406375527381897, "sampling/importance_sampling_ratio/min": 0.5297780632972717, "sampling/sampling_logp_difference/max": 0.6347733736038208, "sampling/sampling_logp_difference/mean": 0.3075501322746277, "step": 497, "step_time": 5.557151087996317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7135637998580933, "epoch": 0.00498, "grad_norm": 0.004454456269741058, "kl": 1.0156559571623802, "learning_rate": 9.99990121862049e-06, "loss": 0.0011, "step": 498, "step_time": 3.04721496100683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.451612949371338, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.106283336877823, "epoch": 0.00499, "frac_reward_zero_std": 0.25, "grad_norm": 0.14803734421730042, "kl": 1.0528529062867165, "learning_rate": 9.99990079053456e-06, "loss": -0.0096, "num_tokens": 10115713.0, "reward": 0.09114229679107666, "reward_std": 0.018039075657725334, "rewards/rollout_reward_func/mean": 0.09114229679107666, "rewards/rollout_reward_func/std": 0.7195046544075012, "sampling/importance_sampling_ratio/max": 1.03659987449646, "sampling/importance_sampling_ratio/mean": 0.5225780010223389, "sampling/importance_sampling_ratio/min": 2.4061228565130932e-09, "sampling/sampling_logp_difference/max": 2.8250977993011475, "sampling/sampling_logp_difference/mean": 0.5063554048538208, "step": 499, "step_time": 6.187183088011807 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 3.105537533760071, "epoch": 0.005, "grad_norm": 0.05959828943014145, "kl": 1.0528444647789001, "learning_rate": 9.999900361523054e-06, "loss": -0.0097, "step": 500, "step_time": 3.5412552110065008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9009963274002075, "epoch": 0.00501, "frac_reward_zero_std": 0.25, "grad_norm": 0.08279508352279663, "kl": 1.0646364837884903, "learning_rate": 9.999899931585976e-06, "loss": -0.0012, "num_tokens": 10154228.0, "reward": 0.23024408519268036, "reward_std": 0.13625898957252502, "rewards/rollout_reward_func/mean": 0.23024408519268036, "rewards/rollout_reward_func/std": 0.6102222204208374, "sampling/importance_sampling_ratio/max": 0.5550225973129272, "sampling/importance_sampling_ratio/mean": 0.5233029127120972, "sampling/importance_sampling_ratio/min": 0.2967689037322998, "sampling/sampling_logp_difference/max": 0.6926403045654297, "sampling/sampling_logp_difference/mean": 0.3269176781177521, "step": 501, "step_time": 5.751668721015449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.901751697063446, "epoch": 0.00502, "grad_norm": 0.09990847110748291, "kl": 1.0656465664505959, "learning_rate": 9.999899500723323e-06, "loss": -0.0011, "step": 502, "step_time": 3.060891722998349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7468566596508026, "epoch": 0.00503, "frac_reward_zero_std": 0.25, "grad_norm": 0.006550893187522888, "kl": 1.0860062576830387, "learning_rate": 9.999899068935093e-06, "loss": 0.0012, "num_tokens": 10196518.0, "reward": 0.10499174892902374, "reward_std": 0.0013068015687167645, "rewards/rollout_reward_func/mean": 0.10499174892902374, "rewards/rollout_reward_func/std": 0.0026007811538875103, "sampling/importance_sampling_ratio/max": 0.5557186603546143, "sampling/importance_sampling_ratio/mean": 0.5372817516326904, "sampling/importance_sampling_ratio/min": 0.49925291538238525, "sampling/sampling_logp_difference/max": 0.6413341164588928, "sampling/sampling_logp_difference/mean": 0.3107408881187439, "step": 503, "step_time": 5.800437399011571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.750515937805176, "epoch": 0.00504, "grad_norm": 0.006801298353821039, "kl": 1.0852780938148499, "learning_rate": 9.99989863622129e-06, "loss": 0.0012, "step": 504, "step_time": 3.5082135110060335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2486165165901184, "epoch": 0.00505, "frac_reward_zero_std": 0.25, "grad_norm": 0.4545231759548187, "kl": 1.0012484788894653, "learning_rate": 9.999898202581914e-06, "loss": -0.0093, "num_tokens": 10237409.0, "reward": 0.3485436737537384, "reward_std": 0.021171454340219498, "rewards/rollout_reward_func/mean": 0.3485436737537384, "rewards/rollout_reward_func/std": 0.4359656274318695, "sampling/importance_sampling_ratio/max": 0.559451162815094, "sampling/importance_sampling_ratio/mean": 0.502482533454895, "sampling/importance_sampling_ratio/min": 5.087229682648742e-12, "sampling/sampling_logp_difference/max": 3.5960607528686523, "sampling/sampling_logp_difference/mean": 0.699235200881958, "step": 505, "step_time": 5.992933266003092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.219682276248932, "epoch": 0.00506, "grad_norm": 0.012208974920213223, "kl": 1.0277426317334175, "learning_rate": 9.999897768016961e-06, "loss": -0.0096, "step": 506, "step_time": 3.546885392992408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1008513271808624, "epoch": 0.00507, "frac_reward_zero_std": 0.25, "grad_norm": 0.08828871697187424, "kl": 1.1104530990123749, "learning_rate": 9.999897332526437e-06, "loss": -0.0037, "num_tokens": 10279625.0, "reward": 0.10115516185760498, "reward_std": 0.009879503399133682, "rewards/rollout_reward_func/mean": 0.10115516185760498, "rewards/rollout_reward_func/std": 0.018520191311836243, "sampling/importance_sampling_ratio/max": 0.6631974577903748, "sampling/importance_sampling_ratio/mean": 0.5186464786529541, "sampling/importance_sampling_ratio/min": 4.123104635489483e-12, "sampling/sampling_logp_difference/max": 3.366384506225586, "sampling/sampling_logp_difference/mean": 0.5489731431007385, "step": 507, "step_time": 6.08201069298957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.08513543009758, "epoch": 0.00508, "grad_norm": 0.05580168217420578, "kl": 1.112167801707983, "learning_rate": 9.999896896110337e-06, "loss": -0.0037, "step": 508, "step_time": 3.1705585829840857 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.058317691087723, "epoch": 0.00509, "frac_reward_zero_std": 0.25, "grad_norm": 0.1491605043411255, "kl": 1.0639017969369888, "learning_rate": 9.999896458768663e-06, "loss": -0.0, "num_tokens": 10320617.0, "reward": -0.14306437969207764, "reward_std": 0.0978630781173706, "rewards/rollout_reward_func/mean": -0.14306437969207764, "rewards/rollout_reward_func/std": 0.46047133207321167, "sampling/importance_sampling_ratio/max": 0.5756272077560425, "sampling/importance_sampling_ratio/mean": 0.4804133176803589, "sampling/importance_sampling_ratio/min": 6.362763087963685e-06, "sampling/sampling_logp_difference/max": 2.602023124694824, "sampling/sampling_logp_difference/mean": 0.4275707006454468, "step": 509, "step_time": 6.002312513985089 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.0398890376091003, "epoch": 0.0051, "grad_norm": 0.39512911438941956, "kl": 1.082362376153469, "learning_rate": 9.999896020501416e-06, "loss": 0.0003, "step": 510, "step_time": 3.5122473480078042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1449178755283356, "epoch": 0.00511, "frac_reward_zero_std": 0.25, "grad_norm": 0.45319753885269165, "kl": 1.1309495270252228, "learning_rate": 9.999895581308597e-06, "loss": -0.001, "num_tokens": 10362121.0, "reward": 0.09946414828300476, "reward_std": 0.0010313661769032478, "rewards/rollout_reward_func/mean": 0.09946414828300476, "rewards/rollout_reward_func/std": 0.010205616243183613, "sampling/importance_sampling_ratio/max": 0.6331384181976318, "sampling/importance_sampling_ratio/mean": 0.5019251108169556, "sampling/importance_sampling_ratio/min": 0.24399547278881073, "sampling/sampling_logp_difference/max": 0.7400410175323486, "sampling/sampling_logp_difference/mean": 0.3611251711845398, "step": 511, "step_time": 6.018266725986905 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.09375, "entropy": 3.082366555929184, "epoch": 0.00512, "grad_norm": 0.11164135485887527, "kl": 1.12628685683012, "learning_rate": 9.999895141190201e-06, "loss": -0.0029, "step": 512, "step_time": 3.541292205984064 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 424.9375, "completions/mean_terminated_length": 424.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.291973799467087, "epoch": 0.00513, "frac_reward_zero_std": 0.25, "grad_norm": 0.09151848405599594, "kl": 1.037492297589779, "learning_rate": 9.999894700146234e-06, "loss": 0.0011, "num_tokens": 10416830.0, "reward": 0.5536495447158813, "reward_std": 0.6800103783607483, "rewards/rollout_reward_func/mean": 0.5536495447158813, "rewards/rollout_reward_func/std": 0.9141905307769775, "sampling/importance_sampling_ratio/max": 0.5427253842353821, "sampling/importance_sampling_ratio/mean": 0.3060775399208069, "sampling/importance_sampling_ratio/min": 1.2336664892767524e-13, "sampling/sampling_logp_difference/max": 3.4227120876312256, "sampling/sampling_logp_difference/mean": 0.520469605922699, "step": 513, "step_time": 9.022958480018133 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 3.3078406155109406, "epoch": 0.00514, "grad_norm": 0.09447626024484634, "kl": 0.9994410499930382, "learning_rate": 9.999894258176692e-06, "loss": 0.001, "step": 514, "step_time": 4.842416961997515 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0182291679084301, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.02864583395421505, "completions/clipped_ratio": 0.03125, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 464.40625, "completions/mean_terminated_length": 478.8709411621094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1373082101345062, "epoch": 0.00515, "frac_reward_zero_std": 0.25, "grad_norm": 0.03848043829202652, "kl": 0.9270124211907387, "learning_rate": 9.999893815281578e-06, "loss": 0.007, "num_tokens": 10472326.0, "reward": 0.890926718711853, "reward_std": 0.007344684097915888, "rewards/rollout_reward_func/mean": 0.890926718711853, "rewards/rollout_reward_func/std": 0.415115088224411, "sampling/importance_sampling_ratio/max": 0.5446707606315613, "sampling/importance_sampling_ratio/mean": 0.3098738193511963, "sampling/importance_sampling_ratio/min": 4.530641195543694e-09, "sampling/sampling_logp_difference/max": 3.1674704551696777, "sampling/sampling_logp_difference/mean": 0.44676369428634644, "step": 515, "step_time": 9.299566722998861 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0182291679084301, "entropy": 3.158625155687332, "epoch": 0.00516, "grad_norm": 0.09324386715888977, "kl": 0.9272981062531471, "learning_rate": 9.999893371460891e-06, "loss": 0.0069, "step": 516, "step_time": 5.3055147570121335 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 282.3125, "completions/mean_terminated_length": 282.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.280706077814102, "epoch": 0.00517, "frac_reward_zero_std": 0.5, "grad_norm": 0.28251931071281433, "kl": 1.2915206030011177, "learning_rate": 9.99989292671463e-06, "loss": -0.0006, "num_tokens": 10521253.0, "reward": 0.7067481875419617, "reward_std": 0.46003013849258423, "rewards/rollout_reward_func/mean": 0.7067481875419617, "rewards/rollout_reward_func/std": 0.8371846079826355, "sampling/importance_sampling_ratio/max": 0.549605667591095, "sampling/importance_sampling_ratio/mean": 0.38589829206466675, "sampling/importance_sampling_ratio/min": 1.664903459013584e-27, "sampling/sampling_logp_difference/max": 11.806325912475586, "sampling/sampling_logp_difference/mean": 0.8054937720298767, "step": 517, "step_time": 8.72683486999449 }, { "clip_ratio/high_max": 0.021875000093132257, "clip_ratio/high_mean": 0.010937500046566129, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021354167023673654, "entropy": 3.321650892496109, "epoch": 0.00518, "grad_norm": 0.11687923967838287, "kl": 1.18633633852005, "learning_rate": 9.999892481042796e-06, "loss": -0.002, "step": 518, "step_time": 4.644697059018654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 466.9375, "completions/mean_terminated_length": 466.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0821089446544647, "epoch": 0.00519, "frac_reward_zero_std": 0.25, "grad_norm": 0.20012637972831726, "kl": 1.0596551448106766, "learning_rate": 9.99989203444539e-06, "loss": -0.0015, "num_tokens": 10577454.0, "reward": 0.3746010661125183, "reward_std": 0.35717058181762695, "rewards/rollout_reward_func/mean": 0.3746010661125183, "rewards/rollout_reward_func/std": 0.9391710758209229, "sampling/importance_sampling_ratio/max": 0.5396425127983093, "sampling/importance_sampling_ratio/mean": 0.32322192192077637, "sampling/importance_sampling_ratio/min": 0.15898817777633667, "sampling/sampling_logp_difference/max": 0.7468349933624268, "sampling/sampling_logp_difference/mean": 0.34915825724601746, "step": 519, "step_time": 9.359376095002517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0182291679084301, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.0182291679084301, "entropy": 3.1363045275211334, "epoch": 0.0052, "grad_norm": 0.16676631569862366, "kl": 1.0294494107365608, "learning_rate": 9.99989158692241e-06, "loss": -0.002, "step": 520, "step_time": 4.960159129004751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 591.59375, "completions/mean_terminated_length": 591.59375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "entropy": 3.7102965116500854, "epoch": 0.00521, "frac_reward_zero_std": 0.0, "grad_norm": 0.12432921677827835, "kl": 0.739802785217762, "learning_rate": 9.999891138473859e-06, "loss": -0.0088, "num_tokens": 10639705.0, "reward": 0.8410996198654175, "reward_std": 0.26828286051750183, "rewards/rollout_reward_func/mean": 0.8410996198654175, "rewards/rollout_reward_func/std": 0.759966254234314, "sampling/importance_sampling_ratio/max": 0.27947938442230225, "sampling/importance_sampling_ratio/mean": 0.20422083139419556, "sampling/importance_sampling_ratio/min": 3.098699963288709e-09, "sampling/sampling_logp_difference/max": 4.650428295135498, "sampling/sampling_logp_difference/mean": 0.5890487432479858, "step": 521, "step_time": 9.116484095997293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7554132640361786, "epoch": 0.00522, "grad_norm": 0.09734462946653366, "kl": 0.7375306785106659, "learning_rate": 9.999890689099736e-06, "loss": -0.0089, "step": 522, "step_time": 5.31291674500244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 569.8125, "completions/mean_terminated_length": 569.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.525417596101761, "epoch": 0.00523, "frac_reward_zero_std": 0.0, "grad_norm": 0.12713629007339478, "kl": 0.7021817155182362, "learning_rate": 9.999890238800038e-06, "loss": -0.0031, "num_tokens": 10698987.0, "reward": 0.6216514706611633, "reward_std": 0.6497147679328918, "rewards/rollout_reward_func/mean": 0.6216514706611633, "rewards/rollout_reward_func/std": 0.7694067358970642, "sampling/importance_sampling_ratio/max": 0.3471097946166992, "sampling/importance_sampling_ratio/mean": 0.2300793081521988, "sampling/importance_sampling_ratio/min": 0.0808006003499031, "sampling/sampling_logp_difference/max": 1.1159565448760986, "sampling/sampling_logp_difference/mean": 0.40382522344589233, "step": 523, "step_time": 9.512129691989685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.55534827709198, "epoch": 0.00524, "grad_norm": 0.11263661831617355, "kl": 0.7013384103775024, "learning_rate": 9.99988978757477e-06, "loss": -0.0035, "step": 524, "step_time": 4.842758164995757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 480.125, "completions/mean_terminated_length": 480.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.401820480823517, "epoch": 0.00525, "frac_reward_zero_std": 0.25, "grad_norm": 0.07327152788639069, "kl": 0.7102048769593239, "learning_rate": 9.99988933542393e-06, "loss": 0.0047, "num_tokens": 10752748.0, "reward": 0.08975844085216522, "reward_std": 0.293628990650177, "rewards/rollout_reward_func/mean": 0.08975844085216522, "rewards/rollout_reward_func/std": 0.7978200912475586, "sampling/importance_sampling_ratio/max": 0.5362763404846191, "sampling/importance_sampling_ratio/mean": 0.2873417139053345, "sampling/importance_sampling_ratio/min": 0.02723981812596321, "sampling/sampling_logp_difference/max": 1.2787657976150513, "sampling/sampling_logp_difference/mean": 0.4049839973449707, "step": 525, "step_time": 9.124434625999129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.454418361186981, "epoch": 0.00526, "grad_norm": 0.07181105762720108, "kl": 0.7047647312283516, "learning_rate": 9.999888882347517e-06, "loss": 0.0047, "step": 526, "step_time": 4.816773040001863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 216.65516662597656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.241218030452728, "epoch": 0.00527, "frac_reward_zero_std": 0.25, "grad_norm": 0.08291082829236984, "kl": 0.7009432427585125, "learning_rate": 9.999888428345532e-06, "loss": -0.0093, "num_tokens": 10797693.0, "reward": 0.6112561821937561, "reward_std": 0.40087783336639404, "rewards/rollout_reward_func/mean": 0.6112561821937561, "rewards/rollout_reward_func/std": 0.8814516067504883, "sampling/importance_sampling_ratio/max": 0.5349564552307129, "sampling/importance_sampling_ratio/mean": 0.3461892008781433, "sampling/importance_sampling_ratio/min": 1.0208807540643128e-12, "sampling/sampling_logp_difference/max": 4.6095051765441895, "sampling/sampling_logp_difference/mean": 0.7616840600967407, "step": 527, "step_time": 9.498372116002429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 4.253588438034058, "epoch": 0.00528, "grad_norm": 0.0787222757935524, "kl": 0.6960654482245445, "learning_rate": 9.999887973417974e-06, "loss": -0.0094, "step": 528, "step_time": 4.935818329999165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 696.65625, "completions/mean_terminated_length": 696.65625, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "entropy": 3.577283352613449, "epoch": 0.00529, "frac_reward_zero_std": 0.25, "grad_norm": 0.09305528551340103, "kl": 0.8267230466008186, "learning_rate": 9.999887517564846e-06, "loss": 0.0051, "num_tokens": 10861866.0, "reward": -0.004952743649482727, "reward_std": 0.15217405557632446, "rewards/rollout_reward_func/mean": -0.004952743649482727, "rewards/rollout_reward_func/std": 0.6424797773361206, "sampling/importance_sampling_ratio/max": 0.26061582565307617, "sampling/importance_sampling_ratio/mean": 0.2050025314092636, "sampling/importance_sampling_ratio/min": 0.08482960611581802, "sampling/sampling_logp_difference/max": 0.8591563701629639, "sampling/sampling_logp_difference/mean": 0.404995322227478, "step": 529, "step_time": 9.884136849992501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.593138635158539, "epoch": 0.0053, "grad_norm": 0.04494783282279968, "kl": 0.8256380781531334, "learning_rate": 9.999887060786147e-06, "loss": 0.0049, "step": 530, "step_time": 5.085601106991817 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 677.15625, "completions/mean_terminated_length": 677.15625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "entropy": 4.131992876529694, "epoch": 0.00531, "frac_reward_zero_std": 0.0, "grad_norm": 0.1441870629787445, "kl": 0.6787116229534149, "learning_rate": 9.999886603081875e-06, "loss": -0.0039, "num_tokens": 10925393.0, "reward": 0.37888625264167786, "reward_std": 0.5860776901245117, "rewards/rollout_reward_func/mean": 0.37888625264167786, "rewards/rollout_reward_func/std": 0.5791106224060059, "sampling/importance_sampling_ratio/max": 0.2700856626033783, "sampling/importance_sampling_ratio/mean": 0.16421779990196228, "sampling/importance_sampling_ratio/min": 5.73620421995269e-11, "sampling/sampling_logp_difference/max": 3.598773956298828, "sampling/sampling_logp_difference/mean": 0.5857592821121216, "step": 531, "step_time": 9.665654817981704 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.013194444589316845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03663194412365556, "entropy": 4.094708830118179, "epoch": 0.00532, "grad_norm": 0.06904593110084534, "kl": 0.6912289783358574, "learning_rate": 9.999886144452034e-06, "loss": -0.0044, "step": 532, "step_time": 4.997407133996603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 157.96875, "completions/mean_terminated_length": 157.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1171906888484955, "epoch": 0.00533, "frac_reward_zero_std": 0.75, "grad_norm": 0.04004707559943199, "kl": 0.7582100331783295, "learning_rate": 9.999885684896619e-06, "loss": -0.0031, "num_tokens": 10966395.0, "reward": 0.9450846910476685, "reward_std": 0.2476264387369156, "rewards/rollout_reward_func/mean": 0.9450846910476685, "rewards/rollout_reward_func/std": 0.5452725887298584, "sampling/importance_sampling_ratio/max": 0.5385449528694153, "sampling/importance_sampling_ratio/mean": 0.43648988008499146, "sampling/importance_sampling_ratio/min": 0.053222283720970154, "sampling/sampling_logp_difference/max": 1.252476453781128, "sampling/sampling_logp_difference/mean": 0.3719843924045563, "step": 533, "step_time": 8.045862163002312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1103139519691467, "epoch": 0.00534, "grad_norm": 0.036359336227178574, "kl": 0.7593566849827766, "learning_rate": 9.999885224415634e-06, "loss": -0.0031, "step": 534, "step_time": 4.29907017898222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 363.375, "completions/mean_terminated_length": 363.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.254286140203476, "epoch": 0.00535, "frac_reward_zero_std": 0.75, "grad_norm": 0.03760892525315285, "kl": 0.8554259091615677, "learning_rate": 9.999884763009078e-06, "loss": -0.001, "num_tokens": 11017236.0, "reward": 0.0943162739276886, "reward_std": 0.00945363100618124, "rewards/rollout_reward_func/mean": 0.0943162739276886, "rewards/rollout_reward_func/std": 0.7916090488433838, "sampling/importance_sampling_ratio/max": 0.5395910739898682, "sampling/importance_sampling_ratio/mean": 0.36150872707366943, "sampling/importance_sampling_ratio/min": 0.004068281035870314, "sampling/sampling_logp_difference/max": 2.9696943759918213, "sampling/sampling_logp_difference/mean": 0.38337305188179016, "step": 535, "step_time": 9.832284621981671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2540391385555267, "epoch": 0.00536, "grad_norm": 0.03634868562221527, "kl": 0.8537392169237137, "learning_rate": 9.99988430067695e-06, "loss": -0.001, "step": 536, "step_time": 5.02504537000641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 333.0625, "completions/mean_terminated_length": 333.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0909324288368225, "epoch": 0.00537, "frac_reward_zero_std": 0.5, "grad_norm": 0.054060813039541245, "kl": 0.9362387210130692, "learning_rate": 9.999883837419253e-06, "loss": -0.001, "num_tokens": 11065025.0, "reward": 0.8968298435211182, "reward_std": 0.018232738599181175, "rewards/rollout_reward_func/mean": 0.8968298435211182, "rewards/rollout_reward_func/std": 0.4268321692943573, "sampling/importance_sampling_ratio/max": 0.5381277799606323, "sampling/importance_sampling_ratio/mean": 0.3785022497177124, "sampling/importance_sampling_ratio/min": 0.12464597076177597, "sampling/sampling_logp_difference/max": 0.879535973072052, "sampling/sampling_logp_difference/mean": 0.35363829135894775, "step": 537, "step_time": 8.582896344996698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.079275071620941, "epoch": 0.00538, "grad_norm": 0.026890438050031662, "kl": 0.9365753382444382, "learning_rate": 9.999883373235985e-06, "loss": -0.0012, "step": 538, "step_time": 4.779019035006058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 644.5, "completions/mean_terminated_length": 644.5, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 3.450021266937256, "epoch": 0.00539, "frac_reward_zero_std": 0.25, "grad_norm": 0.05533406883478165, "kl": 0.6697255745530128, "learning_rate": 9.999882908127145e-06, "loss": -0.0032, "num_tokens": 11127413.0, "reward": 0.39789289236068726, "reward_std": 0.3807985186576843, "rewards/rollout_reward_func/mean": 0.39789289236068726, "rewards/rollout_reward_func/std": 0.7814945578575134, "sampling/importance_sampling_ratio/max": 0.2718856632709503, "sampling/importance_sampling_ratio/mean": 0.22699204087257385, "sampling/importance_sampling_ratio/min": 1.4613321403650348e-12, "sampling/sampling_logp_difference/max": 3.553898811340332, "sampling/sampling_logp_difference/mean": 0.4974539279937744, "step": 539, "step_time": 9.65495280399773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.451335519552231, "epoch": 0.0054, "grad_norm": 0.05680834501981735, "kl": 0.6691960170865059, "learning_rate": 9.999882442092736e-06, "loss": -0.0033, "step": 540, "step_time": 4.867361263008206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 566.53125, "completions/mean_terminated_length": 570.1290283203125, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "entropy": 3.2480918169021606, "epoch": 0.00541, "frac_reward_zero_std": 0.25, "grad_norm": 0.03504755347967148, "kl": 0.7403276711702347, "learning_rate": 9.999881975132757e-06, "loss": -0.0026, "num_tokens": 11188144.0, "reward": 1.0973725318908691, "reward_std": 0.13891802728176117, "rewards/rollout_reward_func/mean": 1.0973725318908691, "rewards/rollout_reward_func/std": 0.291280061006546, "sampling/importance_sampling_ratio/max": 0.27790266275405884, "sampling/importance_sampling_ratio/mean": 0.2343149334192276, "sampling/importance_sampling_ratio/min": 1.4811935011849187e-09, "sampling/sampling_logp_difference/max": 3.4190869331359863, "sampling/sampling_logp_difference/mean": 0.4485277235507965, "step": 541, "step_time": 9.623201065987814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.237303674221039, "epoch": 0.00542, "grad_norm": 0.0367751270532608, "kl": 0.7394243627786636, "learning_rate": 9.999881507247207e-06, "loss": -0.0027, "step": 542, "step_time": 4.963285401012399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 314.46875, "completions/mean_terminated_length": 324.0967712402344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.680068075656891, "epoch": 0.00543, "frac_reward_zero_std": 0.25, "grad_norm": 0.0408039316534996, "kl": 0.8024672642350197, "learning_rate": 9.999881038436085e-06, "loss": -0.0035, "num_tokens": 11235203.0, "reward": 0.22821290791034698, "reward_std": 0.151401549577713, "rewards/rollout_reward_func/mean": 0.22821290791034698, "rewards/rollout_reward_func/std": 0.9707299470901489, "sampling/importance_sampling_ratio/max": 0.5399371981620789, "sampling/importance_sampling_ratio/mean": 0.33855560421943665, "sampling/importance_sampling_ratio/min": 5.955426581570611e-13, "sampling/sampling_logp_difference/max": 4.509763717651367, "sampling/sampling_logp_difference/mean": 0.6171887516975403, "step": 543, "step_time": 8.231659251992824 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.668300300836563, "epoch": 0.00544, "grad_norm": 0.043076545000076294, "kl": 0.8097511380910873, "learning_rate": 9.999880568699396e-06, "loss": -0.0035, "step": 544, "step_time": 4.999181729996053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.03125, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 575.28125, "completions/mean_terminated_length": 573.51611328125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 3.4427431225776672, "epoch": 0.00545, "frac_reward_zero_std": 0.0, "grad_norm": 0.04301704838871956, "kl": 0.8026700615882874, "learning_rate": 9.999880098037136e-06, "loss": -0.0046, "num_tokens": 11296362.0, "reward": 1.1504747867584229, "reward_std": 0.10690892487764359, "rewards/rollout_reward_func/mean": 1.1504747867584229, "rewards/rollout_reward_func/std": 0.1801799088716507, "sampling/importance_sampling_ratio/max": 0.28041842579841614, "sampling/importance_sampling_ratio/mean": 0.23324745893478394, "sampling/importance_sampling_ratio/min": 7.60099883790157e-20, "sampling/sampling_logp_difference/max": 14.024909019470215, "sampling/sampling_logp_difference/mean": 0.6815534830093384, "step": 545, "step_time": 8.758025341005123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.439167708158493, "epoch": 0.00546, "grad_norm": 0.03612540662288666, "kl": 0.8038536533713341, "learning_rate": 9.999879626449306e-06, "loss": -0.0048, "step": 546, "step_time": 5.130789780989289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 340.8125, "completions/mean_terminated_length": 362.4666748046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.5948007106781006, "epoch": 0.00547, "frac_reward_zero_std": 0.25, "grad_norm": 0.04364689812064171, "kl": 0.8752921968698502, "learning_rate": 9.999879153935907e-06, "loss": -0.0065, "num_tokens": 11346939.0, "reward": 0.622852087020874, "reward_std": 0.2907658815383911, "rewards/rollout_reward_func/mean": 0.622852087020874, "rewards/rollout_reward_func/std": 0.6560578942298889, "sampling/importance_sampling_ratio/max": 0.539683997631073, "sampling/importance_sampling_ratio/mean": 0.346332311630249, "sampling/importance_sampling_ratio/min": 6.72333282797366e-12, "sampling/sampling_logp_difference/max": 3.3787567615509033, "sampling/sampling_logp_difference/mean": 0.5761181116104126, "step": 547, "step_time": 8.776257336008712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5798830687999725, "epoch": 0.00548, "grad_norm": 0.04288220405578613, "kl": 0.8766481578350067, "learning_rate": 9.99987868049694e-06, "loss": -0.0065, "step": 548, "step_time": 4.79451856900414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 525.4375, "completions/mean_terminated_length": 525.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.317091852426529, "epoch": 0.00549, "frac_reward_zero_std": 0.25, "grad_norm": 0.06796661764383316, "kl": 0.8447726480662823, "learning_rate": 9.9998782061324e-06, "loss": 0.0035, "num_tokens": 11405133.0, "reward": -0.03595537692308426, "reward_std": 0.15363110601902008, "rewards/rollout_reward_func/mean": -0.03595537692308426, "rewards/rollout_reward_func/std": 0.8786155581474304, "sampling/importance_sampling_ratio/max": 0.5339558124542236, "sampling/importance_sampling_ratio/mean": 0.29919177293777466, "sampling/importance_sampling_ratio/min": 0.16903552412986755, "sampling/sampling_logp_difference/max": 0.8107098340988159, "sampling/sampling_logp_difference/mean": 0.362244188785553, "step": 549, "step_time": 9.275672664007288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3186263740062714, "epoch": 0.0055, "grad_norm": 0.07158660888671875, "kl": 0.8382585272192955, "learning_rate": 9.999877730842293e-06, "loss": 0.0035, "step": 550, "step_time": 5.475729698002397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 162.9375, "completions/mean_terminated_length": 146.22579956054688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2112388610839844, "epoch": 0.00551, "frac_reward_zero_std": 0.75, "grad_norm": 0.04586518555879593, "kl": 0.7114874571561813, "learning_rate": 9.999877254626616e-06, "loss": -0.0012, "num_tokens": 11449355.0, "reward": -0.1704714298248291, "reward_std": 0.16425290703773499, "rewards/rollout_reward_func/mean": -0.1704714298248291, "rewards/rollout_reward_func/std": 0.9804152846336365, "sampling/importance_sampling_ratio/max": 0.5440492033958435, "sampling/importance_sampling_ratio/mean": 0.43813756108283997, "sampling/importance_sampling_ratio/min": 1.6489259610508178e-11, "sampling/sampling_logp_difference/max": 4.381749629974365, "sampling/sampling_logp_difference/mean": 0.49383842945098877, "step": 551, "step_time": 8.58473001200764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2099355161190033, "epoch": 0.00552, "grad_norm": 0.047630809247493744, "kl": 0.7108201310038567, "learning_rate": 9.99987677748537e-06, "loss": -0.0012, "step": 552, "step_time": 5.211167568995734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 483.65625, "completions/mean_terminated_length": 476.83868408203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.708524703979492, "epoch": 0.00553, "frac_reward_zero_std": 0.25, "grad_norm": 0.07147859036922455, "kl": 1.0597890056669712, "learning_rate": 9.999876299418556e-06, "loss": 0.001, "num_tokens": 11505118.0, "reward": 0.3922448754310608, "reward_std": 0.2937415838241577, "rewards/rollout_reward_func/mean": 0.3922448754310608, "rewards/rollout_reward_func/std": 0.8704720139503479, "sampling/importance_sampling_ratio/max": 0.5475221276283264, "sampling/importance_sampling_ratio/mean": 0.2842029333114624, "sampling/importance_sampling_ratio/min": 5.9542822061775524e-12, "sampling/sampling_logp_difference/max": 3.427743673324585, "sampling/sampling_logp_difference/mean": 0.6015609502792358, "step": 553, "step_time": 9.062727975993766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6992914974689484, "epoch": 0.00554, "grad_norm": 0.0691293329000473, "kl": 1.0478884801268578, "learning_rate": 9.999875820426172e-06, "loss": 0.0009, "step": 554, "step_time": 4.86232859800657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 500.78125, "completions/mean_terminated_length": 500.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8506035208702087, "epoch": 0.00555, "frac_reward_zero_std": 0.5, "grad_norm": 0.0715814009308815, "kl": 0.6689929701387882, "learning_rate": 9.999875340508221e-06, "loss": -0.0041, "num_tokens": 11560262.0, "reward": 0.6329653859138489, "reward_std": 0.16645729541778564, "rewards/rollout_reward_func/mean": 0.6329653859138489, "rewards/rollout_reward_func/std": 0.5678977370262146, "sampling/importance_sampling_ratio/max": 0.5451715588569641, "sampling/importance_sampling_ratio/mean": 0.27244120836257935, "sampling/importance_sampling_ratio/min": 3.474783395046588e-13, "sampling/sampling_logp_difference/max": 3.379624128341675, "sampling/sampling_logp_difference/mean": 0.7268892526626587, "step": 555, "step_time": 9.4216969889967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.8326282501220703, "epoch": 0.00556, "grad_norm": 0.028389107435941696, "kl": 0.6788825653493404, "learning_rate": 9.999874859664698e-06, "loss": -0.0042, "step": 556, "step_time": 5.451977324009931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 372.4375, "completions/mean_terminated_length": 372.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2346511781215668, "epoch": 0.00557, "frac_reward_zero_std": 0.5, "grad_norm": 0.03012966364622116, "kl": 0.9517968446016312, "learning_rate": 9.99987437789561e-06, "loss": -0.0009, "num_tokens": 11610560.0, "reward": 0.5088340044021606, "reward_std": 0.1646028459072113, "rewards/rollout_reward_func/mean": 0.5088340044021606, "rewards/rollout_reward_func/std": 0.7028384804725647, "sampling/importance_sampling_ratio/max": 0.5468950867652893, "sampling/importance_sampling_ratio/mean": 0.37107253074645996, "sampling/importance_sampling_ratio/min": 0.0029114759527146816, "sampling/sampling_logp_difference/max": 3.8258938789367676, "sampling/sampling_logp_difference/mean": 0.39607304334640503, "step": 557, "step_time": 8.800960156993824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.207374334335327, "epoch": 0.00558, "grad_norm": 0.02875836379826069, "kl": 0.954418309032917, "learning_rate": 9.999873895200953e-06, "loss": -0.001, "step": 558, "step_time": 5.356925909996789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 577.9375, "completions/mean_terminated_length": 577.9375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "entropy": 3.2717504799365997, "epoch": 0.00559, "frac_reward_zero_std": 0.25, "grad_norm": 0.13474398851394653, "kl": 0.8028221353888512, "learning_rate": 9.999873411580727e-06, "loss": 0.0053, "num_tokens": 11670704.0, "reward": 0.6346140503883362, "reward_std": 0.3623615503311157, "rewards/rollout_reward_func/mean": 0.6346140503883362, "rewards/rollout_reward_func/std": 0.7594017386436462, "sampling/importance_sampling_ratio/max": 0.28571653366088867, "sampling/importance_sampling_ratio/mean": 0.23207969963550568, "sampling/importance_sampling_ratio/min": 1.2084337974727077e-08, "sampling/sampling_logp_difference/max": 2.8109309673309326, "sampling/sampling_logp_difference/mean": 0.4339717626571655, "step": 559, "step_time": 9.347196489012276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.247618645429611, "epoch": 0.0056, "grad_norm": 0.08945304155349731, "kl": 0.804155707359314, "learning_rate": 9.999872927034932e-06, "loss": 0.0053, "step": 560, "step_time": 4.975007912020374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 299.875, "completions/mean_terminated_length": 299.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.900695025920868, "epoch": 0.00561, "frac_reward_zero_std": 1.0, "grad_norm": 0.002051079645752907, "kl": 0.9505590125918388, "learning_rate": 9.99987244156357e-06, "loss": 0.0019, "num_tokens": 11716841.0, "reward": 0.9016008377075195, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9016008377075195, "rewards/rollout_reward_func/std": 0.42066892981529236, "sampling/importance_sampling_ratio/max": 0.5445223450660706, "sampling/importance_sampling_ratio/mean": 0.3952590823173523, "sampling/importance_sampling_ratio/min": 0.219766765832901, "sampling/sampling_logp_difference/max": 0.72977215051651, "sampling/sampling_logp_difference/mean": 0.32739686965942383, "step": 561, "step_time": 8.895676218009612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.902265429496765, "epoch": 0.00562, "grad_norm": 0.002074701711535454, "kl": 0.9502061456441879, "learning_rate": 9.999871955166642e-06, "loss": 0.0019, "step": 562, "step_time": 5.116047944007732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 600.15625, "completions/mean_terminated_length": 600.15625, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "entropy": 3.4280565083026886, "epoch": 0.00563, "frac_reward_zero_std": 0.0, "grad_norm": 0.052320607006549835, "kl": 0.8824390470981598, "learning_rate": 9.999871467844145e-06, "loss": -0.0039, "num_tokens": 11778766.0, "reward": 0.7768791317939758, "reward_std": 0.19779768586158752, "rewards/rollout_reward_func/mean": 0.7768791317939758, "rewards/rollout_reward_func/std": 0.7527104616165161, "sampling/importance_sampling_ratio/max": 0.3183054029941559, "sampling/importance_sampling_ratio/mean": 0.2452814280986786, "sampling/importance_sampling_ratio/min": 2.171227065327075e-13, "sampling/sampling_logp_difference/max": 3.4445459842681885, "sampling/sampling_logp_difference/mean": 0.5490127801895142, "step": 563, "step_time": 9.22038819998852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4281090199947357, "epoch": 0.00564, "grad_norm": 0.05109810456633568, "kl": 0.8808858618140221, "learning_rate": 9.999870979596079e-06, "loss": -0.0041, "step": 564, "step_time": 5.267476904999057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 453.15625, "completions/mean_terminated_length": 467.258056640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.086675703525543, "epoch": 0.00565, "frac_reward_zero_std": 0.0, "grad_norm": 0.022755127400159836, "kl": 0.7451036795973778, "learning_rate": 9.999870490422448e-06, "loss": -0.0036, "num_tokens": 11833370.0, "reward": 0.77068692445755, "reward_std": 0.34387868642807007, "rewards/rollout_reward_func/mean": 0.77068692445755, "rewards/rollout_reward_func/std": 0.7987755537033081, "sampling/importance_sampling_ratio/max": 0.5426323413848877, "sampling/importance_sampling_ratio/mean": 0.3162575364112854, "sampling/importance_sampling_ratio/min": 1.2646667046922744e-10, "sampling/sampling_logp_difference/max": 2.9716956615448, "sampling/sampling_logp_difference/mean": 0.4711536169052124, "step": 565, "step_time": 9.024141837995558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0864274501800537, "epoch": 0.00566, "grad_norm": 0.02385132573544979, "kl": 0.7427542507648468, "learning_rate": 9.999870000323247e-06, "loss": -0.0037, "step": 566, "step_time": 4.865025808998325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 632.1875, "completions/mean_terminated_length": 632.1875, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 3.7451277375221252, "epoch": 0.00567, "frac_reward_zero_std": 0.0, "grad_norm": 0.09980414807796478, "kl": 0.6609785184264183, "learning_rate": 9.99986950929848e-06, "loss": -0.0016, "num_tokens": 11896242.0, "reward": 0.5980760455131531, "reward_std": 0.3094111680984497, "rewards/rollout_reward_func/mean": 0.5980760455131531, "rewards/rollout_reward_func/std": 0.48978012800216675, "sampling/importance_sampling_ratio/max": 0.2899162471294403, "sampling/importance_sampling_ratio/mean": 0.20933428406715393, "sampling/importance_sampling_ratio/min": 4.632957666323486e-15, "sampling/sampling_logp_difference/max": 4.042977333068848, "sampling/sampling_logp_difference/mean": 0.6070027351379395, "step": 567, "step_time": 10.057956393997301 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.75368332862854, "epoch": 0.00568, "grad_norm": 0.05205734819173813, "kl": 0.6580899767577648, "learning_rate": 9.999869017348145e-06, "loss": -0.0018, "step": 568, "step_time": 4.983557225998084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 368.75, "completions/mean_terminated_length": 380.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.182327061891556, "epoch": 0.00569, "frac_reward_zero_std": 0.25, "grad_norm": 0.10209041833877563, "kl": 0.8825154155492783, "learning_rate": 9.999868524472245e-06, "loss": -0.0008, "num_tokens": 11947276.0, "reward": 0.7320734858512878, "reward_std": 0.335682213306427, "rewards/rollout_reward_func/mean": 0.7320734858512878, "rewards/rollout_reward_func/std": 0.6366477012634277, "sampling/importance_sampling_ratio/max": 0.5450370907783508, "sampling/importance_sampling_ratio/mean": 0.37705549597740173, "sampling/importance_sampling_ratio/min": 4.6170362111297436e-07, "sampling/sampling_logp_difference/max": 2.6850204467773438, "sampling/sampling_logp_difference/mean": 0.4215684235095978, "step": 569, "step_time": 9.274398186978942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.199253350496292, "epoch": 0.0057, "grad_norm": 0.12990239262580872, "kl": 0.8819719702005386, "learning_rate": 9.999868030670776e-06, "loss": -0.0015, "step": 570, "step_time": 5.481604211003287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.016741071827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016741071827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 516.3125, "completions/mean_terminated_length": 516.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7401595413684845, "epoch": 0.00571, "frac_reward_zero_std": 0.0, "grad_norm": 0.07936757057905197, "kl": 0.7713387161493301, "learning_rate": 9.99986753594374e-06, "loss": -0.0063, "num_tokens": 12003163.0, "reward": 0.4886443018913269, "reward_std": 0.16785560548305511, "rewards/rollout_reward_func/mean": 0.4886443018913269, "rewards/rollout_reward_func/std": 0.47033384442329407, "sampling/importance_sampling_ratio/max": 0.5432619452476501, "sampling/importance_sampling_ratio/mean": 0.266579270362854, "sampling/importance_sampling_ratio/min": 2.93051368761843e-12, "sampling/sampling_logp_difference/max": 4.872079372406006, "sampling/sampling_logp_difference/mean": 0.5763254165649414, "step": 571, "step_time": 9.171812043998216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.7783912420272827, "epoch": 0.00572, "grad_norm": 0.07310974597930908, "kl": 0.7684896737337112, "learning_rate": 9.99986704029114e-06, "loss": -0.0064, "step": 572, "step_time": 4.91624029599916 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 646.03125, "completions/mean_terminated_length": 646.03125, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "entropy": 3.7885546684265137, "epoch": 0.00573, "frac_reward_zero_std": 0.0, "grad_norm": 0.06741020828485489, "kl": 0.886363185942173, "learning_rate": 9.99986654371297e-06, "loss": -0.0069, "num_tokens": 12064406.0, "reward": 0.4589614272117615, "reward_std": 0.47261661291122437, "rewards/rollout_reward_func/mean": 0.4589614272117615, "rewards/rollout_reward_func/std": 0.7026260495185852, "sampling/importance_sampling_ratio/max": 0.2718486785888672, "sampling/importance_sampling_ratio/mean": 0.19462475180625916, "sampling/importance_sampling_ratio/min": 9.574414178148416e-11, "sampling/sampling_logp_difference/max": 4.549474716186523, "sampling/sampling_logp_difference/mean": 0.6040598154067993, "step": 573, "step_time": 9.832897129010234 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.809591442346573, "epoch": 0.00574, "grad_norm": 0.06523196399211884, "kl": 0.8853109627962112, "learning_rate": 9.999866046209236e-06, "loss": -0.007, "step": 574, "step_time": 4.6931510669892305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 399.5, "completions/mean_terminated_length": 399.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.326888620853424, "epoch": 0.00575, "frac_reward_zero_std": 0.25, "grad_norm": 0.33782365918159485, "kl": 0.8884425386786461, "learning_rate": 9.999865547779934e-06, "loss": -0.008, "num_tokens": 12117962.0, "reward": 0.5053281784057617, "reward_std": 0.1367589384317398, "rewards/rollout_reward_func/mean": 0.5053281784057617, "rewards/rollout_reward_func/std": 0.9350661039352417, "sampling/importance_sampling_ratio/max": 0.665722668170929, "sampling/importance_sampling_ratio/mean": 0.30561643838882446, "sampling/importance_sampling_ratio/min": 6.451402346369095e-12, "sampling/sampling_logp_difference/max": 3.8261919021606445, "sampling/sampling_logp_difference/mean": 0.4882638454437256, "step": 575, "step_time": 8.791466881004453 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 3.32874995470047, "epoch": 0.00576, "grad_norm": 0.0311047974973917, "kl": 0.8819375485181808, "learning_rate": 9.999865048425068e-06, "loss": -0.0088, "step": 576, "step_time": 4.4919604339957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 325.40625, "completions/mean_terminated_length": 325.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1357433199882507, "epoch": 0.00577, "frac_reward_zero_std": 0.25, "grad_norm": 0.13640207052230835, "kl": 0.7020392939448357, "learning_rate": 9.999864548144636e-06, "loss": -0.0032, "num_tokens": 12166989.0, "reward": 0.9858775734901428, "reward_std": 0.13340845704078674, "rewards/rollout_reward_func/mean": 0.9858775734901428, "rewards/rollout_reward_func/std": 0.3418152928352356, "sampling/importance_sampling_ratio/max": 0.5520761609077454, "sampling/importance_sampling_ratio/mean": 0.37231922149658203, "sampling/importance_sampling_ratio/min": 0.0002687709347810596, "sampling/sampling_logp_difference/max": 4.36182165145874, "sampling/sampling_logp_difference/mean": 0.41101720929145813, "step": 577, "step_time": 8.547699291004392 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 3.1391320526599884, "epoch": 0.00578, "grad_norm": 0.059301137924194336, "kl": 0.6969775930047035, "learning_rate": 9.999864046938636e-06, "loss": -0.0035, "step": 578, "step_time": 4.728829713989398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 352.28125, "completions/mean_terminated_length": 352.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4145836532115936, "epoch": 0.00579, "frac_reward_zero_std": 0.75, "grad_norm": 0.0266743041574955, "kl": 0.8867960721254349, "learning_rate": 9.999863544807073e-06, "loss": 0.0005, "num_tokens": 12218831.0, "reward": 0.3548852801322937, "reward_std": 9.809326729737222e-05, "rewards/rollout_reward_func/mean": 0.3548852801322937, "rewards/rollout_reward_func/std": 0.9124224185943604, "sampling/importance_sampling_ratio/max": 0.5488147735595703, "sampling/importance_sampling_ratio/mean": 0.36139214038848877, "sampling/importance_sampling_ratio/min": 0.06662654876708984, "sampling/sampling_logp_difference/max": 1.210779070854187, "sampling/sampling_logp_difference/mean": 0.40749484300613403, "step": 579, "step_time": 8.780049725995923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.439604341983795, "epoch": 0.0058, "grad_norm": 0.02857833541929722, "kl": 0.8863693922758102, "learning_rate": 9.999863041749942e-06, "loss": 0.0006, "step": 580, "step_time": 4.710396276983374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 610.375, "completions/mean_terminated_length": 610.375, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "entropy": 3.4241095781326294, "epoch": 0.00581, "frac_reward_zero_std": 0.0, "grad_norm": 0.06198433041572571, "kl": 0.8005321994423866, "learning_rate": 9.999862537767247e-06, "loss": -0.0065, "num_tokens": 12280561.0, "reward": 0.7077672481536865, "reward_std": 0.5371489524841309, "rewards/rollout_reward_func/mean": 0.7077672481536865, "rewards/rollout_reward_func/std": 0.7535967826843262, "sampling/importance_sampling_ratio/max": 0.274836003780365, "sampling/importance_sampling_ratio/mean": 0.22067110240459442, "sampling/importance_sampling_ratio/min": 0.002305785659700632, "sampling/sampling_logp_difference/max": 4.285691261291504, "sampling/sampling_logp_difference/mean": 0.4116932153701782, "step": 581, "step_time": 9.470020722008485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4299002587795258, "epoch": 0.00582, "grad_norm": 0.059643253684043884, "kl": 0.800477534532547, "learning_rate": 9.999862032858985e-06, "loss": -0.0065, "step": 582, "step_time": 4.862543237002683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 546.3125, "completions/mean_terminated_length": 546.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7677793204784393, "epoch": 0.00583, "frac_reward_zero_std": 0.25, "grad_norm": 0.06375162303447723, "kl": 0.8232520148158073, "learning_rate": 9.999861527025157e-06, "loss": -0.0046, "num_tokens": 12337633.0, "reward": 0.4206044375896454, "reward_std": 0.4101543426513672, "rewards/rollout_reward_func/mean": 0.4206044375896454, "rewards/rollout_reward_func/std": 0.6934565901756287, "sampling/importance_sampling_ratio/max": 0.5535870790481567, "sampling/importance_sampling_ratio/mean": 0.2535381317138672, "sampling/importance_sampling_ratio/min": 0.003141178051009774, "sampling/sampling_logp_difference/max": 3.358602285385132, "sampling/sampling_logp_difference/mean": 0.4631754159927368, "step": 583, "step_time": 9.339125899001374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.80316561460495, "epoch": 0.00584, "grad_norm": 0.0652460977435112, "kl": 0.8228407576680183, "learning_rate": 9.999861020265767e-06, "loss": -0.0046, "step": 584, "step_time": 5.020135793994996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 635.96875, "completions/mean_terminated_length": 635.96875, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "entropy": 3.497039169073105, "epoch": 0.00585, "frac_reward_zero_std": 0.0, "grad_norm": 0.04191924259066582, "kl": 0.7654398642480373, "learning_rate": 9.999860512580808e-06, "loss": -0.0079, "num_tokens": 12400392.0, "reward": 0.2869776487350464, "reward_std": 0.7300951480865479, "rewards/rollout_reward_func/mean": 0.2869776487350464, "rewards/rollout_reward_func/std": 1.0246282815933228, "sampling/importance_sampling_ratio/max": 0.27421897649765015, "sampling/importance_sampling_ratio/mean": 0.21823135018348694, "sampling/importance_sampling_ratio/min": 0.05923137068748474, "sampling/sampling_logp_difference/max": 1.1477181911468506, "sampling/sampling_logp_difference/mean": 0.40208700299263, "step": 585, "step_time": 9.201943709005718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.53923037648201, "epoch": 0.00586, "grad_norm": 0.04107699543237686, "kl": 0.7649673894047737, "learning_rate": 9.999860003970287e-06, "loss": -0.008, "step": 586, "step_time": 4.730427692004014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 604.28125, "completions/mean_terminated_length": 604.28125, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "entropy": 3.6273307502269745, "epoch": 0.00587, "frac_reward_zero_std": 0.0, "grad_norm": 0.04576672613620758, "kl": 0.7018714174628258, "learning_rate": 9.9998594944342e-06, "loss": -0.0155, "num_tokens": 12461423.0, "reward": 0.9688677787780762, "reward_std": 0.2563668489456177, "rewards/rollout_reward_func/mean": 0.9688677787780762, "rewards/rollout_reward_func/std": 0.4093657433986664, "sampling/importance_sampling_ratio/max": 0.27473434805870056, "sampling/importance_sampling_ratio/mean": 0.2161649465560913, "sampling/importance_sampling_ratio/min": 3.3859390985435667e-11, "sampling/sampling_logp_difference/max": 3.443336248397827, "sampling/sampling_logp_difference/mean": 0.5161484479904175, "step": 587, "step_time": 9.513277986989124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.652395725250244, "epoch": 0.00588, "grad_norm": 0.040538571774959564, "kl": 0.7004073485732079, "learning_rate": 9.99985898397255e-06, "loss": -0.0156, "step": 588, "step_time": 4.888612078997539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 505.15625, "completions/mean_terminated_length": 520.9354858398438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9136448800563812, "epoch": 0.00589, "frac_reward_zero_std": 0.25, "grad_norm": 0.023348290473222733, "kl": 0.8465990945696831, "learning_rate": 9.999858472585334e-06, "loss": -0.0112, "num_tokens": 12517373.0, "reward": 0.26617276668548584, "reward_std": 0.17590883374214172, "rewards/rollout_reward_func/mean": 0.26617276668548584, "rewards/rollout_reward_func/std": 0.8634560108184814, "sampling/importance_sampling_ratio/max": 0.5505878925323486, "sampling/importance_sampling_ratio/mean": 0.259355366230011, "sampling/importance_sampling_ratio/min": 1.1348534721818907e-14, "sampling/sampling_logp_difference/max": 3.4402661323547363, "sampling/sampling_logp_difference/mean": 0.6953409910202026, "step": 589, "step_time": 9.345711817994015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9256609976291656, "epoch": 0.0059, "grad_norm": 0.022495362907648087, "kl": 0.8432410359382629, "learning_rate": 9.999857960272553e-06, "loss": -0.0112, "step": 590, "step_time": 5.418765393005742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 444.1875, "completions/mean_terminated_length": 444.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.370278239250183, "epoch": 0.00591, "frac_reward_zero_std": 0.25, "grad_norm": 0.06599407643079758, "kl": 0.7717240378260612, "learning_rate": 9.99985744703421e-06, "loss": -0.0038, "num_tokens": 12572095.0, "reward": 0.011392280459403992, "reward_std": 0.3917378783226013, "rewards/rollout_reward_func/mean": 0.011392280459403992, "rewards/rollout_reward_func/std": 1.0571171045303345, "sampling/importance_sampling_ratio/max": 0.5341386198997498, "sampling/importance_sampling_ratio/mean": 0.2914888262748718, "sampling/importance_sampling_ratio/min": 0.062013085931539536, "sampling/sampling_logp_difference/max": 1.2803585529327393, "sampling/sampling_logp_difference/mean": 0.38916629552841187, "step": 591, "step_time": 8.715265595004894 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.376244366168976, "epoch": 0.00592, "grad_norm": 0.047404926270246506, "kl": 0.7724314853549004, "learning_rate": 9.9998569328703e-06, "loss": -0.004, "step": 592, "step_time": 4.739556611988519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 545.875, "completions/mean_terminated_length": 545.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8933197259902954, "epoch": 0.00593, "frac_reward_zero_std": 0.25, "grad_norm": 0.054049052298069, "kl": 0.6267417594790459, "learning_rate": 9.99985641778083e-06, "loss": -0.0003, "num_tokens": 12631363.0, "reward": 0.324940949678421, "reward_std": 0.4895850718021393, "rewards/rollout_reward_func/mean": 0.324940949678421, "rewards/rollout_reward_func/std": 0.812134861946106, "sampling/importance_sampling_ratio/max": 0.5416864156723022, "sampling/importance_sampling_ratio/mean": 0.22619357705116272, "sampling/importance_sampling_ratio/min": 0.01866179332137108, "sampling/sampling_logp_difference/max": 2.4141035079956055, "sampling/sampling_logp_difference/mean": 0.4826337695121765, "step": 593, "step_time": 9.503242550003051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9034979045391083, "epoch": 0.00594, "grad_norm": 0.047449320554733276, "kl": 0.630185455083847, "learning_rate": 9.999855901765791e-06, "loss": -0.0004, "step": 594, "step_time": 4.862826051001321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 500.40625, "completions/mean_terminated_length": 500.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4234394431114197, "epoch": 0.00595, "frac_reward_zero_std": 0.25, "grad_norm": 0.10441609472036362, "kl": 0.8172081559896469, "learning_rate": 9.99985538482519e-06, "loss": -0.0017, "num_tokens": 12688450.0, "reward": 0.5806695222854614, "reward_std": 0.3806608319282532, "rewards/rollout_reward_func/mean": 0.5806695222854614, "rewards/rollout_reward_func/std": 0.7926381826400757, "sampling/importance_sampling_ratio/max": 0.5541810393333435, "sampling/importance_sampling_ratio/mean": 0.2929795980453491, "sampling/importance_sampling_ratio/min": 0.1305004209280014, "sampling/sampling_logp_difference/max": 1.010361671447754, "sampling/sampling_logp_difference/mean": 0.38471806049346924, "step": 595, "step_time": 9.129630103991076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.41462242603302, "epoch": 0.00596, "grad_norm": 0.09685466438531876, "kl": 0.8183141946792603, "learning_rate": 9.999854866959026e-06, "loss": -0.0019, "step": 596, "step_time": 5.362688664004963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 458.21875, "completions/mean_terminated_length": 456.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6266486942768097, "epoch": 0.00597, "frac_reward_zero_std": 0.0, "grad_norm": 0.09323632717132568, "kl": 0.6350403428077698, "learning_rate": 9.999854348167299e-06, "loss": -0.0183, "num_tokens": 12744249.0, "reward": 1.0620932579040527, "reward_std": 0.23243926465511322, "rewards/rollout_reward_func/mean": 1.0620932579040527, "rewards/rollout_reward_func/std": 0.42877569794654846, "sampling/importance_sampling_ratio/max": 0.5519375205039978, "sampling/importance_sampling_ratio/mean": 0.282217413187027, "sampling/importance_sampling_ratio/min": 2.060176682028292e-11, "sampling/sampling_logp_difference/max": 4.3491339683532715, "sampling/sampling_logp_difference/mean": 0.6311613321304321, "step": 597, "step_time": 8.930548667005496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.596663624048233, "epoch": 0.00598, "grad_norm": 0.03068883903324604, "kl": 0.6474228091537952, "learning_rate": 9.999853828450009e-06, "loss": -0.0184, "step": 598, "step_time": 5.20313078899926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 444.84375, "completions/mean_terminated_length": 444.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.279623359441757, "epoch": 0.00599, "frac_reward_zero_std": 0.5, "grad_norm": 0.05724707245826721, "kl": 0.7982055693864822, "learning_rate": 9.999853307807155e-06, "loss": -0.0066, "num_tokens": 12799445.0, "reward": 0.45930978655815125, "reward_std": 0.3517864942550659, "rewards/rollout_reward_func/mean": 0.45930978655815125, "rewards/rollout_reward_func/std": 1.051370620727539, "sampling/importance_sampling_ratio/max": 0.5491923093795776, "sampling/importance_sampling_ratio/mean": 0.31745773553848267, "sampling/importance_sampling_ratio/min": 2.0786095067393262e-11, "sampling/sampling_logp_difference/max": 3.4802603721618652, "sampling/sampling_logp_difference/mean": 0.49184003472328186, "step": 599, "step_time": 8.8797565349887 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.2736749351024628, "epoch": 0.006, "grad_norm": 0.04258754849433899, "kl": 0.8016273826360703, "learning_rate": 9.999852786238737e-06, "loss": -0.0067, "step": 600, "step_time": 4.731075978008448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 433.96875, "completions/mean_terminated_length": 433.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.844575583934784, "epoch": 0.00601, "frac_reward_zero_std": 0.25, "grad_norm": 0.027940835803747177, "kl": 0.5789410509169102, "learning_rate": 9.999852263744758e-06, "loss": -0.0134, "num_tokens": 12853521.0, "reward": 0.879970908164978, "reward_std": 0.30713483691215515, "rewards/rollout_reward_func/mean": 0.879970908164978, "rewards/rollout_reward_func/std": 0.5514302253723145, "sampling/importance_sampling_ratio/max": 0.5512405633926392, "sampling/importance_sampling_ratio/mean": 0.27667236328125, "sampling/importance_sampling_ratio/min": 2.404101917541368e-11, "sampling/sampling_logp_difference/max": 4.504905700683594, "sampling/sampling_logp_difference/mean": 0.6757233142852783, "step": 601, "step_time": 9.1921541720003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.845778316259384, "epoch": 0.00602, "grad_norm": 0.029288312420248985, "kl": 0.5767678841948509, "learning_rate": 9.999851740325214e-06, "loss": -0.0135, "step": 602, "step_time": 4.766960698994808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 570.875, "completions/mean_terminated_length": 569.8064575195312, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "entropy": 3.9329707622528076, "epoch": 0.00603, "frac_reward_zero_std": 0.0, "grad_norm": 0.03680124878883362, "kl": 0.7155812755227089, "learning_rate": 9.999851215980108e-06, "loss": -0.0091, "num_tokens": 12912851.0, "reward": 0.19298337399959564, "reward_std": 0.4083167612552643, "rewards/rollout_reward_func/mean": 0.19298337399959564, "rewards/rollout_reward_func/std": 0.9874197840690613, "sampling/importance_sampling_ratio/max": 0.28533270955085754, "sampling/importance_sampling_ratio/mean": 0.1923900991678238, "sampling/importance_sampling_ratio/min": 5.987434072846076e-12, "sampling/sampling_logp_difference/max": 3.9272541999816895, "sampling/sampling_logp_difference/mean": 0.6023211479187012, "step": 603, "step_time": 8.690101213011076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9538289308547974, "epoch": 0.00604, "grad_norm": 0.03644717112183571, "kl": 0.7089653424918652, "learning_rate": 9.99985069070944e-06, "loss": -0.0091, "step": 604, "step_time": 5.095330302989169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.947944402694702, "epoch": 0.00605, "frac_reward_zero_std": 0.5, "grad_norm": 0.06702155619859695, "kl": 0.9423663318157196, "learning_rate": 9.999850164513208e-06, "loss": 0.0003, "num_tokens": 12962963.0, "reward": 0.7025525569915771, "reward_std": 0.12828543782234192, "rewards/rollout_reward_func/mean": 0.7025525569915771, "rewards/rollout_reward_func/std": 0.47168609499931335, "sampling/importance_sampling_ratio/max": 0.554027795791626, "sampling/importance_sampling_ratio/mean": 0.3206898868083954, "sampling/importance_sampling_ratio/min": 0.05848443880677223, "sampling/sampling_logp_difference/max": 1.3611018657684326, "sampling/sampling_logp_difference/mean": 0.5010882616043091, "step": 605, "step_time": 8.850898288997996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 3.994610995054245, "epoch": 0.00606, "grad_norm": 0.06061568111181259, "kl": 0.9099233448505402, "learning_rate": 9.999849637391415e-06, "loss": 0.0001, "step": 606, "step_time": 4.907044976003817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 614.8125, "completions/mean_terminated_length": 614.8125, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "entropy": 3.761177271604538, "epoch": 0.00607, "frac_reward_zero_std": 0.0, "grad_norm": 0.09139654785394669, "kl": 0.714824415743351, "learning_rate": 9.99984910934406e-06, "loss": -0.0083, "num_tokens": 13025185.0, "reward": 0.6801857352256775, "reward_std": 0.5206847190856934, "rewards/rollout_reward_func/mean": 0.6801857352256775, "rewards/rollout_reward_func/std": 0.70393967628479, "sampling/importance_sampling_ratio/max": 0.28726285696029663, "sampling/importance_sampling_ratio/mean": 0.19294407963752747, "sampling/importance_sampling_ratio/min": 0.00200445088557899, "sampling/sampling_logp_difference/max": 4.469278335571289, "sampling/sampling_logp_difference/mean": 0.4673677682876587, "step": 607, "step_time": 9.973039321004762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.800934821367264, "epoch": 0.00608, "grad_norm": 0.08713239431381226, "kl": 0.7115197218954563, "learning_rate": 9.999848580371143e-06, "loss": -0.0084, "step": 608, "step_time": 4.958952579007018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 498.875, "completions/mean_terminated_length": 490.2903137207031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4859661161899567, "epoch": 0.00609, "frac_reward_zero_std": 0.75, "grad_norm": 0.022100405767560005, "kl": 0.6791460365056992, "learning_rate": 9.999848050472662e-06, "loss": -0.0014, "num_tokens": 13081856.0, "reward": 0.3782007098197937, "reward_std": 0.012068275362253189, "rewards/rollout_reward_func/mean": 0.3782007098197937, "rewards/rollout_reward_func/std": 0.9399709701538086, "sampling/importance_sampling_ratio/max": 0.554354190826416, "sampling/importance_sampling_ratio/mean": 0.293942391872406, "sampling/importance_sampling_ratio/min": 4.044602708308531e-10, "sampling/sampling_logp_difference/max": 5.055522441864014, "sampling/sampling_logp_difference/mean": 0.5068832635879517, "step": 609, "step_time": 9.810215913006687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4778088331222534, "epoch": 0.0061, "grad_norm": 0.021114781498908997, "kl": 0.6800323724746704, "learning_rate": 9.99984751964862e-06, "loss": -0.0014, "step": 610, "step_time": 5.705629736010451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 335.4375, "completions/mean_terminated_length": 325.4193420410156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9261260330677032, "epoch": 0.00611, "frac_reward_zero_std": 0.5, "grad_norm": 0.01599167287349701, "kl": 0.7985474839806557, "learning_rate": 9.999846987899019e-06, "loss": -0.0024, "num_tokens": 13130238.0, "reward": 0.659497857093811, "reward_std": 0.10245107114315033, "rewards/rollout_reward_func/mean": 0.659497857093811, "rewards/rollout_reward_func/std": 0.8402972221374512, "sampling/importance_sampling_ratio/max": 0.5500983595848083, "sampling/importance_sampling_ratio/mean": 0.3386106789112091, "sampling/importance_sampling_ratio/min": 2.5874015928926963e-13, "sampling/sampling_logp_difference/max": 3.310086250305176, "sampling/sampling_logp_difference/mean": 0.7284255027770996, "step": 611, "step_time": 8.48638070000743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.916126698255539, "epoch": 0.00612, "grad_norm": 0.015606196597218513, "kl": 0.7995204702019691, "learning_rate": 9.999846455223852e-06, "loss": -0.0023, "step": 612, "step_time": 4.682552252001187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 587.1875, "completions/mean_terminated_length": 587.1875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 3.612683743238449, "epoch": 0.00613, "frac_reward_zero_std": 0.5, "grad_norm": 0.013576059602200985, "kl": 0.7001521624624729, "learning_rate": 9.999845921623126e-06, "loss": 0.0007, "num_tokens": 13191578.0, "reward": 0.9442769289016724, "reward_std": 0.01034801546484232, "rewards/rollout_reward_func/mean": 0.9442769289016724, "rewards/rollout_reward_func/std": 0.4394179582595825, "sampling/importance_sampling_ratio/max": 0.29488444328308105, "sampling/importance_sampling_ratio/mean": 0.2211872935295105, "sampling/importance_sampling_ratio/min": 4.944247442018046e-11, "sampling/sampling_logp_difference/max": 3.3406660556793213, "sampling/sampling_logp_difference/mean": 0.4996758699417114, "step": 613, "step_time": 9.872310187995026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.589840978384018, "epoch": 0.00614, "grad_norm": 0.013203972019255161, "kl": 0.7027223333716393, "learning_rate": 9.999845387096839e-06, "loss": 0.0007, "step": 614, "step_time": 5.04751504300657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 505.78125, "completions/mean_terminated_length": 500.5483703613281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.479338079690933, "epoch": 0.00615, "frac_reward_zero_std": 0.25, "grad_norm": 0.04712370038032532, "kl": 0.6566323526203632, "learning_rate": 9.99984485164499e-06, "loss": -0.0043, "num_tokens": 13248105.0, "reward": 0.5061022043228149, "reward_std": 0.2725616693496704, "rewards/rollout_reward_func/mean": 0.5061022043228149, "rewards/rollout_reward_func/std": 0.5412116646766663, "sampling/importance_sampling_ratio/max": 0.5537191033363342, "sampling/importance_sampling_ratio/mean": 0.20619061589241028, "sampling/importance_sampling_ratio/min": 2.294268530679022e-22, "sampling/sampling_logp_difference/max": 4.73297119140625, "sampling/sampling_logp_difference/mean": 0.7545320987701416, "step": 615, "step_time": 9.709356386993022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.456086248159409, "epoch": 0.00616, "grad_norm": 0.046487729996442795, "kl": 0.6563984267413616, "learning_rate": 9.99984431526758e-06, "loss": -0.0044, "step": 616, "step_time": 4.80307078500482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 496.59375, "completions/mean_terminated_length": 496.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6945146322250366, "epoch": 0.00617, "frac_reward_zero_std": 0.5, "grad_norm": 0.019778870046138763, "kl": 0.717349961400032, "learning_rate": 9.99984377796461e-06, "loss": -0.005, "num_tokens": 13303531.0, "reward": 0.7694697380065918, "reward_std": 0.24989986419677734, "rewards/rollout_reward_func/mean": 0.7694697380065918, "rewards/rollout_reward_func/std": 0.5744161009788513, "sampling/importance_sampling_ratio/max": 0.5468647480010986, "sampling/importance_sampling_ratio/mean": 0.2613356113433838, "sampling/importance_sampling_ratio/min": 0.0007044858648441732, "sampling/sampling_logp_difference/max": 3.7495594024658203, "sampling/sampling_logp_difference/mean": 0.4987320899963379, "step": 617, "step_time": 9.127049751987215 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.674067884683609, "epoch": 0.00618, "grad_norm": 0.01853548362851143, "kl": 0.7162343412637711, "learning_rate": 9.999843239736079e-06, "loss": -0.0051, "step": 618, "step_time": 5.276139918998524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 284.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4392530620098114, "epoch": 0.00619, "frac_reward_zero_std": 0.5, "grad_norm": 0.052032485604286194, "kl": 0.6686298474669456, "learning_rate": 9.999842700581986e-06, "loss": -0.0047, "num_tokens": 13352320.0, "reward": 0.8926974534988403, "reward_std": 0.012146731838583946, "rewards/rollout_reward_func/mean": 0.8926974534988403, "rewards/rollout_reward_func/std": 0.41881462931632996, "sampling/importance_sampling_ratio/max": 0.5477025508880615, "sampling/importance_sampling_ratio/mean": 0.35916668176651, "sampling/importance_sampling_ratio/min": 1.5691068212975119e-18, "sampling/sampling_logp_difference/max": 14.294987678527832, "sampling/sampling_logp_difference/mean": 0.6777853965759277, "step": 619, "step_time": 8.52898875498795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.436995655298233, "epoch": 0.0062, "grad_norm": 0.05459887906908989, "kl": 0.6665188446640968, "learning_rate": 9.999842160502334e-06, "loss": -0.0047, "step": 620, "step_time": 4.6559945970075205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 297.34375, "completions/mean_terminated_length": 297.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.983904391527176, "epoch": 0.00621, "frac_reward_zero_std": 0.5, "grad_norm": 0.047272734344005585, "kl": 0.79878368973732, "learning_rate": 9.99984161949712e-06, "loss": -0.0066, "num_tokens": 13401538.0, "reward": 0.821666419506073, "reward_std": 0.4046993553638458, "rewards/rollout_reward_func/mean": 0.821666419506073, "rewards/rollout_reward_func/std": 0.7167912721633911, "sampling/importance_sampling_ratio/max": 0.5517788529396057, "sampling/importance_sampling_ratio/mean": 0.38719642162323, "sampling/importance_sampling_ratio/min": 0.003958861343562603, "sampling/sampling_logp_difference/max": 4.041252136230469, "sampling/sampling_logp_difference/mean": 0.3730758726596832, "step": 621, "step_time": 9.308180508996884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.989243507385254, "epoch": 0.00622, "grad_norm": 0.04679916799068451, "kl": 0.7968119978904724, "learning_rate": 9.999841077566347e-06, "loss": -0.0067, "step": 622, "step_time": 4.830014223007311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 631.9375, "completions/mean_terminated_length": 631.9375, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 3.852447807788849, "epoch": 0.00623, "frac_reward_zero_std": 0.0, "grad_norm": 0.0527174286544323, "kl": 0.6982860267162323, "learning_rate": 9.999840534710012e-06, "loss": -0.0212, "num_tokens": 13464288.0, "reward": 0.7582293748855591, "reward_std": 0.40961188077926636, "rewards/rollout_reward_func/mean": 0.7582293748855591, "rewards/rollout_reward_func/std": 0.5125489234924316, "sampling/importance_sampling_ratio/max": 0.29112839698791504, "sampling/importance_sampling_ratio/mean": 0.1873045414686203, "sampling/importance_sampling_ratio/min": 3.476203699892544e-12, "sampling/sampling_logp_difference/max": 4.8940629959106445, "sampling/sampling_logp_difference/mean": 0.5793657302856445, "step": 623, "step_time": 9.267164005002996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8814360797405243, "epoch": 0.00624, "grad_norm": 0.04808132350444794, "kl": 0.6999122649431229, "learning_rate": 9.99983999092812e-06, "loss": -0.0213, "step": 624, "step_time": 5.426520029002859 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 345.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.542742758989334, "epoch": 0.00625, "frac_reward_zero_std": 0.5, "grad_norm": 0.046692319214344025, "kl": 0.7250601351261139, "learning_rate": 9.999839446220667e-06, "loss": -0.0095, "num_tokens": 13514593.0, "reward": 0.8616340160369873, "reward_std": 0.2617587149143219, "rewards/rollout_reward_func/mean": 0.8616340160369873, "rewards/rollout_reward_func/std": 0.41763827204704285, "sampling/importance_sampling_ratio/max": 0.5508899092674255, "sampling/importance_sampling_ratio/mean": 0.34325894713401794, "sampling/importance_sampling_ratio/min": 0.03863544762134552, "sampling/sampling_logp_difference/max": 1.8678343296051025, "sampling/sampling_logp_difference/mean": 0.46978768706321716, "step": 625, "step_time": 8.642393055015418 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.560647487640381, "epoch": 0.00626, "grad_norm": 0.04693672060966492, "kl": 0.7267049252986908, "learning_rate": 9.999838900587653e-06, "loss": -0.0096, "step": 626, "step_time": 4.7465070830076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.328716605901718, "epoch": 0.00627, "frac_reward_zero_std": 0.75, "grad_norm": 0.016848424449563026, "kl": 0.7657586261630058, "learning_rate": 9.999838354029082e-06, "loss": 0.001, "num_tokens": 13562880.0, "reward": 0.8683294057846069, "reward_std": 0.09003566205501556, "rewards/rollout_reward_func/mean": 0.8683294057846069, "rewards/rollout_reward_func/std": 0.4967261254787445, "sampling/importance_sampling_ratio/max": 0.5480550527572632, "sampling/importance_sampling_ratio/mean": 0.3552507162094116, "sampling/importance_sampling_ratio/min": 0.031737618148326874, "sampling/sampling_logp_difference/max": 2.1074118614196777, "sampling/sampling_logp_difference/mean": 0.4197949767112732, "step": 627, "step_time": 9.933849429995462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3380288779735565, "epoch": 0.00628, "grad_norm": 0.016737043857574463, "kl": 0.7664323225617409, "learning_rate": 9.99983780654495e-06, "loss": 0.001, "step": 628, "step_time": 5.0051114540110575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 524.09375, "completions/mean_terminated_length": 524.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.1666902005672455, "epoch": 0.00629, "frac_reward_zero_std": 0.0, "grad_norm": 0.48886075615882874, "kl": 5.418075233697891, "learning_rate": 9.999837258135259e-06, "loss": -0.0045, "num_tokens": 13620370.0, "reward": 0.5574871301651001, "reward_std": 0.40109628438949585, "rewards/rollout_reward_func/mean": 0.5574871301651001, "rewards/rollout_reward_func/std": 0.6274747252464294, "sampling/importance_sampling_ratio/max": 0.5536638498306274, "sampling/importance_sampling_ratio/mean": 0.21805430948734283, "sampling/importance_sampling_ratio/min": 0.011958534829318523, "sampling/sampling_logp_difference/max": 2.7707407474517822, "sampling/sampling_logp_difference/mean": 0.5997303128242493, "step": 629, "step_time": 9.322810037992895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.240712434053421, "epoch": 0.0063, "grad_norm": 0.11796284466981888, "kl": 2.290219970047474, "learning_rate": 9.999836708800008e-06, "loss": -0.0099, "step": 630, "step_time": 5.48904809801752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 363.90625, "completions/mean_terminated_length": 375.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.154302507638931, "epoch": 0.00631, "frac_reward_zero_std": 0.25, "grad_norm": 0.06848315894603729, "kl": 1.0218105092644691, "learning_rate": 9.999836158539198e-06, "loss": 0.0009, "num_tokens": 13670410.0, "reward": 0.2425416111946106, "reward_std": 0.27824559807777405, "rewards/rollout_reward_func/mean": 0.2425416111946106, "rewards/rollout_reward_func/std": 0.9056668281555176, "sampling/importance_sampling_ratio/max": 0.5485020875930786, "sampling/importance_sampling_ratio/mean": 0.3053564131259918, "sampling/importance_sampling_ratio/min": 6.834868088168022e-11, "sampling/sampling_logp_difference/max": 3.1076111793518066, "sampling/sampling_logp_difference/mean": 0.7284587025642395, "step": 631, "step_time": 8.78846273898671 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.238609492778778, "epoch": 0.00632, "grad_norm": 0.059386592358350754, "kl": 0.9928230941295624, "learning_rate": 9.99983560735283e-06, "loss": 0.0007, "step": 632, "step_time": 4.848444446011854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 173.09375, "completions/mean_terminated_length": 173.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.250191032886505, "epoch": 0.00633, "frac_reward_zero_std": 0.5, "grad_norm": 0.08741013705730438, "kl": 0.810954250395298, "learning_rate": 9.999835055240903e-06, "loss": 0.0047, "num_tokens": 13714023.0, "reward": 0.22389787435531616, "reward_std": 0.3488042652606964, "rewards/rollout_reward_func/mean": 0.22389787435531616, "rewards/rollout_reward_func/std": 1.0133914947509766, "sampling/importance_sampling_ratio/max": 0.5432268381118774, "sampling/importance_sampling_ratio/mean": 0.3511529266834259, "sampling/importance_sampling_ratio/min": 0.008297049440443516, "sampling/sampling_logp_difference/max": 2.3654625415802, "sampling/sampling_logp_difference/mean": 0.5542289614677429, "step": 633, "step_time": 8.527611143006652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.430584490299225, "epoch": 0.00634, "grad_norm": 0.08965589851140976, "kl": 0.7956105321645737, "learning_rate": 9.999834502203417e-06, "loss": 0.0044, "step": 634, "step_time": 4.438106857007369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 292.1875, "completions/mean_terminated_length": 292.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.487055718898773, "epoch": 0.00635, "frac_reward_zero_std": 0.5, "grad_norm": 0.04738738760352135, "kl": 0.834307886660099, "learning_rate": 9.999833948240373e-06, "loss": -0.0025, "num_tokens": 13762145.0, "reward": 0.23037756979465485, "reward_std": 0.4698554277420044, "rewards/rollout_reward_func/mean": 0.23037756979465485, "rewards/rollout_reward_func/std": 1.0657069683074951, "sampling/importance_sampling_ratio/max": 0.5207024216651917, "sampling/importance_sampling_ratio/mean": 0.27580374479293823, "sampling/importance_sampling_ratio/min": 0.0014465543208643794, "sampling/sampling_logp_difference/max": 4.622624397277832, "sampling/sampling_logp_difference/mean": 0.5563734769821167, "step": 635, "step_time": 8.578148440014047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.7245782017707825, "epoch": 0.00636, "grad_norm": 0.04779994860291481, "kl": 0.8109383210539818, "learning_rate": 9.99983339335177e-06, "loss": -0.0024, "step": 636, "step_time": 5.045783735004079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 328.25, "completions/mean_terminated_length": 309.73333740234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.932779908180237, "epoch": 0.00637, "frac_reward_zero_std": 0.5, "grad_norm": 0.04929406940937042, "kl": 0.7891279682517052, "learning_rate": 9.99983283753761e-06, "loss": -0.0037, "num_tokens": 13812076.0, "reward": 0.3711583614349365, "reward_std": 0.2737616300582886, "rewards/rollout_reward_func/mean": 0.3711583614349365, "rewards/rollout_reward_func/std": 0.928839921951294, "sampling/importance_sampling_ratio/max": 0.514443576335907, "sampling/importance_sampling_ratio/mean": 0.2701605558395386, "sampling/importance_sampling_ratio/min": 1.4200977464004071e-14, "sampling/sampling_logp_difference/max": 4.674909591674805, "sampling/sampling_logp_difference/mean": 0.968134343624115, "step": 637, "step_time": 8.592369634003262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02864583395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02864583395421505, "entropy": 5.058139890432358, "epoch": 0.00638, "grad_norm": 0.0408606119453907, "kl": 0.7563572004437447, "learning_rate": 9.99983228079789e-06, "loss": -0.0037, "step": 638, "step_time": 4.756129658991995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 644.5, "completions/mean_terminated_length": 644.5, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "entropy": 5.675223529338837, "epoch": 0.00639, "frac_reward_zero_std": 0.0, "grad_norm": 0.0749049186706543, "kl": 0.8452305495738983, "learning_rate": 9.999831723132612e-06, "loss": -0.0078, "num_tokens": 13873826.0, "reward": 0.3336421251296997, "reward_std": 0.6889352798461914, "rewards/rollout_reward_func/mean": 0.3336421251296997, "rewards/rollout_reward_func/std": 0.9289819002151489, "sampling/importance_sampling_ratio/max": 0.23301328718662262, "sampling/importance_sampling_ratio/mean": 0.0693858414888382, "sampling/importance_sampling_ratio/min": 9.122531289401659e-14, "sampling/sampling_logp_difference/max": 3.4762516021728516, "sampling/sampling_logp_difference/mean": 0.9420560598373413, "step": 639, "step_time": 9.876154962010332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.670800507068634, "epoch": 0.0064, "grad_norm": 0.06174532324075699, "kl": 0.8104548528790474, "learning_rate": 9.999831164541778e-06, "loss": -0.008, "step": 640, "step_time": 4.985198396992928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 438.625, "completions/mean_terminated_length": 438.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.001152813434601, "epoch": 0.00641, "frac_reward_zero_std": 0.25, "grad_norm": 0.046687014400959015, "kl": 0.7587955370545387, "learning_rate": 9.999830605025384e-06, "loss": -0.0088, "num_tokens": 13927365.0, "reward": 0.9662577509880066, "reward_std": 0.3290678560733795, "rewards/rollout_reward_func/mean": 0.9662577509880066, "rewards/rollout_reward_func/std": 0.5298038721084595, "sampling/importance_sampling_ratio/max": 0.4824179410934448, "sampling/importance_sampling_ratio/mean": 0.187177836894989, "sampling/importance_sampling_ratio/min": 5.021108608578638e-16, "sampling/sampling_logp_difference/max": 3.618725299835205, "sampling/sampling_logp_difference/mean": 0.9280962347984314, "step": 641, "step_time": 8.93390615600947 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 4.895167410373688, "epoch": 0.00642, "grad_norm": 0.04429011419415474, "kl": 0.7667798325419426, "learning_rate": 9.999830044583436e-06, "loss": -0.0089, "step": 642, "step_time": 5.282373614994867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 526.09375, "completions/mean_terminated_length": 526.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.1736443638801575, "epoch": 0.00643, "frac_reward_zero_std": 0.25, "grad_norm": 0.04435298219323158, "kl": 0.6725166849792004, "learning_rate": 9.999829483215928e-06, "loss": -0.0047, "num_tokens": 13983965.0, "reward": 0.4681798815727234, "reward_std": 0.5639890432357788, "rewards/rollout_reward_func/mean": 0.4681798815727234, "rewards/rollout_reward_func/std": 0.8238909244537354, "sampling/importance_sampling_ratio/max": 0.43367713689804077, "sampling/importance_sampling_ratio/mean": 0.16119758784770966, "sampling/importance_sampling_ratio/min": 0.018713008612394333, "sampling/sampling_logp_difference/max": 2.2993454933166504, "sampling/sampling_logp_difference/mean": 0.6823427081108093, "step": 643, "step_time": 9.121587821988214 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0182291679084301, "entropy": 5.04133927822113, "epoch": 0.00644, "grad_norm": 0.03536246344447136, "kl": 0.6593996174633503, "learning_rate": 9.999828920922866e-06, "loss": -0.005, "step": 644, "step_time": 5.487529174984957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 621.125, "completions/mean_terminated_length": 616.5806274414062, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 4.645089685916901, "epoch": 0.00645, "frac_reward_zero_std": 0.25, "grad_norm": 0.036401957273483276, "kl": 0.6593996323645115, "learning_rate": 9.999828357704242e-06, "loss": -0.0047, "num_tokens": 14045359.0, "reward": 0.19898304343223572, "reward_std": 0.4123609662055969, "rewards/rollout_reward_func/mean": 0.19898304343223572, "rewards/rollout_reward_func/std": 0.9698356986045837, "sampling/importance_sampling_ratio/max": 0.2499689906835556, "sampling/importance_sampling_ratio/mean": 0.1449117511510849, "sampling/importance_sampling_ratio/min": 6.145520070345014e-15, "sampling/sampling_logp_difference/max": 4.221316337585449, "sampling/sampling_logp_difference/mean": 0.7168991565704346, "step": 645, "step_time": 9.101939408006729 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.527346044778824, "epoch": 0.00646, "grad_norm": 0.034272294491529465, "kl": 0.6537017785012722, "learning_rate": 9.999827793560063e-06, "loss": -0.0048, "step": 646, "step_time": 4.859038133996364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 626.59375, "completions/mean_terminated_length": 646.290283203125, "completions/min_length": 16.0, "completions/min_terminated_length": 408.0, "entropy": 4.43855282664299, "epoch": 0.00647, "frac_reward_zero_std": 0.25, "grad_norm": 0.09703579545021057, "kl": 0.6652227565646172, "learning_rate": 9.999827228490327e-06, "loss": -0.0089, "num_tokens": 14106874.0, "reward": 0.819156289100647, "reward_std": 0.34015846252441406, "rewards/rollout_reward_func/mean": 0.819156289100647, "rewards/rollout_reward_func/std": 0.5551051497459412, "sampling/importance_sampling_ratio/max": 0.24712228775024414, "sampling/importance_sampling_ratio/mean": 0.16102999448776245, "sampling/importance_sampling_ratio/min": 3.491912922010121e-12, "sampling/sampling_logp_difference/max": 3.799743890762329, "sampling/sampling_logp_difference/mean": 0.6667817831039429, "step": 647, "step_time": 9.859627596000792 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.320941060781479, "epoch": 0.00648, "grad_norm": 0.03265855088829994, "kl": 0.6769094467163086, "learning_rate": 9.999826662495036e-06, "loss": -0.009, "step": 648, "step_time": 5.049307729008433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 423.34375, "completions/mean_terminated_length": 423.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.5706111788749695, "epoch": 0.00649, "frac_reward_zero_std": 0.25, "grad_norm": 0.07314351201057434, "kl": 0.648336611688137, "learning_rate": 9.999826095574187e-06, "loss": -0.0128, "num_tokens": 14159870.0, "reward": 0.8962860703468323, "reward_std": 0.39800384640693665, "rewards/rollout_reward_func/mean": 0.8962860703468323, "rewards/rollout_reward_func/std": 0.4848165810108185, "sampling/importance_sampling_ratio/max": 0.5164790153503418, "sampling/importance_sampling_ratio/mean": 0.22732974588871002, "sampling/importance_sampling_ratio/min": 1.4159920082984761e-14, "sampling/sampling_logp_difference/max": 4.787222385406494, "sampling/sampling_logp_difference/mean": 0.7877606749534607, "step": 649, "step_time": 9.150005953997606 }, { "clip_ratio/high_max": 0.023863636888563633, "clip_ratio/high_mean": 0.011931818444281816, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011931818444281816, "entropy": 4.489531248807907, "epoch": 0.0065, "grad_norm": 0.09469100087881088, "kl": 0.662656880915165, "learning_rate": 9.999825527727781e-06, "loss": -0.0128, "step": 650, "step_time": 5.293167689014808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 456.78125, "completions/mean_terminated_length": 456.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7757503390312195, "epoch": 0.00651, "frac_reward_zero_std": 0.0, "grad_norm": 0.07058420032262802, "kl": 0.6547251790761948, "learning_rate": 9.99982495895582e-06, "loss": -0.009, "num_tokens": 14216497.0, "reward": 0.3987351655960083, "reward_std": 0.6865642070770264, "rewards/rollout_reward_func/mean": 0.3987351655960083, "rewards/rollout_reward_func/std": 0.9570579528808594, "sampling/importance_sampling_ratio/max": 0.47689977288246155, "sampling/importance_sampling_ratio/mean": 0.25449615716934204, "sampling/importance_sampling_ratio/min": 0.01872158981859684, "sampling/sampling_logp_difference/max": 1.8082995414733887, "sampling/sampling_logp_difference/mean": 0.447268009185791, "step": 651, "step_time": 8.821478092002508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7318011820316315, "epoch": 0.00652, "grad_norm": 0.07858389616012573, "kl": 0.6562515571713448, "learning_rate": 9.999824389258302e-06, "loss": -0.009, "step": 652, "step_time": 4.868517456998234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 171.9375, "completions/mean_terminated_length": 176.96774291992188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.5770840644836426, "epoch": 0.00653, "frac_reward_zero_std": 0.5, "grad_norm": 0.03263108432292938, "kl": 0.7648423165082932, "learning_rate": 9.999823818635227e-06, "loss": 0.0005, "num_tokens": 14260763.0, "reward": 0.8012400269508362, "reward_std": 0.14275716245174408, "rewards/rollout_reward_func/mean": 0.8012400269508362, "rewards/rollout_reward_func/std": 0.5768001675605774, "sampling/importance_sampling_ratio/max": 0.5452690720558167, "sampling/importance_sampling_ratio/mean": 0.408582866191864, "sampling/importance_sampling_ratio/min": 2.337157356535613e-09, "sampling/sampling_logp_difference/max": 2.7566375732421875, "sampling/sampling_logp_difference/mean": 0.5685396790504456, "step": 653, "step_time": 8.698919844006014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.544428914785385, "epoch": 0.00654, "grad_norm": 0.03126371651887894, "kl": 0.7641480565071106, "learning_rate": 9.9998232470866e-06, "loss": 0.0005, "step": 654, "step_time": 4.748403482000867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 328.375, "completions/mean_terminated_length": 328.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.823696792125702, "epoch": 0.00655, "frac_reward_zero_std": 0.25, "grad_norm": 0.01801135204732418, "kl": 0.7723790109157562, "learning_rate": 9.999822674612414e-06, "loss": -0.0041, "num_tokens": 14309026.0, "reward": 0.894508957862854, "reward_std": 0.032429374754428864, "rewards/rollout_reward_func/mean": 0.894508957862854, "rewards/rollout_reward_func/std": 0.42549481987953186, "sampling/importance_sampling_ratio/max": 0.536468505859375, "sampling/importance_sampling_ratio/mean": 0.3278172016143799, "sampling/importance_sampling_ratio/min": 2.927718831258197e-13, "sampling/sampling_logp_difference/max": 3.4273483753204346, "sampling/sampling_logp_difference/mean": 0.6083900928497314, "step": 655, "step_time": 8.446368542994605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.796846628189087, "epoch": 0.00656, "grad_norm": 0.01812843233346939, "kl": 0.7749252542853355, "learning_rate": 9.999822101212674e-06, "loss": -0.0041, "step": 656, "step_time": 5.1210331300098915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 453.6875, "completions/mean_terminated_length": 453.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.298706978559494, "epoch": 0.00657, "frac_reward_zero_std": 0.25, "grad_norm": 0.05491258203983307, "kl": 0.5405551977455616, "learning_rate": 9.999821526887376e-06, "loss": -0.0083, "num_tokens": 14362741.0, "reward": 0.3182727098464966, "reward_std": 0.3746786415576935, "rewards/rollout_reward_func/mean": 0.3182727098464966, "rewards/rollout_reward_func/std": 0.9695032238960266, "sampling/importance_sampling_ratio/max": 0.531684935092926, "sampling/importance_sampling_ratio/mean": 0.24467267096042633, "sampling/importance_sampling_ratio/min": 3.4562021295230666e-21, "sampling/sampling_logp_difference/max": 12.065062522888184, "sampling/sampling_logp_difference/mean": 0.941675066947937, "step": 657, "step_time": 8.973935664995224 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 4.278084814548492, "epoch": 0.00658, "grad_norm": 0.07374536246061325, "kl": 0.5421523824334145, "learning_rate": 9.999820951636526e-06, "loss": -0.0084, "step": 658, "step_time": 4.667377476995171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 668.375, "completions/mean_terminated_length": 667.6333618164062, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "entropy": 4.595681846141815, "epoch": 0.00659, "frac_reward_zero_std": 0.0, "grad_norm": 0.07110581547021866, "kl": 0.6597530543804169, "learning_rate": 9.99982037546012e-06, "loss": -0.0046, "num_tokens": 14426111.0, "reward": 0.24250420928001404, "reward_std": 0.7328588962554932, "rewards/rollout_reward_func/mean": 0.24250420928001404, "rewards/rollout_reward_func/std": 0.8501155972480774, "sampling/importance_sampling_ratio/max": 0.29544949531555176, "sampling/importance_sampling_ratio/mean": 0.13546228408813477, "sampling/importance_sampling_ratio/min": 1.5218196494637404e-11, "sampling/sampling_logp_difference/max": 4.417463779449463, "sampling/sampling_logp_difference/mean": 0.7995935082435608, "step": 659, "step_time": 9.632497013997636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.615467607975006, "epoch": 0.0066, "grad_norm": 0.07552625238895416, "kl": 0.6518976464867592, "learning_rate": 9.999819798358157e-06, "loss": -0.0049, "step": 660, "step_time": 4.923949107003864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 438.65625, "completions/mean_terminated_length": 438.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4595600962638855, "epoch": 0.00661, "frac_reward_zero_std": 0.5, "grad_norm": 0.21352531015872955, "kl": 0.6598216071724892, "learning_rate": 9.999819220330643e-06, "loss": -0.0022, "num_tokens": 14480355.0, "reward": 0.6062455177307129, "reward_std": 0.46986284852027893, "rewards/rollout_reward_func/mean": 0.6062455177307129, "rewards/rollout_reward_func/std": 0.8576981425285339, "sampling/importance_sampling_ratio/max": 0.542469322681427, "sampling/importance_sampling_ratio/mean": 0.2887585461139679, "sampling/importance_sampling_ratio/min": 0.0015446195611730218, "sampling/sampling_logp_difference/max": 3.6849772930145264, "sampling/sampling_logp_difference/mean": 0.44355908036231995, "step": 661, "step_time": 9.232567342995026 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 3.4662413895130157, "epoch": 0.00662, "grad_norm": 0.05941897630691528, "kl": 0.6596015803515911, "learning_rate": 9.99981864137757e-06, "loss": -0.0025, "step": 662, "step_time": 4.806824643994332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 501.28125, "completions/mean_terminated_length": 501.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6099815666675568, "epoch": 0.00663, "frac_reward_zero_std": 0.25, "grad_norm": 0.1423235684633255, "kl": 0.9605596363544464, "learning_rate": 9.999818061498945e-06, "loss": -0.0091, "num_tokens": 14535968.0, "reward": 0.6950125694274902, "reward_std": 0.4629790782928467, "rewards/rollout_reward_func/mean": 0.6950125694274902, "rewards/rollout_reward_func/std": 0.7542710900306702, "sampling/importance_sampling_ratio/max": 0.5454257130622864, "sampling/importance_sampling_ratio/mean": 0.27148622274398804, "sampling/importance_sampling_ratio/min": 0.0007652323110960424, "sampling/sampling_logp_difference/max": 3.8798391819000244, "sampling/sampling_logp_difference/mean": 0.4654819965362549, "step": 663, "step_time": 9.243857460001891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 3.6382016241550446, "epoch": 0.00664, "grad_norm": 0.11491624265909195, "kl": 0.9260811060667038, "learning_rate": 9.999817480694764e-06, "loss": -0.0096, "step": 664, "step_time": 5.070816594998178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 268.65625, "completions/mean_terminated_length": 268.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.082021713256836, "epoch": 0.00665, "frac_reward_zero_std": 0.5, "grad_norm": 0.11174893379211426, "kl": 0.7402118295431137, "learning_rate": 9.99981689896503e-06, "loss": 0.0015, "num_tokens": 14582063.0, "reward": 0.7810122966766357, "reward_std": 0.25309330224990845, "rewards/rollout_reward_func/mean": 0.7810122966766357, "rewards/rollout_reward_func/std": 0.7913315296173096, "sampling/importance_sampling_ratio/max": 0.5440056920051575, "sampling/importance_sampling_ratio/mean": 0.38775837421417236, "sampling/importance_sampling_ratio/min": 0.022308792918920517, "sampling/sampling_logp_difference/max": 1.7859920263290405, "sampling/sampling_logp_difference/mean": 0.35750237107276917, "step": 665, "step_time": 8.577186980000988 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 3.0884762704372406, "epoch": 0.00666, "grad_norm": 0.02951303869485855, "kl": 0.7409877702593803, "learning_rate": 9.99981631630974e-06, "loss": 0.0012, "step": 666, "step_time": 4.382513328004279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 502.5625, "completions/mean_terminated_length": 502.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.1144536435604095, "epoch": 0.00667, "frac_reward_zero_std": 0.25, "grad_norm": 0.034509506076574326, "kl": 0.6886150129139423, "learning_rate": 9.999815732728897e-06, "loss": -0.011, "num_tokens": 14639694.0, "reward": -0.5215446352958679, "reward_std": 0.4036262333393097, "rewards/rollout_reward_func/mean": -0.5215446352958679, "rewards/rollout_reward_func/std": 0.8169009685516357, "sampling/importance_sampling_ratio/max": 0.5450995564460754, "sampling/importance_sampling_ratio/mean": 0.24076466262340546, "sampling/importance_sampling_ratio/min": 7.040468208430184e-09, "sampling/sampling_logp_difference/max": 3.5880837440490723, "sampling/sampling_logp_difference/mean": 0.6687854528427124, "step": 667, "step_time": 9.365982902992982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.117054909467697, "epoch": 0.00668, "grad_norm": 0.03268757089972496, "kl": 0.6892911419272423, "learning_rate": 9.9998151482225e-06, "loss": -0.011, "step": 668, "step_time": 4.908280393006862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 622.59375, "completions/mean_terminated_length": 622.59375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 4.262284427881241, "epoch": 0.00669, "frac_reward_zero_std": 0.0, "grad_norm": 0.040878091007471085, "kl": 0.6777473390102386, "learning_rate": 9.99981456279055e-06, "loss": -0.013, "num_tokens": 14701591.0, "reward": 0.3966163396835327, "reward_std": 0.3106655478477478, "rewards/rollout_reward_func/mean": 0.3966163396835327, "rewards/rollout_reward_func/std": 0.9027458429336548, "sampling/importance_sampling_ratio/max": 0.29441940784454346, "sampling/importance_sampling_ratio/mean": 0.16205474734306335, "sampling/importance_sampling_ratio/min": 6.318280994970296e-14, "sampling/sampling_logp_difference/max": 3.4474239349365234, "sampling/sampling_logp_difference/mean": 0.6899654865264893, "step": 669, "step_time": 9.162696755003708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.278700590133667, "epoch": 0.0067, "grad_norm": 0.04054516553878784, "kl": 0.6776256486773491, "learning_rate": 9.999813976433047e-06, "loss": -0.0131, "step": 670, "step_time": 5.309096343997226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 263.34375, "completions/mean_terminated_length": 261.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4837905764579773, "epoch": 0.00671, "frac_reward_zero_std": 0.25, "grad_norm": 0.03175972402095795, "kl": 0.8046269118785858, "learning_rate": 9.99981338914999e-06, "loss": -0.0078, "num_tokens": 14749515.0, "reward": 0.41854923963546753, "reward_std": 0.28898805379867554, "rewards/rollout_reward_func/mean": 0.41854923963546753, "rewards/rollout_reward_func/std": 1.0335747003555298, "sampling/importance_sampling_ratio/max": 0.5362771153450012, "sampling/importance_sampling_ratio/mean": 0.35279130935668945, "sampling/importance_sampling_ratio/min": 1.3995407610600807e-10, "sampling/sampling_logp_difference/max": 3.9023866653442383, "sampling/sampling_logp_difference/mean": 0.5568290948867798, "step": 671, "step_time": 8.011396798989153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.480061888694763, "epoch": 0.00672, "grad_norm": 0.035408131778240204, "kl": 0.8050766959786415, "learning_rate": 9.99981280094138e-06, "loss": -0.0079, "step": 672, "step_time": 4.379859403983573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 603.875, "completions/mean_terminated_length": 603.875, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 4.021536409854889, "epoch": 0.00673, "frac_reward_zero_std": 0.0, "grad_norm": 0.03867119550704956, "kl": 0.7221251986920834, "learning_rate": 9.999812211807216e-06, "loss": -0.0162, "num_tokens": 14810143.0, "reward": 0.7365218997001648, "reward_std": 0.5854406356811523, "rewards/rollout_reward_func/mean": 0.7365218997001648, "rewards/rollout_reward_func/std": 0.6500831842422485, "sampling/importance_sampling_ratio/max": 0.29804569482803345, "sampling/importance_sampling_ratio/mean": 0.17585277557373047, "sampling/importance_sampling_ratio/min": 0.019424209371209145, "sampling/sampling_logp_difference/max": 1.8333117961883545, "sampling/sampling_logp_difference/mean": 0.5235471129417419, "step": 673, "step_time": 9.788850685996294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.0312879383563995, "epoch": 0.00674, "grad_norm": 0.03765912726521492, "kl": 0.7201073318719864, "learning_rate": 9.9998116217475e-06, "loss": -0.0163, "step": 674, "step_time": 4.978748772016843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 725.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 165.84375, "completions/mean_terminated_length": 147.8064422607422, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.117843121290207, "epoch": 0.00675, "frac_reward_zero_std": 0.75, "grad_norm": 0.009362910874187946, "kl": 0.739013209939003, "learning_rate": 9.99981103076223e-06, "loss": -0.0053, "num_tokens": 14850614.0, "reward": 1.1092793941497803, "reward_std": 0.006923686247318983, "rewards/rollout_reward_func/mean": 1.1092793941497803, "rewards/rollout_reward_func/std": 0.01615438424050808, "sampling/importance_sampling_ratio/max": 0.5426884293556213, "sampling/importance_sampling_ratio/mean": 0.45042484998703003, "sampling/importance_sampling_ratio/min": 1.0799491509394521e-11, "sampling/sampling_logp_difference/max": 4.4118523597717285, "sampling/sampling_logp_difference/mean": 0.49277791380882263, "step": 675, "step_time": 8.416951100007282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1220555305480957, "epoch": 0.00676, "grad_norm": 0.009173407219350338, "kl": 0.7387880012392998, "learning_rate": 9.999810438851407e-06, "loss": -0.0053, "step": 676, "step_time": 4.695474338986969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 472.0, "completions/mean_terminated_length": 472.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.431653141975403, "epoch": 0.00677, "frac_reward_zero_std": 0.25, "grad_norm": 0.034966520965099335, "kl": 0.6478748172521591, "learning_rate": 9.999809846015032e-06, "loss": -0.0098, "num_tokens": 14906177.0, "reward": 0.5980027914047241, "reward_std": 0.4707610011100769, "rewards/rollout_reward_func/mean": 0.5980027914047241, "rewards/rollout_reward_func/std": 0.7561774849891663, "sampling/importance_sampling_ratio/max": 0.5460342168807983, "sampling/importance_sampling_ratio/mean": 0.23415642976760864, "sampling/importance_sampling_ratio/min": 6.540713110014806e-11, "sampling/sampling_logp_difference/max": 3.6774373054504395, "sampling/sampling_logp_difference/mean": 0.7204867005348206, "step": 677, "step_time": 9.258861151996825 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.43287456035614, "epoch": 0.00678, "grad_norm": 0.029102381318807602, "kl": 0.6531800925731659, "learning_rate": 9.999809252253105e-06, "loss": -0.0098, "step": 678, "step_time": 4.789008480009215 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 476.0625, "completions/mean_terminated_length": 476.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.477822333574295, "epoch": 0.00679, "frac_reward_zero_std": 0.25, "grad_norm": 0.09209447354078293, "kl": 0.6577859409153461, "learning_rate": 9.999808657565626e-06, "loss": -0.0061, "num_tokens": 14962550.0, "reward": 0.33918529748916626, "reward_std": 0.3462616801261902, "rewards/rollout_reward_func/mean": 0.33918529748916626, "rewards/rollout_reward_func/std": 0.8916343450546265, "sampling/importance_sampling_ratio/max": 0.5449119806289673, "sampling/importance_sampling_ratio/mean": 0.2135089933872223, "sampling/importance_sampling_ratio/min": 7.685998365536406e-11, "sampling/sampling_logp_difference/max": 4.681614875793457, "sampling/sampling_logp_difference/mean": 0.7275009751319885, "step": 679, "step_time": 9.522193896991666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.490045249462128, "epoch": 0.0068, "grad_norm": 0.11202035844326019, "kl": 0.6675947792828083, "learning_rate": 9.999808061952593e-06, "loss": -0.0063, "step": 680, "step_time": 4.818227092000598 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01065340917557478, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 612.21875, "completions/mean_terminated_length": 612.21875, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "entropy": 4.3465743362903595, "epoch": 0.00681, "frac_reward_zero_std": 0.5, "grad_norm": 0.0680059865117073, "kl": 0.7368146553635597, "learning_rate": 9.999807465414011e-06, "loss": -0.0079, "num_tokens": 15024329.0, "reward": 0.12883248925209045, "reward_std": 0.4134097695350647, "rewards/rollout_reward_func/mean": 0.12883248925209045, "rewards/rollout_reward_func/std": 0.9941380023956299, "sampling/importance_sampling_ratio/max": 0.28839364647865295, "sampling/importance_sampling_ratio/mean": 0.1619049310684204, "sampling/importance_sampling_ratio/min": 1.8518600010657832e-14, "sampling/sampling_logp_difference/max": 5.017234802246094, "sampling/sampling_logp_difference/mean": 0.704042911529541, "step": 681, "step_time": 9.204537584009813 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01065340917557478, "entropy": 4.3496202528476715, "epoch": 0.00682, "grad_norm": 0.06021622568368912, "kl": 0.7423663288354874, "learning_rate": 9.999806867949875e-06, "loss": -0.0078, "step": 682, "step_time": 5.356580538005801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 426.65625, "completions/mean_terminated_length": 439.9031982421875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7128382325172424, "epoch": 0.00683, "frac_reward_zero_std": 0.0, "grad_norm": 0.22281983494758606, "kl": 0.6758349947631359, "learning_rate": 9.999806269560189e-06, "loss": -0.0034, "num_tokens": 15078251.0, "reward": 0.39404335618019104, "reward_std": 0.18864864110946655, "rewards/rollout_reward_func/mean": 0.39404335618019104, "rewards/rollout_reward_func/std": 0.9850256443023682, "sampling/importance_sampling_ratio/max": 0.5447583794593811, "sampling/importance_sampling_ratio/mean": 0.2697523534297943, "sampling/importance_sampling_ratio/min": 1.6394473556502476e-09, "sampling/sampling_logp_difference/max": 3.6023430824279785, "sampling/sampling_logp_difference/mean": 0.5854583978652954, "step": 683, "step_time": 8.811914409001474 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.7136290073394775, "epoch": 0.00684, "grad_norm": 0.025193819776177406, "kl": 0.6799204051494598, "learning_rate": 9.99980567024495e-06, "loss": -0.0037, "step": 684, "step_time": 5.049227565003093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0625, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 301.7666931152344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.447302997112274, "epoch": 0.00685, "frac_reward_zero_std": 0.25, "grad_norm": 0.0485013872385025, "kl": 0.7003577426075935, "learning_rate": 9.99980507000416e-06, "loss": -0.0077, "num_tokens": 15126446.0, "reward": 0.8025417327880859, "reward_std": 0.4797995388507843, "rewards/rollout_reward_func/mean": 0.8025417327880859, "rewards/rollout_reward_func/std": 0.7633161544799805, "sampling/importance_sampling_ratio/max": 0.5477495789527893, "sampling/importance_sampling_ratio/mean": 0.3628997802734375, "sampling/importance_sampling_ratio/min": 1.087995253785945e-12, "sampling/sampling_logp_difference/max": 4.644972801208496, "sampling/sampling_logp_difference/mean": 0.6923376321792603, "step": 685, "step_time": 8.106478453992167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4536827504634857, "epoch": 0.00686, "grad_norm": 0.06872577220201492, "kl": 0.7014843448996544, "learning_rate": 9.999804468837818e-06, "loss": -0.0077, "step": 686, "step_time": 4.503507886001898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 488.09375, "completions/mean_terminated_length": 503.32257080078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.335394740104675, "epoch": 0.00687, "frac_reward_zero_std": 0.25, "grad_norm": 0.02348257787525654, "kl": 0.5661375559866428, "learning_rate": 9.999803866745927e-06, "loss": -0.0052, "num_tokens": 15183690.0, "reward": 0.5552704930305481, "reward_std": 0.14889882504940033, "rewards/rollout_reward_func/mean": 0.5552704930305481, "rewards/rollout_reward_func/std": 0.6728827953338623, "sampling/importance_sampling_ratio/max": 0.5422913432121277, "sampling/importance_sampling_ratio/mean": 0.21903973817825317, "sampling/importance_sampling_ratio/min": 2.6880550563213035e-10, "sampling/sampling_logp_difference/max": 3.5803675651550293, "sampling/sampling_logp_difference/mean": 0.6781092882156372, "step": 687, "step_time": 9.915481907992216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.3541547656059265, "epoch": 0.00688, "grad_norm": 0.02316121943295002, "kl": 0.5636966563761234, "learning_rate": 9.999803263728482e-06, "loss": -0.0052, "step": 688, "step_time": 5.609968293996644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 476.5625, "completions/mean_terminated_length": 490.9031982421875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.2924031019210815, "epoch": 0.00689, "frac_reward_zero_std": 0.5, "grad_norm": 0.02909057028591633, "kl": 0.5726450905203819, "learning_rate": 9.999802659785488e-06, "loss": -0.0002, "num_tokens": 15240605.0, "reward": 0.622704803943634, "reward_std": 0.33328425884246826, "rewards/rollout_reward_func/mean": 0.622704803943634, "rewards/rollout_reward_func/std": 0.7105792760848999, "sampling/importance_sampling_ratio/max": 0.5519617199897766, "sampling/importance_sampling_ratio/mean": 0.25683677196502686, "sampling/importance_sampling_ratio/min": 3.3647032914462436e-32, "sampling/sampling_logp_difference/max": 7.393939971923828, "sampling/sampling_logp_difference/mean": 0.9420439600944519, "step": 689, "step_time": 9.127722338998865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.306365609169006, "epoch": 0.0069, "grad_norm": 0.030657785013318062, "kl": 0.5718459226191044, "learning_rate": 9.999802054916945e-06, "loss": -0.0002, "step": 690, "step_time": 5.313232694010367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 464.25, "completions/mean_terminated_length": 464.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.288253456354141, "epoch": 0.00691, "frac_reward_zero_std": 0.25, "grad_norm": 0.054214876145124435, "kl": 0.7787653654813766, "learning_rate": 9.99980144912285e-06, "loss": -0.0029, "num_tokens": 15295764.0, "reward": 0.3018248677253723, "reward_std": 0.351337730884552, "rewards/rollout_reward_func/mean": 0.3018248677253723, "rewards/rollout_reward_func/std": 0.886483371257782, "sampling/importance_sampling_ratio/max": 0.5461412072181702, "sampling/importance_sampling_ratio/mean": 0.2334328293800354, "sampling/importance_sampling_ratio/min": 0.007235710974782705, "sampling/sampling_logp_difference/max": 2.355959415435791, "sampling/sampling_logp_difference/mean": 0.6132176518440247, "step": 691, "step_time": 8.844089258003805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.29803729057312, "epoch": 0.00692, "grad_norm": 0.05111366882920265, "kl": 0.7851946800947189, "learning_rate": 9.999800842403203e-06, "loss": -0.0029, "step": 692, "step_time": 4.895083278999664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 294.75, "completions/mean_terminated_length": 283.70965576171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.525783509016037, "epoch": 0.00693, "frac_reward_zero_std": 0.5, "grad_norm": 0.04296642541885376, "kl": 0.6347622573375702, "learning_rate": 9.999800234758007e-06, "loss": -0.0024, "num_tokens": 15345852.0, "reward": 0.3438303470611572, "reward_std": 0.20190753042697906, "rewards/rollout_reward_func/mean": 0.3438303470611572, "rewards/rollout_reward_func/std": 0.9933081269264221, "sampling/importance_sampling_ratio/max": 0.5502108931541443, "sampling/importance_sampling_ratio/mean": 0.3647382855415344, "sampling/importance_sampling_ratio/min": 2.761748214017215e-14, "sampling/sampling_logp_difference/max": 3.6893692016601562, "sampling/sampling_logp_difference/mean": 0.6674203276634216, "step": 693, "step_time": 8.85190559199691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.52186581492424, "epoch": 0.00694, "grad_norm": 0.04349556937813759, "kl": 0.6346735395491123, "learning_rate": 9.999799626187263e-06, "loss": -0.0024, "step": 694, "step_time": 4.605413127981592 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.03125, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 271.3125, "completions/mean_terminated_length": 259.2257995605469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.805594950914383, "epoch": 0.00695, "frac_reward_zero_std": 0.5, "grad_norm": 0.08914367854595184, "kl": 0.7530468702316284, "learning_rate": 9.999799016690968e-06, "loss": -0.0063, "num_tokens": 15393385.0, "reward": 0.8335442543029785, "reward_std": 0.18925848603248596, "rewards/rollout_reward_func/mean": 0.8335442543029785, "rewards/rollout_reward_func/std": 0.6198198795318604, "sampling/importance_sampling_ratio/max": 0.5483358502388, "sampling/importance_sampling_ratio/mean": 0.3444356918334961, "sampling/importance_sampling_ratio/min": 4.4673534077728505e-14, "sampling/sampling_logp_difference/max": 4.42240047454834, "sampling/sampling_logp_difference/mean": 0.5841656923294067, "step": 695, "step_time": 8.94165827298275 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.801077663898468, "epoch": 0.00696, "grad_norm": 0.0961923897266388, "kl": 0.7567232623696327, "learning_rate": 9.999798406269121e-06, "loss": -0.0064, "step": 696, "step_time": 5.245786208986829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 476.78125, "completions/mean_terminated_length": 470.4193420410156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.3978962898254395, "epoch": 0.00697, "frac_reward_zero_std": 0.25, "grad_norm": 0.05105418711900711, "kl": 0.6352884396910667, "learning_rate": 9.999797794921726e-06, "loss": -0.0068, "num_tokens": 15447647.0, "reward": 0.4908134937286377, "reward_std": 0.42443573474884033, "rewards/rollout_reward_func/mean": 0.4908134937286377, "rewards/rollout_reward_func/std": 0.6845395565032959, "sampling/importance_sampling_ratio/max": 0.5453019142150879, "sampling/importance_sampling_ratio/mean": 0.21476279199123383, "sampling/importance_sampling_ratio/min": 9.572727321938146e-07, "sampling/sampling_logp_difference/max": 4.416143417358398, "sampling/sampling_logp_difference/mean": 0.6337770223617554, "step": 697, "step_time": 8.898415907991875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.3821790516376495, "epoch": 0.00698, "grad_norm": 0.05333668366074562, "kl": 0.6367447525262833, "learning_rate": 9.999797182648783e-06, "loss": -0.0069, "step": 698, "step_time": 4.820016358993598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 413.15625, "completions/mean_terminated_length": 413.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4792228043079376, "epoch": 0.00699, "frac_reward_zero_std": 0.25, "grad_norm": 0.1030561700463295, "kl": 0.726013258099556, "learning_rate": 9.999796569450289e-06, "loss": 0.0009, "num_tokens": 15500698.0, "reward": 0.8418517112731934, "reward_std": 0.12617827951908112, "rewards/rollout_reward_func/mean": 0.8418517112731934, "rewards/rollout_reward_func/std": 0.6332328915596008, "sampling/importance_sampling_ratio/max": 0.5484995245933533, "sampling/importance_sampling_ratio/mean": 0.2901419401168823, "sampling/importance_sampling_ratio/min": 0.008026973344385624, "sampling/sampling_logp_difference/max": 2.57305908203125, "sampling/sampling_logp_difference/mean": 0.44226178526878357, "step": 699, "step_time": 8.898082453997631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4648555517196655, "epoch": 0.007, "grad_norm": 0.10271580517292023, "kl": 0.7260833419859409, "learning_rate": 9.999795955326245e-06, "loss": 0.0006, "step": 700, "step_time": 4.605388787014817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 449.5, "completions/mean_terminated_length": 449.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6095435321331024, "epoch": 0.00701, "frac_reward_zero_std": 0.25, "grad_norm": 0.12811802327632904, "kl": 0.9231266528367996, "learning_rate": 9.999795340276655e-06, "loss": -0.0051, "num_tokens": 15554560.0, "reward": 0.9100659489631653, "reward_std": 0.17292030155658722, "rewards/rollout_reward_func/mean": 0.9100659489631653, "rewards/rollout_reward_func/std": 0.53940349817276, "sampling/importance_sampling_ratio/max": 0.5490774512290955, "sampling/importance_sampling_ratio/mean": 0.27746784687042236, "sampling/importance_sampling_ratio/min": 0.01120566576719284, "sampling/sampling_logp_difference/max": 2.214690923690796, "sampling/sampling_logp_difference/mean": 0.46584153175354004, "step": 701, "step_time": 9.05873603400687 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0147569440305233, "entropy": 3.577676832675934, "epoch": 0.00702, "grad_norm": 0.04323719069361687, "kl": 0.9517825096845627, "learning_rate": 9.999794724301514e-06, "loss": -0.0054, "step": 702, "step_time": 5.336573141008557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 322.65625, "completions/mean_terminated_length": 332.5483703613281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.604834198951721, "epoch": 0.00703, "frac_reward_zero_std": 0.25, "grad_norm": 0.023302018642425537, "kl": 0.7376464530825615, "learning_rate": 9.999794107400824e-06, "loss": -0.0165, "num_tokens": 15603483.0, "reward": 0.8639661073684692, "reward_std": 0.34036919474601746, "rewards/rollout_reward_func/mean": 0.8639661073684692, "rewards/rollout_reward_func/std": 0.5131677389144897, "sampling/importance_sampling_ratio/max": 0.547881007194519, "sampling/importance_sampling_ratio/mean": 0.34148651361465454, "sampling/importance_sampling_ratio/min": 9.156482150274314e-08, "sampling/sampling_logp_difference/max": 2.6783535480499268, "sampling/sampling_logp_difference/mean": 0.5313702821731567, "step": 703, "step_time": 8.789156394996098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.594515085220337, "epoch": 0.00704, "grad_norm": 0.02281314507126808, "kl": 0.7413600757718086, "learning_rate": 9.999793489574587e-06, "loss": -0.0165, "step": 704, "step_time": 4.882681645991397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 489.875, "completions/mean_terminated_length": 489.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.099765628576279, "epoch": 0.00705, "frac_reward_zero_std": 0.25, "grad_norm": 0.053722724318504333, "kl": 0.700568363070488, "learning_rate": 9.999792870822801e-06, "loss": -0.0069, "num_tokens": 15659990.0, "reward": 0.7317565679550171, "reward_std": 0.34882301092147827, "rewards/rollout_reward_func/mean": 0.7317565679550171, "rewards/rollout_reward_func/std": 0.48572760820388794, "sampling/importance_sampling_ratio/max": 0.5479433536529541, "sampling/importance_sampling_ratio/mean": 0.23982150852680206, "sampling/importance_sampling_ratio/min": 1.6720982376483562e-09, "sampling/sampling_logp_difference/max": 3.904188632965088, "sampling/sampling_logp_difference/mean": 0.6204702258110046, "step": 705, "step_time": 9.419622723995417 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 4.113272398710251, "epoch": 0.00706, "grad_norm": 0.05470872297883034, "kl": 0.6999048441648483, "learning_rate": 9.999792251145466e-06, "loss": -0.0069, "step": 706, "step_time": 4.8776971779734595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 308.15625, "completions/mean_terminated_length": 317.58062744140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2093458473682404, "epoch": 0.00707, "frac_reward_zero_std": 0.25, "grad_norm": 0.07783634215593338, "kl": 0.7280929237604141, "learning_rate": 9.999791630542584e-06, "loss": -0.0076, "num_tokens": 15709436.0, "reward": 1.057267189025879, "reward_std": 0.13849987089633942, "rewards/rollout_reward_func/mean": 1.057267189025879, "rewards/rollout_reward_func/std": 0.2828780710697174, "sampling/importance_sampling_ratio/max": 0.5504326820373535, "sampling/importance_sampling_ratio/mean": 0.37969645857810974, "sampling/importance_sampling_ratio/min": 1.44128417534084e-10, "sampling/sampling_logp_difference/max": 2.8366355895996094, "sampling/sampling_logp_difference/mean": 0.4853394627571106, "step": 707, "step_time": 9.077971214006539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.2257138788700104, "epoch": 0.00708, "grad_norm": 0.03376467525959015, "kl": 0.7252964004874229, "learning_rate": 9.999791009014154e-06, "loss": -0.0078, "step": 708, "step_time": 4.846057427006599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 324.65625, "completions/mean_terminated_length": 334.6128845214844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.30644753575325, "epoch": 0.00709, "frac_reward_zero_std": 0.25, "grad_norm": 0.07670197635889053, "kl": 0.672308586537838, "learning_rate": 9.999790386560175e-06, "loss": -0.0105, "num_tokens": 15760381.0, "reward": 1.0163769721984863, "reward_std": 0.1388189196586609, "rewards/rollout_reward_func/mean": 1.0163769721984863, "rewards/rollout_reward_func/std": 0.31615790724754333, "sampling/importance_sampling_ratio/max": 0.549124538898468, "sampling/importance_sampling_ratio/mean": 0.3611670732498169, "sampling/importance_sampling_ratio/min": 1.81790120734604e-08, "sampling/sampling_logp_difference/max": 2.8059213161468506, "sampling/sampling_logp_difference/mean": 0.4733142554759979, "step": 709, "step_time": 8.778755446001014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 3.327965199947357, "epoch": 0.0071, "grad_norm": 0.07711321860551834, "kl": 0.6709862351417542, "learning_rate": 9.99978976318065e-06, "loss": -0.0105, "step": 710, "step_time": 5.295123235999199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 291.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1865217983722687, "epoch": 0.00711, "frac_reward_zero_std": 0.5, "grad_norm": 0.20639623701572418, "kl": 0.7811077535152435, "learning_rate": 9.999789138875577e-06, "loss": -0.0019, "num_tokens": 15808486.0, "reward": 0.30215322971343994, "reward_std": 0.2654363512992859, "rewards/rollout_reward_func/mean": 0.30215322971343994, "rewards/rollout_reward_func/std": 1.0351903438568115, "sampling/importance_sampling_ratio/max": 0.5462925434112549, "sampling/importance_sampling_ratio/mean": 0.36559098958969116, "sampling/importance_sampling_ratio/min": 0.09987088292837143, "sampling/sampling_logp_difference/max": 1.120473027229309, "sampling/sampling_logp_difference/mean": 0.36635977029800415, "step": 711, "step_time": 8.135424803986098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.201673239469528, "epoch": 0.00712, "grad_norm": 0.06106019765138626, "kl": 0.758052084594965, "learning_rate": 9.999788513644958e-06, "loss": -0.0027, "step": 712, "step_time": 4.347213429995463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 437.84375, "completions/mean_terminated_length": 437.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.737181782722473, "epoch": 0.00713, "frac_reward_zero_std": 0.5, "grad_norm": 0.036430735141038895, "kl": 0.976323276758194, "learning_rate": 9.999787887488789e-06, "loss": -0.0068, "num_tokens": 15861180.0, "reward": 0.8156747817993164, "reward_std": 0.26407089829444885, "rewards/rollout_reward_func/mean": 0.8156747817993164, "rewards/rollout_reward_func/std": 0.7807609438896179, "sampling/importance_sampling_ratio/max": 0.5489709377288818, "sampling/importance_sampling_ratio/mean": 0.27747678756713867, "sampling/importance_sampling_ratio/min": 1.4092171440188705e-12, "sampling/sampling_logp_difference/max": 4.240018844604492, "sampling/sampling_logp_difference/mean": 0.6405125856399536, "step": 713, "step_time": 9.115091927000321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7561155557632446, "epoch": 0.00714, "grad_norm": 0.028421996161341667, "kl": 0.9468590691685677, "learning_rate": 9.999787260407074e-06, "loss": -0.0068, "step": 714, "step_time": 4.551131845990312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 314.46875, "completions/mean_terminated_length": 314.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.85791552066803, "epoch": 0.00715, "frac_reward_zero_std": 0.75, "grad_norm": 0.022633418440818787, "kl": 0.9123654440045357, "learning_rate": 9.999786632399813e-06, "loss": -0.0045, "num_tokens": 15910855.0, "reward": 0.4005934000015259, "reward_std": 0.21956394612789154, "rewards/rollout_reward_func/mean": 0.4005934000015259, "rewards/rollout_reward_func/std": 0.9874002933502197, "sampling/importance_sampling_ratio/max": 0.5504449009895325, "sampling/importance_sampling_ratio/mean": 0.3329528570175171, "sampling/importance_sampling_ratio/min": 1.305656314798398e-05, "sampling/sampling_logp_difference/max": 3.379504442214966, "sampling/sampling_logp_difference/mean": 0.5937477350234985, "step": 715, "step_time": 8.156130667004618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8684141039848328, "epoch": 0.00716, "grad_norm": 0.02106892690062523, "kl": 0.9155070334672928, "learning_rate": 9.999786003467005e-06, "loss": -0.0046, "step": 716, "step_time": 5.074785684002563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 499.09375, "completions/mean_terminated_length": 493.20001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.870640933513641, "epoch": 0.00717, "frac_reward_zero_std": 0.25, "grad_norm": 0.027208685874938965, "kl": 0.5932632237672806, "learning_rate": 9.99978537360865e-06, "loss": -0.0067, "num_tokens": 15967995.0, "reward": 0.7429980635643005, "reward_std": 0.4001692533493042, "rewards/rollout_reward_func/mean": 0.7429980635643005, "rewards/rollout_reward_func/std": 0.5681722164154053, "sampling/importance_sampling_ratio/max": 0.5496721863746643, "sampling/importance_sampling_ratio/mean": 0.2014206051826477, "sampling/importance_sampling_ratio/min": 1.3289389422687112e-17, "sampling/sampling_logp_difference/max": 4.3986639976501465, "sampling/sampling_logp_difference/mean": 1.030718445777893, "step": 717, "step_time": 9.361795840981358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.871042490005493, "epoch": 0.00718, "grad_norm": 0.027367018163204193, "kl": 0.5911345779895782, "learning_rate": 9.99978474282475e-06, "loss": -0.0067, "step": 718, "step_time": 4.9267885479930555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 160.3125, "completions/mean_terminated_length": 160.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1396215856075287, "epoch": 0.00719, "frac_reward_zero_std": 0.75, "grad_norm": 0.017047490924596786, "kl": 0.8030869886279106, "learning_rate": 9.999784111115302e-06, "loss": -0.0024, "num_tokens": 16010083.0, "reward": 1.0875234603881836, "reward_std": 0.10567376017570496, "rewards/rollout_reward_func/mean": 1.0875234603881836, "rewards/rollout_reward_func/std": 0.20200522243976593, "sampling/importance_sampling_ratio/max": 0.5492632985115051, "sampling/importance_sampling_ratio/mean": 0.448549747467041, "sampling/importance_sampling_ratio/min": 0.015123225748538971, "sampling/sampling_logp_difference/max": 2.1442813873291016, "sampling/sampling_logp_difference/mean": 0.3713904023170471, "step": 719, "step_time": 8.528983757001697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.139629453420639, "epoch": 0.0072, "grad_norm": 0.016887547448277473, "kl": 0.8047497421503067, "learning_rate": 9.99978347848031e-06, "loss": -0.0024, "step": 720, "step_time": 4.523245333999512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 302.25, "completions/mean_terminated_length": 302.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.555348426103592, "epoch": 0.00721, "frac_reward_zero_std": 0.5, "grad_norm": 0.19782257080078125, "kl": 0.8515413776040077, "learning_rate": 9.99978284491977e-06, "loss": -0.0026, "num_tokens": 16058092.0, "reward": -0.007623806595802307, "reward_std": 0.49094370007514954, "rewards/rollout_reward_func/mean": -0.007623806595802307, "rewards/rollout_reward_func/std": 1.0564497709274292, "sampling/importance_sampling_ratio/max": 0.5466883778572083, "sampling/importance_sampling_ratio/mean": 0.3558090329170227, "sampling/importance_sampling_ratio/min": 3.996689429186517e-06, "sampling/sampling_logp_difference/max": 3.1937341690063477, "sampling/sampling_logp_difference/mean": 0.5353748202323914, "step": 721, "step_time": 8.302546363003785 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02864583395421505, "entropy": 3.554252713918686, "epoch": 0.00722, "grad_norm": 0.030965501442551613, "kl": 0.8684436082839966, "learning_rate": 9.999782210433683e-06, "loss": -0.0029, "step": 722, "step_time": 5.049277762984275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 296.21875, "completions/mean_terminated_length": 296.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8308228254318237, "epoch": 0.00723, "frac_reward_zero_std": 0.5, "grad_norm": 0.04715416580438614, "kl": 0.8606101498007774, "learning_rate": 9.999781575022053e-06, "loss": -0.0034, "num_tokens": 16106624.0, "reward": -0.37248581647872925, "reward_std": 0.46988195180892944, "rewards/rollout_reward_func/mean": -0.37248581647872925, "rewards/rollout_reward_func/std": 0.902294397354126, "sampling/importance_sampling_ratio/max": 0.547010064125061, "sampling/importance_sampling_ratio/mean": 0.3262750506401062, "sampling/importance_sampling_ratio/min": 3.592782805593293e-22, "sampling/sampling_logp_difference/max": 11.598073959350586, "sampling/sampling_logp_difference/mean": 0.7211078405380249, "step": 723, "step_time": 9.313330869998026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8294747471809387, "epoch": 0.00724, "grad_norm": 0.04095596820116043, "kl": 0.8622391149401665, "learning_rate": 9.999780938684877e-06, "loss": -0.0034, "step": 724, "step_time": 4.964370751993556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 629.5, "completions/mean_terminated_length": 630.8386840820312, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 4.415831238031387, "epoch": 0.00725, "frac_reward_zero_std": 0.0, "grad_norm": 0.0879918783903122, "kl": 0.6729303821921349, "learning_rate": 9.999780301422157e-06, "loss": -0.0082, "num_tokens": 16169326.0, "reward": 0.5336527824401855, "reward_std": 0.3800186216831207, "rewards/rollout_reward_func/mean": 0.5336527824401855, "rewards/rollout_reward_func/std": 0.6071345806121826, "sampling/importance_sampling_ratio/max": 0.2996901869773865, "sampling/importance_sampling_ratio/mean": 0.13273510336875916, "sampling/importance_sampling_ratio/min": 2.491940904292278e-06, "sampling/sampling_logp_difference/max": 4.170950889587402, "sampling/sampling_logp_difference/mean": 0.6498007774353027, "step": 725, "step_time": 9.58190105099493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.4046595096588135, "epoch": 0.00726, "grad_norm": 0.09875447303056717, "kl": 0.6796823889017105, "learning_rate": 9.99977966323389e-06, "loss": -0.0084, "step": 726, "step_time": 4.911381320016517 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 466.1875, "completions/mean_terminated_length": 466.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6191884875297546, "epoch": 0.00727, "frac_reward_zero_std": 0.25, "grad_norm": 0.01432116236537695, "kl": 0.7085320353507996, "learning_rate": 9.99977902412008e-06, "loss": 0.002, "num_tokens": 16223833.0, "reward": 0.42566996812820435, "reward_std": 0.1875332146883011, "rewards/rollout_reward_func/mean": 0.42566996812820435, "rewards/rollout_reward_func/std": 0.9399080872535706, "sampling/importance_sampling_ratio/max": 0.5533050298690796, "sampling/importance_sampling_ratio/mean": 0.2860541343688965, "sampling/importance_sampling_ratio/min": 6.693374786700669e-17, "sampling/sampling_logp_difference/max": 4.413646697998047, "sampling/sampling_logp_difference/mean": 0.6355666518211365, "step": 727, "step_time": 9.942807452993293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.609797954559326, "epoch": 0.00728, "grad_norm": 0.012723327614367008, "kl": 0.7094575017690659, "learning_rate": 9.999778384080722e-06, "loss": 0.002, "step": 728, "step_time": 5.002657952980371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 360.15625, "completions/mean_terminated_length": 348.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.776315093040466, "epoch": 0.00729, "frac_reward_zero_std": 0.5, "grad_norm": 0.017492899671196938, "kl": 0.9414255693554878, "learning_rate": 9.999777743115822e-06, "loss": -0.0054, "num_tokens": 16272294.0, "reward": 0.493784099817276, "reward_std": 0.26030629873275757, "rewards/rollout_reward_func/mean": 0.493784099817276, "rewards/rollout_reward_func/std": 0.8078793883323669, "sampling/importance_sampling_ratio/max": 0.55100417137146, "sampling/importance_sampling_ratio/mean": 0.2951091527938843, "sampling/importance_sampling_ratio/min": 2.29813972282074e-15, "sampling/sampling_logp_difference/max": 4.040643215179443, "sampling/sampling_logp_difference/mean": 0.9913445115089417, "step": 729, "step_time": 8.803657810000004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.768540024757385, "epoch": 0.0073, "grad_norm": 0.0172884464263916, "kl": 0.9390710145235062, "learning_rate": 9.999777101225378e-06, "loss": -0.0054, "step": 730, "step_time": 4.848561200000404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 469.6875, "completions/mean_terminated_length": 469.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.107537508010864, "epoch": 0.00731, "frac_reward_zero_std": 0.25, "grad_norm": 0.038190245628356934, "kl": 0.7833498269319534, "learning_rate": 9.999776458409387e-06, "loss": -0.0108, "num_tokens": 16327934.0, "reward": 0.4012678861618042, "reward_std": 0.3563430905342102, "rewards/rollout_reward_func/mean": 0.4012678861618042, "rewards/rollout_reward_func/std": 0.9278212785720825, "sampling/importance_sampling_ratio/max": 0.5505785942077637, "sampling/importance_sampling_ratio/mean": 0.24045969545841217, "sampling/importance_sampling_ratio/min": 8.796736265745408e-16, "sampling/sampling_logp_difference/max": 4.408750534057617, "sampling/sampling_logp_difference/mean": 0.6964483261108398, "step": 731, "step_time": 9.641432201009593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.1010520458221436, "epoch": 0.00732, "grad_norm": 0.03707683086395264, "kl": 0.7604534961283207, "learning_rate": 9.999775814667854e-06, "loss": -0.0108, "step": 732, "step_time": 4.848329964996083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 609.6875, "completions/mean_terminated_length": 609.6875, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "entropy": 4.332083731889725, "epoch": 0.00733, "frac_reward_zero_std": 0.0, "grad_norm": 0.09964881092309952, "kl": 0.802434541285038, "learning_rate": 9.999775170000777e-06, "loss": -0.0089, "num_tokens": 16388732.0, "reward": 0.013313829898834229, "reward_std": 0.4964298605918884, "rewards/rollout_reward_func/mean": 0.013313829898834229, "rewards/rollout_reward_func/std": 1.0221459865570068, "sampling/importance_sampling_ratio/max": 0.5028300285339355, "sampling/importance_sampling_ratio/mean": 0.16806861758232117, "sampling/importance_sampling_ratio/min": 4.6910673262567926e-11, "sampling/sampling_logp_difference/max": 4.281581401824951, "sampling/sampling_logp_difference/mean": 0.674832820892334, "step": 733, "step_time": 9.594727891009825 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01065340917557478, "entropy": 4.3353612422943115, "epoch": 0.00734, "grad_norm": 0.06147947907447815, "kl": 0.8019740208983421, "learning_rate": 9.999774524408155e-06, "loss": -0.0092, "step": 734, "step_time": 4.853294707987516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 612.125, "completions/mean_terminated_length": 612.3870849609375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 4.1903506219387054, "epoch": 0.00735, "frac_reward_zero_std": 0.0, "grad_norm": 0.04428345710039139, "kl": 0.7177448123693466, "learning_rate": 9.99977387788999e-06, "loss": -0.0173, "num_tokens": 16450434.0, "reward": 0.4915701746940613, "reward_std": 0.5649570226669312, "rewards/rollout_reward_func/mean": 0.4915701746940613, "rewards/rollout_reward_func/std": 0.857500433921814, "sampling/importance_sampling_ratio/max": 0.2976827025413513, "sampling/importance_sampling_ratio/mean": 0.1698811799287796, "sampling/importance_sampling_ratio/min": 1.919245707184599e-14, "sampling/sampling_logp_difference/max": 3.472654104232788, "sampling/sampling_logp_difference/mean": 0.7000269293785095, "step": 735, "step_time": 9.44382334999682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.20357945561409, "epoch": 0.00736, "grad_norm": 0.04511602222919464, "kl": 0.7175312116742134, "learning_rate": 9.99977323044628e-06, "loss": -0.0174, "step": 736, "step_time": 5.457746396001312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 470.59375, "completions/mean_terminated_length": 463.70001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.385215520858765, "epoch": 0.00737, "frac_reward_zero_std": 0.25, "grad_norm": 0.032297998666763306, "kl": 0.6145708933472633, "learning_rate": 9.99977258207703e-06, "loss": -0.0044, "num_tokens": 16506724.0, "reward": 0.5548610091209412, "reward_std": 0.30239611864089966, "rewards/rollout_reward_func/mean": 0.5548610091209412, "rewards/rollout_reward_func/std": 0.7267706990242004, "sampling/importance_sampling_ratio/max": 0.5463544130325317, "sampling/importance_sampling_ratio/mean": 0.21172507107257843, "sampling/importance_sampling_ratio/min": 3.1420688273442465e-09, "sampling/sampling_logp_difference/max": 3.9714772701263428, "sampling/sampling_logp_difference/mean": 0.6709697842597961, "step": 737, "step_time": 9.228469886009407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.390786230564117, "epoch": 0.00738, "grad_norm": 0.029737643897533417, "kl": 0.616021204739809, "learning_rate": 9.999771932782234e-06, "loss": -0.0043, "step": 738, "step_time": 5.325769451010274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 639.84375, "completions/mean_terminated_length": 637.9031982421875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 4.136277109384537, "epoch": 0.00739, "frac_reward_zero_std": 0.0, "grad_norm": 0.01143848616629839, "kl": 0.6217525936663151, "learning_rate": 9.999771282561895e-06, "loss": -0.0155, "num_tokens": 16569445.0, "reward": 0.26844069361686707, "reward_std": 0.6092576384544373, "rewards/rollout_reward_func/mean": 0.26844069361686707, "rewards/rollout_reward_func/std": 0.9002315402030945, "sampling/importance_sampling_ratio/max": 0.29813629388809204, "sampling/importance_sampling_ratio/mean": 0.1827777922153473, "sampling/importance_sampling_ratio/min": 5.6311059500491634e-15, "sampling/sampling_logp_difference/max": 4.274252891540527, "sampling/sampling_logp_difference/mean": 0.7819750308990479, "step": 739, "step_time": 9.525719088996993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.142594367265701, "epoch": 0.0074, "grad_norm": 0.011342843063175678, "kl": 0.622939296066761, "learning_rate": 9.999770631416015e-06, "loss": -0.0155, "step": 740, "step_time": 5.065052130004915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 401.1875, "completions/mean_terminated_length": 401.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6889359056949615, "epoch": 0.00741, "frac_reward_zero_std": 0.25, "grad_norm": 0.06072669103741646, "kl": 0.7554387226700783, "learning_rate": 9.99976997934459e-06, "loss": -0.0109, "num_tokens": 16623860.0, "reward": 0.4353845417499542, "reward_std": 0.3416544795036316, "rewards/rollout_reward_func/mean": 0.4353845417499542, "rewards/rollout_reward_func/std": 0.9557913541793823, "sampling/importance_sampling_ratio/max": 0.557153046131134, "sampling/importance_sampling_ratio/mean": 0.2938973903656006, "sampling/importance_sampling_ratio/min": 3.031589909663346e-11, "sampling/sampling_logp_difference/max": 4.177786827087402, "sampling/sampling_logp_difference/mean": 0.612836480140686, "step": 741, "step_time": 8.696781018996262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6876443326473236, "epoch": 0.00742, "grad_norm": 0.06131424754858017, "kl": 0.7573771253228188, "learning_rate": 9.999769326347624e-06, "loss": -0.0109, "step": 742, "step_time": 5.175828172985348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 273.59375, "completions/mean_terminated_length": 273.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9771456718444824, "epoch": 0.00743, "frac_reward_zero_std": 0.5, "grad_norm": 0.0757998377084732, "kl": 0.7926981225609779, "learning_rate": 9.999768672425116e-06, "loss": -0.0056, "num_tokens": 16672687.0, "reward": 1.151566982269287, "reward_std": 0.010893160477280617, "rewards/rollout_reward_func/mean": 1.151566982269287, "rewards/rollout_reward_func/std": 0.055790968239307404, "sampling/importance_sampling_ratio/max": 0.550175666809082, "sampling/importance_sampling_ratio/mean": 0.4029200077056885, "sampling/importance_sampling_ratio/min": 3.7180338949838188e-06, "sampling/sampling_logp_difference/max": 4.164350986480713, "sampling/sampling_logp_difference/mean": 0.4238413870334625, "step": 743, "step_time": 8.255739404994529 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 2.9624577164649963, "epoch": 0.00744, "grad_norm": 0.027557209134101868, "kl": 0.7907274290919304, "learning_rate": 9.999768017577065e-06, "loss": -0.0059, "step": 744, "step_time": 4.9237980769903515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 603.65625, "completions/mean_terminated_length": 604.86669921875, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 4.542890310287476, "epoch": 0.00745, "frac_reward_zero_std": 0.0, "grad_norm": 0.07183793187141418, "kl": 0.8593917936086655, "learning_rate": 9.999767361803471e-06, "loss": -0.0024, "num_tokens": 16734332.0, "reward": 0.6922554969787598, "reward_std": 0.6076809167861938, "rewards/rollout_reward_func/mean": 0.6922554969787598, "rewards/rollout_reward_func/std": 0.7880223393440247, "sampling/importance_sampling_ratio/max": 0.30067411065101624, "sampling/importance_sampling_ratio/mean": 0.1559254378080368, "sampling/importance_sampling_ratio/min": 3.190915684892631e-13, "sampling/sampling_logp_difference/max": 4.020832061767578, "sampling/sampling_logp_difference/mean": 0.8036243319511414, "step": 745, "step_time": 9.532677406001312 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 4.535570323467255, "epoch": 0.00746, "grad_norm": 0.0706542581319809, "kl": 0.8587269484996796, "learning_rate": 9.999766705104336e-06, "loss": -0.0024, "step": 746, "step_time": 4.959064103990386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 589.1875, "completions/mean_terminated_length": 589.1875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 3.365324705839157, "epoch": 0.00747, "frac_reward_zero_std": 0.0, "grad_norm": 0.11394620686769485, "kl": 0.8212465718388557, "learning_rate": 9.999766047479658e-06, "loss": -0.0142, "num_tokens": 16794730.0, "reward": 0.9145269393920898, "reward_std": 0.5230984091758728, "rewards/rollout_reward_func/mean": 0.9145269393920898, "rewards/rollout_reward_func/std": 0.6124116778373718, "sampling/importance_sampling_ratio/max": 0.3637137711048126, "sampling/importance_sampling_ratio/mean": 0.24069684743881226, "sampling/importance_sampling_ratio/min": 0.010193745605647564, "sampling/sampling_logp_difference/max": 2.1503329277038574, "sampling/sampling_logp_difference/mean": 0.4491940140724182, "step": 747, "step_time": 8.919417838995287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.365147888660431, "epoch": 0.00748, "grad_norm": 0.11865262687206268, "kl": 0.8234266042709351, "learning_rate": 9.99976538892944e-06, "loss": -0.0146, "step": 748, "step_time": 5.365191375996801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0012499999720603228, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012499999720603228, "completions/clipped_ratio": 0.03125, "completions/max_length": 833.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 491.65625, "completions/mean_terminated_length": 480.6451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.918183445930481, "epoch": 0.00749, "frac_reward_zero_std": 0.25, "grad_norm": 0.05304034799337387, "kl": 0.7426018491387367, "learning_rate": 9.99976472945368e-06, "loss": -0.0055, "num_tokens": 16851679.0, "reward": 0.056741371750831604, "reward_std": 0.644063413143158, "rewards/rollout_reward_func/mean": 0.056741371750831604, "rewards/rollout_reward_func/std": 0.9879046678543091, "sampling/importance_sampling_ratio/max": 0.5492287278175354, "sampling/importance_sampling_ratio/mean": 0.2600939869880676, "sampling/importance_sampling_ratio/min": 4.8610533426092053e-26, "sampling/sampling_logp_difference/max": 3.6835973262786865, "sampling/sampling_logp_difference/mean": 0.7414470911026001, "step": 749, "step_time": 9.861019000018132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9097233712673187, "epoch": 0.0075, "grad_norm": 0.028793729841709137, "kl": 0.7339487336575985, "learning_rate": 9.999764069052378e-06, "loss": -0.0056, "step": 750, "step_time": 5.417621094005881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 330.71875, "completions/mean_terminated_length": 330.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9403379559516907, "epoch": 0.00751, "frac_reward_zero_std": 0.75, "grad_norm": 0.010057729668915272, "kl": 0.8433247581124306, "learning_rate": 9.999763407725536e-06, "loss": 0.0031, "num_tokens": 16902298.0, "reward": 0.21056488156318665, "reward_std": 0.20359578728675842, "rewards/rollout_reward_func/mean": 0.21056488156318665, "rewards/rollout_reward_func/std": 1.0559297800064087, "sampling/importance_sampling_ratio/max": 0.5497397780418396, "sampling/importance_sampling_ratio/mean": 0.3898675739765167, "sampling/importance_sampling_ratio/min": 0.006777407601475716, "sampling/sampling_logp_difference/max": 3.718884229660034, "sampling/sampling_logp_difference/mean": 0.3714514672756195, "step": 751, "step_time": 8.900863439019304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.929674118757248, "epoch": 0.00752, "grad_norm": 0.01008794829249382, "kl": 0.8420523628592491, "learning_rate": 9.999762745473153e-06, "loss": 0.0031, "step": 752, "step_time": 4.864942601008806 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 447.25, "completions/mean_terminated_length": 447.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.727955460548401, "epoch": 0.00753, "frac_reward_zero_std": 0.25, "grad_norm": 0.048746153712272644, "kl": 1.039544239640236, "learning_rate": 9.999762082295227e-06, "loss": -0.0007, "num_tokens": 16957445.0, "reward": 0.5269116759300232, "reward_std": 0.5680821537971497, "rewards/rollout_reward_func/mean": 0.5269116759300232, "rewards/rollout_reward_func/std": 0.879065990447998, "sampling/importance_sampling_ratio/max": 0.5459772348403931, "sampling/importance_sampling_ratio/mean": 0.27111291885375977, "sampling/importance_sampling_ratio/min": 2.187946490650461e-10, "sampling/sampling_logp_difference/max": 3.998807430267334, "sampling/sampling_logp_difference/mean": 0.5992969274520874, "step": 753, "step_time": 9.223279669990006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.728214204311371, "epoch": 0.00754, "grad_norm": 0.04052431136369705, "kl": 0.9638086743652821, "learning_rate": 9.999761418191762e-06, "loss": -0.0009, "step": 754, "step_time": 5.365506302994618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 650.21875, "completions/mean_terminated_length": 650.21875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "entropy": 4.495359361171722, "epoch": 0.00755, "frac_reward_zero_std": 0.0, "grad_norm": 0.034851912409067154, "kl": 0.7557166293263435, "learning_rate": 9.999760753162758e-06, "loss": -0.0022, "num_tokens": 17019784.0, "reward": 0.38900724053382874, "reward_std": 0.5162025690078735, "rewards/rollout_reward_func/mean": 0.38900724053382874, "rewards/rollout_reward_func/std": 0.8392570614814758, "sampling/importance_sampling_ratio/max": 0.29811766743659973, "sampling/importance_sampling_ratio/mean": 0.15139652788639069, "sampling/importance_sampling_ratio/min": 1.72558964425542e-18, "sampling/sampling_logp_difference/max": 4.801440238952637, "sampling/sampling_logp_difference/mean": 0.848017692565918, "step": 755, "step_time": 9.463371485006064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 4.492461949586868, "epoch": 0.00756, "grad_norm": 0.03452073410153389, "kl": 0.75554558634758, "learning_rate": 9.999760087208213e-06, "loss": -0.0023, "step": 756, "step_time": 5.3777429969995865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 485.90625, "completions/mean_terminated_length": 471.8000183105469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.143804728984833, "epoch": 0.00757, "frac_reward_zero_std": 0.25, "grad_norm": 0.04680885002017021, "kl": 0.7797781303524971, "learning_rate": 9.999759420328126e-06, "loss": -0.0129, "num_tokens": 17075347.0, "reward": 0.74830162525177, "reward_std": 0.45142436027526855, "rewards/rollout_reward_func/mean": 0.74830162525177, "rewards/rollout_reward_func/std": 0.5790999531745911, "sampling/importance_sampling_ratio/max": 0.5480608344078064, "sampling/importance_sampling_ratio/mean": 0.2391171008348465, "sampling/importance_sampling_ratio/min": 6.191741997590983e-13, "sampling/sampling_logp_difference/max": 4.5154571533203125, "sampling/sampling_logp_difference/mean": 0.7079084515571594, "step": 757, "step_time": 9.37863615299284 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.117809027433395, "epoch": 0.00758, "grad_norm": 0.034836187958717346, "kl": 0.7778778374195099, "learning_rate": 9.999758752522502e-06, "loss": -0.0131, "step": 758, "step_time": 4.965281675991719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 345.28125, "completions/mean_terminated_length": 345.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.644380956888199, "epoch": 0.00759, "frac_reward_zero_std": 0.25, "grad_norm": 0.026449229568243027, "kl": 0.7126868888735771, "learning_rate": 9.999758083791337e-06, "loss": -0.0139, "num_tokens": 17126068.0, "reward": 0.9118133783340454, "reward_std": 0.10974126309156418, "rewards/rollout_reward_func/mean": 0.9118133783340454, "rewards/rollout_reward_func/std": 0.39911314845085144, "sampling/importance_sampling_ratio/max": 0.5521848797798157, "sampling/importance_sampling_ratio/mean": 0.3290778696537018, "sampling/importance_sampling_ratio/min": 3.842293949674058e-07, "sampling/sampling_logp_difference/max": 4.980340003967285, "sampling/sampling_logp_difference/mean": 0.5739490985870361, "step": 759, "step_time": 9.46733258599852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.646657645702362, "epoch": 0.0076, "grad_norm": 0.025639604777097702, "kl": 0.706712257117033, "learning_rate": 9.999757414134631e-06, "loss": -0.0139, "step": 760, "step_time": 4.895836196003074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 491.15625, "completions/mean_terminated_length": 491.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.34150630235672, "epoch": 0.00761, "frac_reward_zero_std": 0.25, "grad_norm": 0.012794766575098038, "kl": 0.7338771149516106, "learning_rate": 9.999756743552387e-06, "loss": -0.0117, "num_tokens": 17182839.0, "reward": 0.5790290236473083, "reward_std": 0.39601635932922363, "rewards/rollout_reward_func/mean": 0.5790290236473083, "rewards/rollout_reward_func/std": 0.7074864506721497, "sampling/importance_sampling_ratio/max": 0.5524452924728394, "sampling/importance_sampling_ratio/mean": 0.2292519360780716, "sampling/importance_sampling_ratio/min": 3.1015034290815535e-12, "sampling/sampling_logp_difference/max": 3.8819122314453125, "sampling/sampling_logp_difference/mean": 0.6934059858322144, "step": 761, "step_time": 9.207928208001249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.334528893232346, "epoch": 0.00762, "grad_norm": 0.01292593777179718, "kl": 0.7341802269220352, "learning_rate": 9.999756072044602e-06, "loss": -0.0117, "step": 762, "step_time": 5.255652959000145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 314.70965576171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3960393965244293, "epoch": 0.00763, "frac_reward_zero_std": 0.25, "grad_norm": 0.12362014502286911, "kl": 1.038987196981907, "learning_rate": 9.99975539961128e-06, "loss": -0.0007, "num_tokens": 17233531.0, "reward": 0.22301113605499268, "reward_std": 0.4529114365577698, "rewards/rollout_reward_func/mean": 0.22301113605499268, "rewards/rollout_reward_func/std": 1.0568679571151733, "sampling/importance_sampling_ratio/max": 0.5557376742362976, "sampling/importance_sampling_ratio/mean": 0.37073591351509094, "sampling/importance_sampling_ratio/min": 2.0865527187996502e-12, "sampling/sampling_logp_difference/max": 3.323479413986206, "sampling/sampling_logp_difference/mean": 0.5733314752578735, "step": 763, "step_time": 8.35409857898776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.3852880597114563, "epoch": 0.00764, "grad_norm": 0.04979877173900604, "kl": 1.1964272633194923, "learning_rate": 9.999754726252418e-06, "loss": -0.0012, "step": 764, "step_time": 4.677487154993287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 662.21875, "completions/mean_terminated_length": 662.21875, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 4.340094268321991, "epoch": 0.00765, "frac_reward_zero_std": 0.0, "grad_norm": 0.03592294082045555, "kl": 0.8132451437413692, "learning_rate": 9.999754051968017e-06, "loss": -0.0119, "num_tokens": 17296998.0, "reward": -0.032668739557266235, "reward_std": 0.7925288677215576, "rewards/rollout_reward_func/mean": -0.032668739557266235, "rewards/rollout_reward_func/std": 0.9478312730789185, "sampling/importance_sampling_ratio/max": 0.3002830147743225, "sampling/importance_sampling_ratio/mean": 0.15054447948932648, "sampling/importance_sampling_ratio/min": 0.0018182866042479873, "sampling/sampling_logp_difference/max": 3.75980281829834, "sampling/sampling_logp_difference/mean": 0.631030797958374, "step": 765, "step_time": 9.707191624991538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.3481285572052, "epoch": 0.00766, "grad_norm": 0.03264450281858444, "kl": 0.8138134367763996, "learning_rate": 9.999753376758078e-06, "loss": -0.0119, "step": 766, "step_time": 5.020815041010792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 639.375, "completions/mean_terminated_length": 639.375, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "entropy": 3.9645462334156036, "epoch": 0.00767, "frac_reward_zero_std": 0.25, "grad_norm": 0.0594295971095562, "kl": 0.6734377145767212, "learning_rate": 9.9997527006226e-06, "loss": -0.0059, "num_tokens": 17360410.0, "reward": 0.6340997219085693, "reward_std": 0.5708641409873962, "rewards/rollout_reward_func/mean": 0.6340997219085693, "rewards/rollout_reward_func/std": 0.7833238244056702, "sampling/importance_sampling_ratio/max": 0.29614755511283875, "sampling/importance_sampling_ratio/mean": 0.1813909113407135, "sampling/importance_sampling_ratio/min": 0.013823477551341057, "sampling/sampling_logp_difference/max": 2.3506643772125244, "sampling/sampling_logp_difference/mean": 0.5522049069404602, "step": 767, "step_time": 9.504060507999384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.985161930322647, "epoch": 0.00768, "grad_norm": 0.05778879299759865, "kl": 0.6717037446796894, "learning_rate": 9.999752023561584e-06, "loss": -0.0061, "step": 768, "step_time": 5.463494193994848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.03125, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 781.40625, "completions/mean_terminated_length": 788.9354858398438, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "entropy": 4.090695858001709, "epoch": 0.00769, "frac_reward_zero_std": 0.0, "grad_norm": 0.07545054703950882, "kl": 0.8580093905329704, "learning_rate": 9.999751345575029e-06, "loss": -0.0076, "num_tokens": 17427637.0, "reward": 0.3638916611671448, "reward_std": 0.8686623573303223, "rewards/rollout_reward_func/mean": 0.3638916611671448, "rewards/rollout_reward_func/std": 1.0378566980361938, "sampling/importance_sampling_ratio/max": 0.30134695768356323, "sampling/importance_sampling_ratio/mean": 0.15750515460968018, "sampling/importance_sampling_ratio/min": 7.5180898832794e-13, "sampling/sampling_logp_difference/max": 4.958957195281982, "sampling/sampling_logp_difference/mean": 0.7033064961433411, "step": 769, "step_time": 11.18362781599717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 4.098806142807007, "epoch": 0.0077, "grad_norm": 0.06176958605647087, "kl": 0.8599829562008381, "learning_rate": 9.999750666662938e-06, "loss": -0.0078, "step": 770, "step_time": 6.197483897012717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 366.625, "completions/mean_terminated_length": 366.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3937487602233887, "epoch": 0.00771, "frac_reward_zero_std": 0.5, "grad_norm": 0.0806567370891571, "kl": 0.8920351192355156, "learning_rate": 9.999749986825307e-06, "loss": -0.0045, "num_tokens": 17479024.0, "reward": 0.5181906223297119, "reward_std": 0.20846952497959137, "rewards/rollout_reward_func/mean": 0.5181906223297119, "rewards/rollout_reward_func/std": 1.0094512701034546, "sampling/importance_sampling_ratio/max": 0.5541667938232422, "sampling/importance_sampling_ratio/mean": 0.35550037026405334, "sampling/importance_sampling_ratio/min": 0.001136417151428759, "sampling/sampling_logp_difference/max": 2.161799669265747, "sampling/sampling_logp_difference/mean": 0.45774853229522705, "step": 771, "step_time": 10.780134599008306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.440255343914032, "epoch": 0.00772, "grad_norm": 0.055573660880327225, "kl": 0.8869542926549911, "learning_rate": 9.999749306062141e-06, "loss": -0.0048, "step": 772, "step_time": 5.933584125006746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 765.09375, "completions/mean_terminated_length": 765.09375, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "entropy": 3.8551536202430725, "epoch": 0.00773, "frac_reward_zero_std": 0.0, "grad_norm": 0.019917210564017296, "kl": 0.7988769561052322, "learning_rate": 9.999748624373435e-06, "loss": -0.0024, "num_tokens": 17545561.0, "reward": 0.7628800272941589, "reward_std": 0.42469096183776855, "rewards/rollout_reward_func/mean": 0.7628800272941589, "rewards/rollout_reward_func/std": 0.8361477851867676, "sampling/importance_sampling_ratio/max": 0.2989559769630432, "sampling/importance_sampling_ratio/mean": 0.20129306614398956, "sampling/importance_sampling_ratio/min": 0.00022361849551089108, "sampling/sampling_logp_difference/max": 3.371011257171631, "sampling/sampling_logp_difference/mean": 0.5721548795700073, "step": 773, "step_time": 12.155818749983155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.889989584684372, "epoch": 0.00774, "grad_norm": 0.023001424968242645, "kl": 0.7915590703487396, "learning_rate": 9.999747941759192e-06, "loss": -0.0024, "step": 774, "step_time": 6.456281590006256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 495.53125, "completions/mean_terminated_length": 511.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.28292453289032, "epoch": 0.00775, "frac_reward_zero_std": 0.25, "grad_norm": 0.008858385495841503, "kl": 0.7430542334914207, "learning_rate": 9.999747258219414e-06, "loss": -0.0028, "num_tokens": 17599420.0, "reward": 0.7487006187438965, "reward_std": 0.23085986077785492, "rewards/rollout_reward_func/mean": 0.7487006187438965, "rewards/rollout_reward_func/std": 0.7922924160957336, "sampling/importance_sampling_ratio/max": 0.5522856712341309, "sampling/importance_sampling_ratio/mean": 0.31039005517959595, "sampling/importance_sampling_ratio/min": 1.482495699009316e-14, "sampling/sampling_logp_difference/max": 8.006448745727539, "sampling/sampling_logp_difference/mean": 0.9330164194107056, "step": 775, "step_time": 11.107710365002276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.2877748012542725, "epoch": 0.00776, "grad_norm": 0.008371490053832531, "kl": 0.7403643801808357, "learning_rate": 9.999746573754097e-06, "loss": -0.0028, "step": 776, "step_time": 6.25186553599633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.678592801094055, "epoch": 0.00777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002802508824970573, "kl": 0.8938891291618347, "learning_rate": 9.999745888363244e-06, "loss": 0.0011, "num_tokens": 17633704.0, "reward": 1.1019694805145264, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.1019694805145264, "rewards/rollout_reward_func/std": 0.003151739714667201, "sampling/importance_sampling_ratio/max": 0.5501486659049988, "sampling/importance_sampling_ratio/mean": 0.5424795150756836, "sampling/importance_sampling_ratio/min": 0.5329164266586304, "sampling/sampling_logp_difference/max": 0.6259196400642395, "sampling/sampling_logp_difference/mean": 0.30582237243652344, "step": 777, "step_time": 5.453836255015631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6869366466999054, "epoch": 0.00778, "grad_norm": 0.00028092204593122005, "kl": 0.8926638588309288, "learning_rate": 9.999745202046853e-06, "loss": 0.0011, "step": 778, "step_time": 2.6791717820015037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1299.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 636.40625, "completions/mean_terminated_length": 644.1034545898438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.092498570680618, "epoch": 0.00779, "frac_reward_zero_std": 0.0, "grad_norm": 0.12047813087701797, "kl": 0.6569150798022747, "learning_rate": 9.999744514804925e-06, "loss": -0.0063, "num_tokens": 17695849.0, "reward": 0.7925805449485779, "reward_std": 0.5562282800674438, "rewards/rollout_reward_func/mean": 0.7925805449485779, "rewards/rollout_reward_func/std": 0.8570149540901184, "sampling/importance_sampling_ratio/max": 0.37441104650497437, "sampling/importance_sampling_ratio/mean": 0.1752123236656189, "sampling/importance_sampling_ratio/min": 9.486564366088107e-14, "sampling/sampling_logp_difference/max": 11.649304389953613, "sampling/sampling_logp_difference/mean": 0.823940634727478, "step": 779, "step_time": 11.347684524000215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 4.0827847719192505, "epoch": 0.0078, "grad_norm": 0.06605292111635208, "kl": 0.6675577163696289, "learning_rate": 9.999743826637464e-06, "loss": -0.0066, "step": 780, "step_time": 5.8815651740078465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 620.9375, "completions/mean_terminated_length": 620.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.862428069114685, "epoch": 0.00781, "frac_reward_zero_std": 0.25, "grad_norm": 0.011891096830368042, "kl": 0.6885473839938641, "learning_rate": 9.999743137544465e-06, "loss": -0.0059, "num_tokens": 17756292.0, "reward": 0.9171995520591736, "reward_std": 0.5220876932144165, "rewards/rollout_reward_func/mean": 0.9171995520591736, "rewards/rollout_reward_func/std": 0.7446943521499634, "sampling/importance_sampling_ratio/max": 0.552784264087677, "sampling/importance_sampling_ratio/mean": 0.26208576560020447, "sampling/importance_sampling_ratio/min": 1.7860647856526857e-12, "sampling/sampling_logp_difference/max": 3.56826114654541, "sampling/sampling_logp_difference/mean": 0.6367967128753662, "step": 781, "step_time": 11.185151345009217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8474804162979126, "epoch": 0.00782, "grad_norm": 0.011691560037434101, "kl": 0.6882588043808937, "learning_rate": 9.999742447525931e-06, "loss": -0.0059, "step": 782, "step_time": 6.571964253002079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0625, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 547.0, "completions/mean_terminated_length": 544.300048828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.680471181869507, "epoch": 0.00783, "frac_reward_zero_std": 0.25, "grad_norm": 0.04403857886791229, "kl": 0.6003408432006836, "learning_rate": 9.99974175658186e-06, "loss": -0.0017, "num_tokens": 17814744.0, "reward": 0.6057118773460388, "reward_std": 0.06243757903575897, "rewards/rollout_reward_func/mean": 0.6057118773460388, "rewards/rollout_reward_func/std": 0.979849636554718, "sampling/importance_sampling_ratio/max": 0.5988435745239258, "sampling/importance_sampling_ratio/mean": 0.22829295694828033, "sampling/importance_sampling_ratio/min": 4.960761280383384e-14, "sampling/sampling_logp_difference/max": 4.308991432189941, "sampling/sampling_logp_difference/mean": 0.8801671266555786, "step": 783, "step_time": 11.625375271003577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01822916674427688, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01822916674427688, "entropy": 4.662733972072601, "epoch": 0.00784, "grad_norm": 0.03160157799720764, "kl": 0.5978887602686882, "learning_rate": 9.999741064712254e-06, "loss": -0.0018, "step": 784, "step_time": 6.363804955006344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 317.40625, "completions/mean_terminated_length": 310.0322570800781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.60028800368309, "epoch": 0.00785, "frac_reward_zero_std": 0.5, "grad_norm": 0.11601565778255463, "kl": 0.827159222215414, "learning_rate": 9.999740371917113e-06, "loss": -0.0095, "num_tokens": 17862465.0, "reward": 0.7983338832855225, "reward_std": 0.29095223546028137, "rewards/rollout_reward_func/mean": 0.7983338832855225, "rewards/rollout_reward_func/std": 0.7720068693161011, "sampling/importance_sampling_ratio/max": 0.5530408024787903, "sampling/importance_sampling_ratio/mean": 0.3450859487056732, "sampling/importance_sampling_ratio/min": 9.111365164754798e-22, "sampling/sampling_logp_difference/max": 12.953676223754883, "sampling/sampling_logp_difference/mean": 0.88006192445755, "step": 785, "step_time": 10.477719006004918 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.021938131423667073, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0323547984007746, "entropy": 3.5899732410907745, "epoch": 0.00786, "grad_norm": 0.026949508115649223, "kl": 0.8292879797518253, "learning_rate": 9.999739678196437e-06, "loss": -0.0099, "step": 786, "step_time": 5.889010005987075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 782.71875, "completions/mean_terminated_length": 782.71875, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 3.817614793777466, "epoch": 0.00787, "frac_reward_zero_std": 0.25, "grad_norm": 0.0160350538790226, "kl": 0.7007187306880951, "learning_rate": 9.999738983550224e-06, "loss": -0.006, "num_tokens": 17928756.0, "reward": 1.0065463781356812, "reward_std": 0.6029738783836365, "rewards/rollout_reward_func/mean": 1.0065463781356812, "rewards/rollout_reward_func/std": 0.6723095178604126, "sampling/importance_sampling_ratio/max": 0.3033126890659332, "sampling/importance_sampling_ratio/mean": 0.17693433165550232, "sampling/importance_sampling_ratio/min": 9.22197431266203e-13, "sampling/sampling_logp_difference/max": 3.558770179748535, "sampling/sampling_logp_difference/mean": 0.6014601588249207, "step": 787, "step_time": 11.408230000008189 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 3.8019691109657288, "epoch": 0.00788, "grad_norm": 0.01582365483045578, "kl": 0.7013603374361992, "learning_rate": 9.999738287978477e-06, "loss": -0.006, "step": 788, "step_time": 6.649938437003584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1277.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 346.5, "completions/mean_terminated_length": 357.1612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.354387640953064, "epoch": 0.00789, "frac_reward_zero_std": 0.25, "grad_norm": 0.016610199585556984, "kl": 0.8418178036808968, "learning_rate": 9.999737591481196e-06, "loss": -0.0072, "num_tokens": 17977252.0, "reward": 1.0234614610671997, "reward_std": 0.26559382677078247, "rewards/rollout_reward_func/mean": 1.0234614610671997, "rewards/rollout_reward_func/std": 0.5134736895561218, "sampling/importance_sampling_ratio/max": 0.5463883876800537, "sampling/importance_sampling_ratio/mean": 0.3561770021915436, "sampling/importance_sampling_ratio/min": 2.4197996384422993e-10, "sampling/sampling_logp_difference/max": 2.9595937728881836, "sampling/sampling_logp_difference/mean": 0.5462968349456787, "step": 789, "step_time": 10.266903136005567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3610975444316864, "epoch": 0.0079, "grad_norm": 0.01653832569718361, "kl": 0.8369379490613937, "learning_rate": 9.999736894058379e-06, "loss": -0.0072, "step": 790, "step_time": 6.023891295015346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 559.21875, "completions/mean_terminated_length": 559.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.23792228102684, "epoch": 0.00791, "frac_reward_zero_std": 0.25, "grad_norm": 0.015217098407447338, "kl": 0.8500257357954979, "learning_rate": 9.999736195710027e-06, "loss": 0.011, "num_tokens": 18035144.0, "reward": 1.093949794769287, "reward_std": 0.21699826419353485, "rewards/rollout_reward_func/mean": 1.093949794769287, "rewards/rollout_reward_func/std": 0.4127536714076996, "sampling/importance_sampling_ratio/max": 0.5513357520103455, "sampling/importance_sampling_ratio/mean": 0.2958311438560486, "sampling/importance_sampling_ratio/min": 0.00040042781620286405, "sampling/sampling_logp_difference/max": 2.40462327003479, "sampling/sampling_logp_difference/mean": 0.4831947684288025, "step": 791, "step_time": 10.837889903996256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2312922179698944, "epoch": 0.00792, "grad_norm": 0.01481719221919775, "kl": 0.851023755967617, "learning_rate": 9.999735496436145e-06, "loss": 0.011, "step": 792, "step_time": 6.120596124019357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 287.34375, "completions/mean_terminated_length": 287.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3670806884765625, "epoch": 0.00793, "frac_reward_zero_std": 0.75, "grad_norm": 0.013636613264679909, "kl": 0.8035697713494301, "learning_rate": 9.999734796236725e-06, "loss": 0.0062, "num_tokens": 18082226.0, "reward": 0.7465364336967468, "reward_std": 0.1598670780658722, "rewards/rollout_reward_func/mean": 0.7465364336967468, "rewards/rollout_reward_func/std": 0.6946610808372498, "sampling/importance_sampling_ratio/max": 0.5538754463195801, "sampling/importance_sampling_ratio/mean": 0.42551836371421814, "sampling/importance_sampling_ratio/min": 4.316163916213919e-17, "sampling/sampling_logp_difference/max": 12.601259231567383, "sampling/sampling_logp_difference/mean": 0.614601731300354, "step": 793, "step_time": 10.887123917003919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.363594889640808, "epoch": 0.00794, "grad_norm": 0.01459919847548008, "kl": 0.803445115685463, "learning_rate": 9.999734095111773e-06, "loss": 0.0062, "step": 794, "step_time": 6.463945955998497 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 767.03125, "completions/mean_terminated_length": 767.03125, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "entropy": 3.7904825806617737, "epoch": 0.00795, "frac_reward_zero_std": 0.0, "grad_norm": 0.01050646509975195, "kl": 0.7233243137598038, "learning_rate": 9.999733393061286e-06, "loss": -0.0076, "num_tokens": 18149093.0, "reward": 0.6643549799919128, "reward_std": 0.4119035005569458, "rewards/rollout_reward_func/mean": 0.6643549799919128, "rewards/rollout_reward_func/std": 0.9409052133560181, "sampling/importance_sampling_ratio/max": 0.30172401666641235, "sampling/importance_sampling_ratio/mean": 0.1875109225511551, "sampling/importance_sampling_ratio/min": 1.3374131663246303e-11, "sampling/sampling_logp_difference/max": 4.699708461761475, "sampling/sampling_logp_difference/mean": 0.6347557306289673, "step": 795, "step_time": 11.975600663012301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.789091020822525, "epoch": 0.00796, "grad_norm": 0.010621439665555954, "kl": 0.723754920065403, "learning_rate": 9.999732690085267e-06, "loss": -0.0077, "step": 796, "step_time": 6.881038430990884 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 558.28125, "completions/mean_terminated_length": 558.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.191230446100235, "epoch": 0.00797, "frac_reward_zero_std": 0.5, "grad_norm": 0.01792513020336628, "kl": 0.7524769455194473, "learning_rate": 9.999731986183711e-06, "loss": 0.0015, "num_tokens": 18207456.0, "reward": 1.1026954650878906, "reward_std": 0.18689408898353577, "rewards/rollout_reward_func/mean": 1.1026954650878906, "rewards/rollout_reward_func/std": 0.3551463484764099, "sampling/importance_sampling_ratio/max": 0.5500985383987427, "sampling/importance_sampling_ratio/mean": 0.30042463541030884, "sampling/importance_sampling_ratio/min": 0.00852198339998722, "sampling/sampling_logp_difference/max": 1.6916136741638184, "sampling/sampling_logp_difference/mean": 0.40130844712257385, "step": 797, "step_time": 10.682587889001297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1962352991104126, "epoch": 0.00798, "grad_norm": 0.15035906434059143, "kl": 0.7383802905678749, "learning_rate": 9.999731281356627e-06, "loss": 0.0018, "step": 798, "step_time": 5.974221801006934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 811.3125, "completions/mean_terminated_length": 811.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.773787975311279, "epoch": 0.00799, "frac_reward_zero_std": 0.0, "grad_norm": 0.05318315699696541, "kl": 0.7281761206686497, "learning_rate": 9.999730575604006e-06, "loss": -0.0076, "num_tokens": 18275064.0, "reward": 0.16271142661571503, "reward_std": 0.7583430409431458, "rewards/rollout_reward_func/mean": 0.16271142661571503, "rewards/rollout_reward_func/std": 0.9618786573410034, "sampling/importance_sampling_ratio/max": 0.5427871346473694, "sampling/importance_sampling_ratio/mean": 0.146867573261261, "sampling/importance_sampling_ratio/min": 3.266299719844462e-13, "sampling/sampling_logp_difference/max": 3.6980135440826416, "sampling/sampling_logp_difference/mean": 0.8009088039398193, "step": 799, "step_time": 12.480544379999628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.77292001247406, "epoch": 0.008, "grad_norm": 0.05120360478758812, "kl": 0.7300167605280876, "learning_rate": 9.999729868925855e-06, "loss": -0.0077, "step": 800, "step_time": 6.480775096984871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 483.9375, "completions/mean_terminated_length": 499.0322570800781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.918710082769394, "epoch": 0.00801, "frac_reward_zero_std": 0.25, "grad_norm": 0.20972947776317596, "kl": 0.6107012033462524, "learning_rate": 9.99972916132217e-06, "loss": 0.0107, "num_tokens": 18330323.0, "reward": 0.5642051100730896, "reward_std": 0.2096695601940155, "rewards/rollout_reward_func/mean": 0.5642051100730896, "rewards/rollout_reward_func/std": 1.020442247390747, "sampling/importance_sampling_ratio/max": 0.5479604005813599, "sampling/importance_sampling_ratio/mean": 0.3039044737815857, "sampling/importance_sampling_ratio/min": 8.95082185453644e-12, "sampling/sampling_logp_difference/max": 4.353412628173828, "sampling/sampling_logp_difference/mean": 0.7186938524246216, "step": 801, "step_time": 11.221943918011675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.9160813689231873, "epoch": 0.00802, "grad_norm": 0.06172682344913483, "kl": 0.5800062045454979, "learning_rate": 9.999728452792951e-06, "loss": 0.0103, "step": 802, "step_time": 6.189493130012124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 682.40625, "completions/mean_terminated_length": 703.9031982421875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.280131816864014, "epoch": 0.00803, "frac_reward_zero_std": 0.0, "grad_norm": 0.3166411519050598, "kl": 0.6836027130484581, "learning_rate": 9.999727743338202e-06, "loss": -0.0055, "num_tokens": 18392384.0, "reward": 0.5326682329177856, "reward_std": 0.18366044759750366, "rewards/rollout_reward_func/mean": 0.5326682329177856, "rewards/rollout_reward_func/std": 0.9650113582611084, "sampling/importance_sampling_ratio/max": 0.551368236541748, "sampling/importance_sampling_ratio/mean": 0.21796661615371704, "sampling/importance_sampling_ratio/min": 6.234621707790211e-08, "sampling/sampling_logp_difference/max": 4.369815826416016, "sampling/sampling_logp_difference/mean": 0.6863468885421753, "step": 803, "step_time": 10.748296223995567 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.028125000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04062500037252903, "entropy": 4.263590157032013, "epoch": 0.00804, "grad_norm": 0.055580511689186096, "kl": 0.6482272893190384, "learning_rate": 9.99972703295792e-06, "loss": -0.0066, "step": 804, "step_time": 5.936967918998562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 472.90625, "completions/mean_terminated_length": 472.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.08648219704628, "epoch": 0.00805, "frac_reward_zero_std": 0.5, "grad_norm": 0.01240392867475748, "kl": 0.6880391761660576, "learning_rate": 9.999726321652106e-06, "loss": 0.0012, "num_tokens": 18446219.0, "reward": 0.8881956934928894, "reward_std": 0.2903081774711609, "rewards/rollout_reward_func/mean": 0.8881956934928894, "rewards/rollout_reward_func/std": 0.7042355537414551, "sampling/importance_sampling_ratio/max": 0.5543598532676697, "sampling/importance_sampling_ratio/mean": 0.3192344009876251, "sampling/importance_sampling_ratio/min": 6.196083278614886e-15, "sampling/sampling_logp_difference/max": 5.079160213470459, "sampling/sampling_logp_difference/mean": 0.8193686008453369, "step": 805, "step_time": 11.21417982501589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.089379727840424, "epoch": 0.00806, "grad_norm": 0.012201515026390553, "kl": 0.6866480782628059, "learning_rate": 9.999725609420761e-06, "loss": 0.0012, "step": 806, "step_time": 6.194077339991054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 427.59375, "completions/mean_terminated_length": 427.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0983657240867615, "epoch": 0.00807, "frac_reward_zero_std": 0.5, "grad_norm": 0.019046777859330177, "kl": 0.8260966241359711, "learning_rate": 9.999724896263882e-06, "loss": -0.0022, "num_tokens": 18500352.0, "reward": -0.14243794977664948, "reward_std": 0.4519989490509033, "rewards/rollout_reward_func/mean": -0.14243794977664948, "rewards/rollout_reward_func/std": 1.0861990451812744, "sampling/importance_sampling_ratio/max": 0.5395880937576294, "sampling/importance_sampling_ratio/mean": 0.3364320993423462, "sampling/importance_sampling_ratio/min": 0.010114181786775589, "sampling/sampling_logp_difference/max": 2.3754703998565674, "sampling/sampling_logp_difference/mean": 0.38964414596557617, "step": 807, "step_time": 10.63010624301387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.102256566286087, "epoch": 0.00808, "grad_norm": 0.028198441490530968, "kl": 0.828279659152031, "learning_rate": 9.999724182181473e-06, "loss": -0.0022, "step": 808, "step_time": 5.707138252000732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1302.0, "completions/max_terminated_length": 1302.0, "completions/mean_length": 637.5625, "completions/mean_terminated_length": 637.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.948206126689911, "epoch": 0.00809, "frac_reward_zero_std": 0.25, "grad_norm": 0.015539898537099361, "kl": 0.8317869901657104, "learning_rate": 9.999723467173534e-06, "loss": -0.007, "num_tokens": 18559802.0, "reward": 0.7413138151168823, "reward_std": 0.44119638204574585, "rewards/rollout_reward_func/mean": 0.7413138151168823, "rewards/rollout_reward_func/std": 0.8397331237792969, "sampling/importance_sampling_ratio/max": 0.5472633242607117, "sampling/importance_sampling_ratio/mean": 0.24262401461601257, "sampling/importance_sampling_ratio/min": 8.158871222672861e-15, "sampling/sampling_logp_difference/max": 3.604795455932617, "sampling/sampling_logp_difference/mean": 0.656445324420929, "step": 809, "step_time": 10.581130970000231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 3.9274742603302, "epoch": 0.0081, "grad_norm": 0.01582302525639534, "kl": 0.8330071680247784, "learning_rate": 9.999722751240062e-06, "loss": -0.007, "step": 810, "step_time": 5.92819135600439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 673.46875, "completions/mean_terminated_length": 694.6773681640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.10092356801033, "epoch": 0.00811, "frac_reward_zero_std": 0.0, "grad_norm": 0.38058245182037354, "kl": 0.6659963503479958, "learning_rate": 9.999722034381061e-06, "loss": -0.0081, "num_tokens": 18622105.0, "reward": 0.8563543558120728, "reward_std": 0.19776402413845062, "rewards/rollout_reward_func/mean": 0.8563543558120728, "rewards/rollout_reward_func/std": 0.5221542119979858, "sampling/importance_sampling_ratio/max": 0.5535373091697693, "sampling/importance_sampling_ratio/mean": 0.25591516494750977, "sampling/importance_sampling_ratio/min": 1.2705085872255495e-09, "sampling/sampling_logp_difference/max": 2.6318984031677246, "sampling/sampling_logp_difference/mean": 0.6127711534500122, "step": 811, "step_time": 12.01189675200294 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.084523469209671, "epoch": 0.00812, "grad_norm": 0.05818013846874237, "kl": 0.6389157772064209, "learning_rate": 9.999721316596529e-06, "loss": -0.009, "step": 812, "step_time": 6.529133509007806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 850.59375, "completions/mean_terminated_length": 850.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.314282387495041, "epoch": 0.00813, "frac_reward_zero_std": 0.0, "grad_norm": 0.12119383364915848, "kl": 0.7351368516683578, "learning_rate": 9.999720597886464e-06, "loss": -0.0014, "num_tokens": 18689450.0, "reward": 0.17382773756980896, "reward_std": 0.746678352355957, "rewards/rollout_reward_func/mean": 0.17382773756980896, "rewards/rollout_reward_func/std": 1.1138721704483032, "sampling/importance_sampling_ratio/max": 0.5326306819915771, "sampling/importance_sampling_ratio/mean": 0.12610748410224915, "sampling/importance_sampling_ratio/min": 2.4248323349240763e-09, "sampling/sampling_logp_difference/max": 12.070755958557129, "sampling/sampling_logp_difference/mean": 0.7304210066795349, "step": 813, "step_time": 12.054200098995352 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016666667070239782, "entropy": 4.3099982142448425, "epoch": 0.00814, "grad_norm": 0.0623190738260746, "kl": 0.8176212050020695, "learning_rate": 9.99971987825087e-06, "loss": -0.0013, "step": 814, "step_time": 6.334518907009624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 785.03125, "completions/mean_terminated_length": 785.03125, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 3.8485607504844666, "epoch": 0.00815, "frac_reward_zero_std": 0.0, "grad_norm": 0.14986680448055267, "kl": 1.5317824482917786, "learning_rate": 9.999719157689747e-06, "loss": 0.0048, "num_tokens": 18756781.0, "reward": 1.0456069707870483, "reward_std": 0.415025532245636, "rewards/rollout_reward_func/mean": 1.0456069707870483, "rewards/rollout_reward_func/std": 0.5681360960006714, "sampling/importance_sampling_ratio/max": 0.3002089262008667, "sampling/importance_sampling_ratio/mean": 0.20178340375423431, "sampling/importance_sampling_ratio/min": 2.185690765412142e-17, "sampling/sampling_logp_difference/max": 3.7580695152282715, "sampling/sampling_logp_difference/mean": 0.6519012451171875, "step": 815, "step_time": 11.128683123017254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8666080832481384, "epoch": 0.00816, "grad_norm": 0.11115767061710358, "kl": 1.2253865897655487, "learning_rate": 9.999718436203094e-06, "loss": 0.0041, "step": 816, "step_time": 6.291888500003552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 524.28125, "completions/mean_terminated_length": 524.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.350756824016571, "epoch": 0.00817, "frac_reward_zero_std": 0.0, "grad_norm": 0.1918005347251892, "kl": 0.8008115291595459, "learning_rate": 9.999717713790909e-06, "loss": -0.011, "num_tokens": 18814390.0, "reward": 1.0850812196731567, "reward_std": 0.2513916492462158, "rewards/rollout_reward_func/mean": 1.0850812196731567, "rewards/rollout_reward_func/std": 0.44078752398490906, "sampling/importance_sampling_ratio/max": 0.3518841862678528, "sampling/importance_sampling_ratio/mean": 0.25818508863449097, "sampling/importance_sampling_ratio/min": 3.818834828982036e-14, "sampling/sampling_logp_difference/max": 3.9743919372558594, "sampling/sampling_logp_difference/mean": 0.5998451709747314, "step": 817, "step_time": 9.225292905997776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.3647813498973846, "epoch": 0.00818, "grad_norm": 0.05327212065458298, "kl": 0.8476330265402794, "learning_rate": 9.999716990453195e-06, "loss": -0.0115, "step": 818, "step_time": 4.566569242000696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 861.5625, "completions/mean_terminated_length": 861.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.866812705993652, "epoch": 0.00819, "frac_reward_zero_std": 0.0, "grad_norm": 0.025169329717755318, "kl": 0.6483453139662743, "learning_rate": 9.999716266189952e-06, "loss": -0.0057, "num_tokens": 18883475.0, "reward": 0.29020094871520996, "reward_std": 0.6882854700088501, "rewards/rollout_reward_func/mean": 0.29020094871520996, "rewards/rollout_reward_func/std": 1.0563002824783325, "sampling/importance_sampling_ratio/max": 0.5420113205909729, "sampling/importance_sampling_ratio/mean": 0.1327405571937561, "sampling/importance_sampling_ratio/min": 2.1165305914432774e-16, "sampling/sampling_logp_difference/max": 5.1460347175598145, "sampling/sampling_logp_difference/mean": 0.7710435390472412, "step": 819, "step_time": 12.628916761990695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.895016729831696, "epoch": 0.0082, "grad_norm": 0.023437628522515297, "kl": 0.6461432576179504, "learning_rate": 9.99971554100118e-06, "loss": -0.0057, "step": 820, "step_time": 6.647956004009757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 670.65625, "completions/mean_terminated_length": 670.6128540039062, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 4.089717596769333, "epoch": 0.00821, "frac_reward_zero_std": 0.25, "grad_norm": 0.02936265803873539, "kl": 0.8852962478995323, "learning_rate": 9.99971481488688e-06, "loss": -0.0086, "num_tokens": 18945006.0, "reward": 0.3017217516899109, "reward_std": 0.5342571139335632, "rewards/rollout_reward_func/mean": 0.3017217516899109, "rewards/rollout_reward_func/std": 1.076377511024475, "sampling/importance_sampling_ratio/max": 0.2931906282901764, "sampling/importance_sampling_ratio/mean": 0.17600592970848083, "sampling/importance_sampling_ratio/min": 7.828043023364907e-15, "sampling/sampling_logp_difference/max": 3.881350040435791, "sampling/sampling_logp_difference/mean": 0.7402852773666382, "step": 821, "step_time": 10.845636820995423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.110074371099472, "epoch": 0.00822, "grad_norm": 0.030061371624469757, "kl": 0.8791117668151855, "learning_rate": 9.99971408784705e-06, "loss": -0.0087, "step": 822, "step_time": 6.520065232005436 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "completions/clipped_ratio": 0.0625, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 698.59375, "completions/mean_terminated_length": 683.7000122070312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.254970461130142, "epoch": 0.00823, "frac_reward_zero_std": 0.0, "grad_norm": 0.3063943386077881, "kl": 0.5793765150010586, "learning_rate": 9.99971335988169e-06, "loss": -0.0111, "num_tokens": 19007579.0, "reward": 0.8035763502120972, "reward_std": 0.7260856628417969, "rewards/rollout_reward_func/mean": 0.8035763502120972, "rewards/rollout_reward_func/std": 0.8232504725456238, "sampling/importance_sampling_ratio/max": 0.5391681790351868, "sampling/importance_sampling_ratio/mean": 0.18016871809959412, "sampling/importance_sampling_ratio/min": 1.4807727266585857e-09, "sampling/sampling_logp_difference/max": 3.931210994720459, "sampling/sampling_logp_difference/mean": 0.670873761177063, "step": 823, "step_time": 11.354510195997136 }, { "clip_ratio/high_max": 0.06770833395421505, "clip_ratio/high_mean": 0.033854166977107525, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06510416697710752, "entropy": 4.202061951160431, "epoch": 0.00824, "grad_norm": 0.02457244321703911, "kl": 0.5779262892901897, "learning_rate": 9.999712630990802e-06, "loss": -0.0118, "step": 824, "step_time": 6.486740731015743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3709979355335236, "epoch": 0.00825, "frac_reward_zero_std": 0.75, "grad_norm": 0.011924726888537407, "kl": 0.7678099200129509, "learning_rate": 9.999711901174385e-06, "loss": -0.0032, "num_tokens": 19052607.0, "reward": 0.45529571175575256, "reward_std": 0.24080438911914825, "rewards/rollout_reward_func/mean": 0.45529571175575256, "rewards/rollout_reward_func/std": 1.0095120668411255, "sampling/importance_sampling_ratio/max": 0.540693998336792, "sampling/importance_sampling_ratio/mean": 0.4268019497394562, "sampling/importance_sampling_ratio/min": 0.002526009688153863, "sampling/sampling_logp_difference/max": 1.6034162044525146, "sampling/sampling_logp_difference/mean": 0.4245834946632385, "step": 825, "step_time": 10.446253926005738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.329563319683075, "epoch": 0.00826, "grad_norm": 0.010663014836609364, "kl": 0.7780786678195, "learning_rate": 9.999711170432441e-06, "loss": -0.0032, "step": 826, "step_time": 6.005407929013018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1334.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 584.09375, "completions/mean_terminated_length": 559.9031982421875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.209156274795532, "epoch": 0.00827, "frac_reward_zero_std": 0.0, "grad_norm": 0.04240122437477112, "kl": 0.6656608879566193, "learning_rate": 9.999710438764968e-06, "loss": -0.0112, "num_tokens": 19112496.0, "reward": 0.7072945237159729, "reward_std": 0.4832136034965515, "rewards/rollout_reward_func/mean": 0.7072945237159729, "rewards/rollout_reward_func/std": 0.7806521058082581, "sampling/importance_sampling_ratio/max": 0.54676353931427, "sampling/importance_sampling_ratio/mean": 0.22281832993030548, "sampling/importance_sampling_ratio/min": 1.2981766540649115e-13, "sampling/sampling_logp_difference/max": 10.613835334777832, "sampling/sampling_logp_difference/mean": 0.8419315814971924, "step": 827, "step_time": 11.248416834998352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.204881817102432, "epoch": 0.00828, "grad_norm": 0.05000458285212517, "kl": 0.6778422258794308, "learning_rate": 9.999709706171968e-06, "loss": -0.0112, "step": 828, "step_time": 6.504244717005349 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 769.9375, "completions/mean_terminated_length": 769.9375, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "entropy": 4.313331961631775, "epoch": 0.00829, "frac_reward_zero_std": 0.25, "grad_norm": 0.032338861376047134, "kl": 0.8252175822854042, "learning_rate": 9.999708972653441e-06, "loss": -0.0054, "num_tokens": 19178498.0, "reward": 0.7183601260185242, "reward_std": 0.4540090560913086, "rewards/rollout_reward_func/mean": 0.7183601260185242, "rewards/rollout_reward_func/std": 0.8367286324501038, "sampling/importance_sampling_ratio/max": 0.29164379835128784, "sampling/importance_sampling_ratio/mean": 0.15441593527793884, "sampling/importance_sampling_ratio/min": 9.099578337657402e-17, "sampling/sampling_logp_difference/max": 11.971647262573242, "sampling/sampling_logp_difference/mean": 0.842413067817688, "step": 829, "step_time": 11.331101207979373 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 4.29767832159996, "epoch": 0.0083, "grad_norm": 0.029434533789753914, "kl": 0.8165934011340141, "learning_rate": 9.999708238209385e-06, "loss": -0.0054, "step": 830, "step_time": 6.645663916002377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 692.09375, "completions/mean_terminated_length": 692.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.1215276420116425, "epoch": 0.00831, "frac_reward_zero_std": 0.25, "grad_norm": 0.04750390350818634, "kl": 0.6566370353102684, "learning_rate": 9.999707502839802e-06, "loss": -0.0024, "num_tokens": 19241299.0, "reward": 1.096445083618164, "reward_std": 0.2381487786769867, "rewards/rollout_reward_func/mean": 1.096445083618164, "rewards/rollout_reward_func/std": 0.43249890208244324, "sampling/importance_sampling_ratio/max": 0.5486360192298889, "sampling/importance_sampling_ratio/mean": 0.22208866477012634, "sampling/importance_sampling_ratio/min": 1.7119508935459216e-12, "sampling/sampling_logp_difference/max": 3.5773115158081055, "sampling/sampling_logp_difference/mean": 0.73301100730896, "step": 831, "step_time": 11.496181114991487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 4.0841323137283325, "epoch": 0.00832, "grad_norm": 0.04603351280093193, "kl": 0.6569323092699051, "learning_rate": 9.999706766544692e-06, "loss": -0.0025, "step": 832, "step_time": 6.328572171005362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 373.03125, "completions/mean_terminated_length": 373.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.742939203977585, "epoch": 0.00833, "frac_reward_zero_std": 0.5, "grad_norm": 0.015063686296343803, "kl": 0.7363438419997692, "learning_rate": 9.999706029324055e-06, "loss": -0.004, "num_tokens": 19291752.0, "reward": 0.5680320262908936, "reward_std": 0.03072732500731945, "rewards/rollout_reward_func/mean": 0.5680320262908936, "rewards/rollout_reward_func/std": 0.9167143106460571, "sampling/importance_sampling_ratio/max": 0.5433221459388733, "sampling/importance_sampling_ratio/mean": 0.331358939409256, "sampling/importance_sampling_ratio/min": 5.4619690714741154e-11, "sampling/sampling_logp_difference/max": 4.827879428863525, "sampling/sampling_logp_difference/mean": 0.6257786154747009, "step": 833, "step_time": 10.561462885991205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.70239394903183, "epoch": 0.00834, "grad_norm": 0.015027291141450405, "kl": 0.7405409961938858, "learning_rate": 9.999705291177891e-06, "loss": -0.004, "step": 834, "step_time": 6.454902714998752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1403.0, "completions/max_terminated_length": 1403.0, "completions/mean_length": 648.96875, "completions/mean_terminated_length": 648.96875, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "entropy": 3.945218175649643, "epoch": 0.00835, "frac_reward_zero_std": 0.0, "grad_norm": 0.07413113862276077, "kl": 0.7157803699374199, "learning_rate": 9.999704552106202e-06, "loss": -0.0104, "num_tokens": 19355249.0, "reward": 0.7293528914451599, "reward_std": 0.565803050994873, "rewards/rollout_reward_func/mean": 0.7293528914451599, "rewards/rollout_reward_func/std": 0.8877592086791992, "sampling/importance_sampling_ratio/max": 0.30194923281669617, "sampling/importance_sampling_ratio/mean": 0.18878568708896637, "sampling/importance_sampling_ratio/min": 6.645779759839598e-11, "sampling/sampling_logp_difference/max": 3.5353808403015137, "sampling/sampling_logp_difference/mean": 0.6247765421867371, "step": 835, "step_time": 11.051353779010242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.928915858268738, "epoch": 0.00836, "grad_norm": 0.07086915522813797, "kl": 0.7151546850800514, "learning_rate": 9.999703812108984e-06, "loss": -0.0103, "step": 836, "step_time": 6.627293383986398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 839.53125, "completions/mean_terminated_length": 839.53125, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "entropy": 4.01313054561615, "epoch": 0.00837, "frac_reward_zero_std": 0.0, "grad_norm": 0.01351988222450018, "kl": 0.7684448659420013, "learning_rate": 9.999703071186241e-06, "loss": -0.0033, "num_tokens": 19423910.0, "reward": 0.6974587440490723, "reward_std": 0.4703824520111084, "rewards/rollout_reward_func/mean": 0.6974587440490723, "rewards/rollout_reward_func/std": 0.8867810368537903, "sampling/importance_sampling_ratio/max": 0.3075011372566223, "sampling/importance_sampling_ratio/mean": 0.1583968997001648, "sampling/importance_sampling_ratio/min": 7.193116545109457e-13, "sampling/sampling_logp_difference/max": 3.993687868118286, "sampling/sampling_logp_difference/mean": 0.6572608947753906, "step": 837, "step_time": 11.514446193010372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.021058738231659, "epoch": 0.00838, "grad_norm": 0.014377658255398273, "kl": 0.7668037340044975, "learning_rate": 9.999702329337973e-06, "loss": -0.0034, "step": 838, "step_time": 6.255534083000384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 499.46875, "completions/mean_terminated_length": 501.0967712402344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.139230340719223, "epoch": 0.00839, "frac_reward_zero_std": 0.5, "grad_norm": 0.05872396007180214, "kl": 0.7501969635486603, "learning_rate": 9.999701586564176e-06, "loss": 0.0051, "num_tokens": 19477462.0, "reward": 0.3664734959602356, "reward_std": 0.28802087903022766, "rewards/rollout_reward_func/mean": 0.3664734959602356, "rewards/rollout_reward_func/std": 0.9825539588928223, "sampling/importance_sampling_ratio/max": 0.5432742834091187, "sampling/importance_sampling_ratio/mean": 0.2998713254928589, "sampling/importance_sampling_ratio/min": 4.3910293864781336e-17, "sampling/sampling_logp_difference/max": 4.696831226348877, "sampling/sampling_logp_difference/mean": 0.7931091785430908, "step": 839, "step_time": 11.051519904001907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.158269613981247, "epoch": 0.0084, "grad_norm": 0.05988883599638939, "kl": 0.7300044745206833, "learning_rate": 9.999700842864858e-06, "loss": 0.0049, "step": 840, "step_time": 6.641928976001509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 263.125, "completions/mean_terminated_length": 263.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4374454617500305, "epoch": 0.00841, "frac_reward_zero_std": 0.75, "grad_norm": 0.024344848468899727, "kl": 0.7753745689988136, "learning_rate": 9.999700098240011e-06, "loss": 0.0057, "num_tokens": 19522619.0, "reward": 0.5093036890029907, "reward_std": 0.18959423899650574, "rewards/rollout_reward_func/mean": 0.5093036890029907, "rewards/rollout_reward_func/std": 1.0051145553588867, "sampling/importance_sampling_ratio/max": 0.5501900315284729, "sampling/importance_sampling_ratio/mean": 0.4257567822933197, "sampling/importance_sampling_ratio/min": 1.4295915207185317e-07, "sampling/sampling_logp_difference/max": 3.9082067012786865, "sampling/sampling_logp_difference/mean": 0.4897781014442444, "step": 841, "step_time": 10.775195672002155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.451709270477295, "epoch": 0.00842, "grad_norm": 0.021687520667910576, "kl": 0.774255134165287, "learning_rate": 9.999699352689638e-06, "loss": 0.0057, "step": 842, "step_time": 5.621037134988001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 653.25, "completions/mean_terminated_length": 653.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8222889602184296, "epoch": 0.00843, "frac_reward_zero_std": 0.25, "grad_norm": 0.00810930971056223, "kl": 0.7820551469922066, "learning_rate": 9.999698606213743e-06, "loss": -0.0096, "num_tokens": 19583337.0, "reward": 1.0471220016479492, "reward_std": 0.3098241686820984, "rewards/rollout_reward_func/mean": 1.0471220016479492, "rewards/rollout_reward_func/std": 0.43402427434921265, "sampling/importance_sampling_ratio/max": 0.5447193384170532, "sampling/importance_sampling_ratio/mean": 0.2555849552154541, "sampling/importance_sampling_ratio/min": 0.00041060056537389755, "sampling/sampling_logp_difference/max": 3.7355475425720215, "sampling/sampling_logp_difference/mean": 0.5475468039512634, "step": 843, "step_time": 12.063694996002596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8331544399261475, "epoch": 0.00844, "grad_norm": 0.008417705073952675, "kl": 0.7802422717213631, "learning_rate": 9.999697858812321e-06, "loss": -0.0096, "step": 844, "step_time": 6.553405413003929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 143.71875, "completions/mean_terminated_length": 143.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.823262542486191, "epoch": 0.00845, "frac_reward_zero_std": 0.75, "grad_norm": 0.02664402313530445, "kl": 0.7552298381924629, "learning_rate": 9.999697110485375e-06, "loss": -0.0041, "num_tokens": 19626633.0, "reward": 1.0651136636734009, "reward_std": 0.17885589599609375, "rewards/rollout_reward_func/mean": 1.0651136636734009, "rewards/rollout_reward_func/std": 0.3460140526294708, "sampling/importance_sampling_ratio/max": 0.5503713488578796, "sampling/importance_sampling_ratio/mean": 0.4692229628562927, "sampling/importance_sampling_ratio/min": 0.021193552762269974, "sampling/sampling_logp_difference/max": 1.6409833431243896, "sampling/sampling_logp_difference/mean": 0.335918664932251, "step": 845, "step_time": 8.844328120998398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8206825852394104, "epoch": 0.00846, "grad_norm": 0.025176649913191795, "kl": 0.7554197758436203, "learning_rate": 9.999696361232904e-06, "loss": -0.004, "step": 846, "step_time": 4.728950059987255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 671.0, "completions/mean_terminated_length": 671.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.909887135028839, "epoch": 0.00847, "frac_reward_zero_std": 0.25, "grad_norm": 0.02492830529808998, "kl": 0.6291069984436035, "learning_rate": 9.999695611054908e-06, "loss": -0.0082, "num_tokens": 19686985.0, "reward": 1.1076912879943848, "reward_std": 0.1954641044139862, "rewards/rollout_reward_func/mean": 1.1076912879943848, "rewards/rollout_reward_func/std": 0.3688003718852997, "sampling/importance_sampling_ratio/max": 0.5411182641983032, "sampling/importance_sampling_ratio/mean": 0.2199677675962448, "sampling/importance_sampling_ratio/min": 4.750666179886842e-11, "sampling/sampling_logp_difference/max": 4.726693153381348, "sampling/sampling_logp_difference/mean": 0.5694982409477234, "step": 847, "step_time": 12.084855679997418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9281192123889923, "epoch": 0.00848, "grad_norm": 0.024847129359841347, "kl": 0.6285395212471485, "learning_rate": 9.99969485995139e-06, "loss": -0.0082, "step": 848, "step_time": 6.19942693199846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1273.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 458.65625, "completions/mean_terminated_length": 453.1612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6568979024887085, "epoch": 0.00849, "frac_reward_zero_std": 0.25, "grad_norm": 0.17072826623916626, "kl": 0.7133447490632534, "learning_rate": 9.999694107922345e-06, "loss": -0.0155, "num_tokens": 19743455.0, "reward": 0.8882161378860474, "reward_std": 0.3049094080924988, "rewards/rollout_reward_func/mean": 0.8882161378860474, "rewards/rollout_reward_func/std": 0.7252833247184753, "sampling/importance_sampling_ratio/max": 0.5453860759735107, "sampling/importance_sampling_ratio/mean": 0.28758934140205383, "sampling/importance_sampling_ratio/min": 5.543651315775833e-09, "sampling/sampling_logp_difference/max": 4.153485298156738, "sampling/sampling_logp_difference/mean": 0.5770817995071411, "step": 849, "step_time": 10.604346403990348 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000465661287, "entropy": 3.6389027535915375, "epoch": 0.0085, "grad_norm": 0.05109352245926857, "kl": 0.721594363451004, "learning_rate": 9.999693354967777e-06, "loss": -0.0158, "step": 850, "step_time": 5.762960605999979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 660.8125, "completions/mean_terminated_length": 660.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.5642309486865997, "epoch": 0.00851, "frac_reward_zero_std": 0.25, "grad_norm": 0.028919804841279984, "kl": 0.663690596818924, "learning_rate": 9.999692601087686e-06, "loss": -0.0033, "num_tokens": 19804800.0, "reward": 0.7927628755569458, "reward_std": 0.4713031053543091, "rewards/rollout_reward_func/mean": 0.7927628755569458, "rewards/rollout_reward_func/std": 0.833512008190155, "sampling/importance_sampling_ratio/max": 0.5473960041999817, "sampling/importance_sampling_ratio/mean": 0.25932586193084717, "sampling/importance_sampling_ratio/min": 0.0043584806844592094, "sampling/sampling_logp_difference/max": 4.0749287605285645, "sampling/sampling_logp_difference/mean": 0.4960511028766632, "step": 851, "step_time": 12.137012410988973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5471021831035614, "epoch": 0.00852, "grad_norm": 0.029339682310819626, "kl": 0.6668665036559105, "learning_rate": 9.999691846282073e-06, "loss": -0.0033, "step": 852, "step_time": 6.390542532994004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 693.0625, "completions/mean_terminated_length": 693.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.396577537059784, "epoch": 0.00853, "frac_reward_zero_std": 0.0, "grad_norm": 0.4408170282840729, "kl": 0.7025334015488625, "learning_rate": 9.999691090550936e-06, "loss": 0.0028, "num_tokens": 19868304.0, "reward": 0.5088589787483215, "reward_std": 0.5317432880401611, "rewards/rollout_reward_func/mean": 0.5088589787483215, "rewards/rollout_reward_func/std": 1.0469651222229004, "sampling/importance_sampling_ratio/max": 0.8300964832305908, "sampling/importance_sampling_ratio/mean": 0.2342674285173416, "sampling/importance_sampling_ratio/min": 0.005632955580949783, "sampling/sampling_logp_difference/max": 3.822679042816162, "sampling/sampling_logp_difference/mean": 0.45736968517303467, "step": 853, "step_time": 11.269726181999431 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 3.4018488824367523, "epoch": 0.00854, "grad_norm": 0.10627055168151855, "kl": 0.7002018019556999, "learning_rate": 9.999690333894273e-06, "loss": 0.0018, "step": 854, "step_time": 6.058472705990425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 435.71875, "completions/mean_terminated_length": 435.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.534135788679123, "epoch": 0.00855, "frac_reward_zero_std": 0.25, "grad_norm": 0.1550411731004715, "kl": 0.8179811909794807, "learning_rate": 9.99968957631209e-06, "loss": -0.0037, "num_tokens": 19923295.0, "reward": 0.9093496799468994, "reward_std": 0.2842506170272827, "rewards/rollout_reward_func/mean": 0.9093496799468994, "rewards/rollout_reward_func/std": 0.6999328136444092, "sampling/importance_sampling_ratio/max": 0.5428343415260315, "sampling/importance_sampling_ratio/mean": 0.3058928847312927, "sampling/importance_sampling_ratio/min": 0.0018980201566591859, "sampling/sampling_logp_difference/max": 2.5330734252929688, "sampling/sampling_logp_difference/mean": 0.47686874866485596, "step": 855, "step_time": 10.367241682994063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.5360266864299774, "epoch": 0.00856, "grad_norm": 0.058991312980651855, "kl": 0.8168417438864708, "learning_rate": 9.999688817804385e-06, "loss": -0.0041, "step": 856, "step_time": 5.824724101992615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1615.0, "completions/max_terminated_length": 1615.0, "completions/mean_length": 780.5, "completions/mean_terminated_length": 780.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.185929358005524, "epoch": 0.00857, "frac_reward_zero_std": 0.25, "grad_norm": 0.13558445870876312, "kl": 0.666168637573719, "learning_rate": 9.999688058371155e-06, "loss": -0.0029, "num_tokens": 19988700.0, "reward": 0.37940630316734314, "reward_std": 0.6749091744422913, "rewards/rollout_reward_func/mean": 0.37940630316734314, "rewards/rollout_reward_func/std": 0.8707819581031799, "sampling/importance_sampling_ratio/max": 0.5504950284957886, "sampling/importance_sampling_ratio/mean": 0.21254277229309082, "sampling/importance_sampling_ratio/min": 1.005129757203349e-09, "sampling/sampling_logp_difference/max": 4.259945392608643, "sampling/sampling_logp_difference/mean": 0.6030187010765076, "step": 857, "step_time": 13.026402625997434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125000046566129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125000046566129, "entropy": 4.217237442731857, "epoch": 0.00858, "grad_norm": 0.052960190922021866, "kl": 0.6566792316734791, "learning_rate": 9.999687298012404e-06, "loss": -0.003, "step": 858, "step_time": 6.7095125959895086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1656.0, "completions/max_terminated_length": 1656.0, "completions/mean_length": 943.28125, "completions/mean_terminated_length": 943.28125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 4.2970790565013885, "epoch": 0.00859, "frac_reward_zero_std": 0.0, "grad_norm": 0.030176032334566116, "kl": 0.7852507904171944, "learning_rate": 9.999686536728131e-06, "loss": -0.0003, "num_tokens": 20060669.0, "reward": 0.5822441577911377, "reward_std": 0.947333037853241, "rewards/rollout_reward_func/mean": 0.5822441577911377, "rewards/rollout_reward_func/std": 0.9904115200042725, "sampling/importance_sampling_ratio/max": 0.2979072332382202, "sampling/importance_sampling_ratio/mean": 0.12199525535106659, "sampling/importance_sampling_ratio/min": 2.3785806657627973e-09, "sampling/sampling_logp_difference/max": 10.602238655090332, "sampling/sampling_logp_difference/mean": 0.7019683718681335, "step": 859, "step_time": 12.593296545994235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.302322149276733, "epoch": 0.0086, "grad_norm": 0.030811958014965057, "kl": 0.7848627045750618, "learning_rate": 9.999685774518335e-06, "loss": -0.0003, "step": 860, "step_time": 6.739298589011014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 652.28125, "completions/mean_terminated_length": 652.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.245221316814423, "epoch": 0.00861, "frac_reward_zero_std": 0.0, "grad_norm": 0.013978094793856144, "kl": 0.675038680434227, "learning_rate": 9.999685011383017e-06, "loss": -0.0124, "num_tokens": 20122041.0, "reward": 1.061816930770874, "reward_std": 0.24295824766159058, "rewards/rollout_reward_func/mean": 1.061816930770874, "rewards/rollout_reward_func/std": 0.47021639347076416, "sampling/importance_sampling_ratio/max": 0.5467327237129211, "sampling/importance_sampling_ratio/mean": 0.2084590494632721, "sampling/importance_sampling_ratio/min": 7.842947753917973e-14, "sampling/sampling_logp_difference/max": 10.79220962524414, "sampling/sampling_logp_difference/mean": 0.7944018840789795, "step": 861, "step_time": 11.160032225008763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 4.234974503517151, "epoch": 0.00862, "grad_norm": 0.01448937226086855, "kl": 0.6785344704985619, "learning_rate": 9.999684247322179e-06, "loss": -0.0124, "step": 862, "step_time": 6.123488608005573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1494.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 991.5, "completions/mean_terminated_length": 991.5, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 4.430701524019241, "epoch": 0.00863, "frac_reward_zero_std": 0.0, "grad_norm": 0.0268523171544075, "kl": 0.5742590501904488, "learning_rate": 9.999683482335817e-06, "loss": 0.0142, "num_tokens": 20195499.0, "reward": 1.1182639598846436, "reward_std": 0.24553655087947845, "rewards/rollout_reward_func/mean": 1.1182639598846436, "rewards/rollout_reward_func/std": 0.4615548849105835, "sampling/importance_sampling_ratio/max": 0.2991795539855957, "sampling/importance_sampling_ratio/mean": 0.11642762273550034, "sampling/importance_sampling_ratio/min": 6.515939251352124e-17, "sampling/sampling_logp_difference/max": 12.306806564331055, "sampling/sampling_logp_difference/mean": 0.7670648097991943, "step": 863, "step_time": 12.348293944000034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.410548061132431, "epoch": 0.00864, "grad_norm": 0.02663460187613964, "kl": 0.5774456225335598, "learning_rate": 9.999682716423937e-06, "loss": 0.0142, "step": 864, "step_time": 6.862948571993911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 158.21875, "completions/mean_terminated_length": 158.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1196250915527344, "epoch": 0.00865, "frac_reward_zero_std": 0.75, "grad_norm": 0.04065261036157608, "kl": 1.0133925825357437, "learning_rate": 9.999681949586533e-06, "loss": -0.0017, "num_tokens": 20236923.0, "reward": 0.5174115896224976, "reward_std": 0.19864946603775024, "rewards/rollout_reward_func/mean": 0.5174115896224976, "rewards/rollout_reward_func/std": 1.001173496246338, "sampling/importance_sampling_ratio/max": 0.5452809929847717, "sampling/importance_sampling_ratio/mean": 0.4439510703086853, "sampling/importance_sampling_ratio/min": 0.035850707441568375, "sampling/sampling_logp_difference/max": 2.5125033855438232, "sampling/sampling_logp_difference/mean": 0.3714459538459778, "step": 865, "step_time": 7.79462520800007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.111884504556656, "epoch": 0.00866, "grad_norm": 0.022242290899157524, "kl": 1.0321418195962906, "learning_rate": 9.999681181823611e-06, "loss": -0.0017, "step": 866, "step_time": 4.282008503003453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 613.78125, "completions/mean_terminated_length": 613.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.490107238292694, "epoch": 0.00867, "frac_reward_zero_std": 0.25, "grad_norm": 0.0342748761177063, "kl": 0.7038613483309746, "learning_rate": 9.999680413135167e-06, "loss": 0.0069, "num_tokens": 20296447.0, "reward": 1.121889591217041, "reward_std": 0.20156601071357727, "rewards/rollout_reward_func/mean": 1.121889591217041, "rewards/rollout_reward_func/std": 0.37421032786369324, "sampling/importance_sampling_ratio/max": 0.5437099933624268, "sampling/importance_sampling_ratio/mean": 0.2723310589790344, "sampling/importance_sampling_ratio/min": 6.054633461261005e-10, "sampling/sampling_logp_difference/max": 11.400404930114746, "sampling/sampling_logp_difference/mean": 0.5774911642074585, "step": 867, "step_time": 11.369064304002677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.479478031396866, "epoch": 0.00868, "grad_norm": 0.03929932042956352, "kl": 0.7042494937777519, "learning_rate": 9.9996796435212e-06, "loss": 0.007, "step": 868, "step_time": 6.563893968006596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1507.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 839.875, "completions/mean_terminated_length": 847.774169921875, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 4.106594026088715, "epoch": 0.00869, "frac_reward_zero_std": 0.0, "grad_norm": 0.049719855189323425, "kl": 0.8760334774851799, "learning_rate": 9.999678872981717e-06, "loss": -0.012, "num_tokens": 20365465.0, "reward": 0.9680005311965942, "reward_std": 0.6388726234436035, "rewards/rollout_reward_func/mean": 0.9680005311965942, "rewards/rollout_reward_func/std": 0.7066001296043396, "sampling/importance_sampling_ratio/max": 0.30369946360588074, "sampling/importance_sampling_ratio/mean": 0.15255075693130493, "sampling/importance_sampling_ratio/min": 3.44121322623926e-16, "sampling/sampling_logp_difference/max": 7.201017379760742, "sampling/sampling_logp_difference/mean": 0.7769647836685181, "step": 869, "step_time": 11.677927118005755 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.073058515787125, "epoch": 0.0087, "grad_norm": 0.03010251373052597, "kl": 0.8841226398944855, "learning_rate": 9.999678101516712e-06, "loss": -0.0121, "step": 870, "step_time": 6.800596346009115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 735.21875, "completions/mean_terminated_length": 737.7096557617188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7830812633037567, "epoch": 0.00871, "frac_reward_zero_std": 0.5, "grad_norm": 0.017720939591526985, "kl": 0.7121473625302315, "learning_rate": 9.999677329126187e-06, "loss": -0.0005, "num_tokens": 20430081.0, "reward": 0.9402081370353699, "reward_std": 0.28861355781555176, "rewards/rollout_reward_func/mean": 0.9402081370353699, "rewards/rollout_reward_func/std": 0.6668363213539124, "sampling/importance_sampling_ratio/max": 0.5409446358680725, "sampling/importance_sampling_ratio/mean": 0.24546122550964355, "sampling/importance_sampling_ratio/min": 4.5518427915780535e-10, "sampling/sampling_logp_difference/max": 3.6405527591705322, "sampling/sampling_logp_difference/mean": 0.5416300892829895, "step": 871, "step_time": 11.982784879000974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7566580176353455, "epoch": 0.00872, "grad_norm": 0.016664255410432816, "kl": 0.7156333476305008, "learning_rate": 9.999676555810143e-06, "loss": -0.0005, "step": 872, "step_time": 6.466531358018983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 758.125, "completions/mean_terminated_length": 758.125, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "entropy": 3.648083508014679, "epoch": 0.00873, "frac_reward_zero_std": 0.0, "grad_norm": 0.03032510168850422, "kl": 0.7823175936937332, "learning_rate": 9.999675781568578e-06, "loss": 0.0048, "num_tokens": 20496191.0, "reward": 0.8802897930145264, "reward_std": 0.5122235417366028, "rewards/rollout_reward_func/mean": 0.8802897930145264, "rewards/rollout_reward_func/std": 0.8171675205230713, "sampling/importance_sampling_ratio/max": 0.30128517746925354, "sampling/importance_sampling_ratio/mean": 0.1830490082502365, "sampling/importance_sampling_ratio/min": 0.011638143099844456, "sampling/sampling_logp_difference/max": 2.2276597023010254, "sampling/sampling_logp_difference/mean": 0.46818608045578003, "step": 873, "step_time": 11.271756519985502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6268293261528015, "epoch": 0.00874, "grad_norm": 0.030737295746803284, "kl": 0.7887743264436722, "learning_rate": 9.999675006401496e-06, "loss": 0.0047, "step": 874, "step_time": 6.581513098994037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 930.8125, "completions/mean_terminated_length": 944.3547973632812, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "entropy": 4.353439897298813, "epoch": 0.00875, "frac_reward_zero_std": 0.0, "grad_norm": 0.11212024092674255, "kl": 0.9098343327641487, "learning_rate": 9.999674230308893e-06, "loss": -0.0025, "num_tokens": 20567771.0, "reward": 0.32560229301452637, "reward_std": 0.7475287914276123, "rewards/rollout_reward_func/mean": 0.32560229301452637, "rewards/rollout_reward_func/std": 1.015779972076416, "sampling/importance_sampling_ratio/max": 0.3030262291431427, "sampling/importance_sampling_ratio/mean": 0.11033259332180023, "sampling/importance_sampling_ratio/min": 3.87300785809281e-21, "sampling/sampling_logp_difference/max": 4.180938720703125, "sampling/sampling_logp_difference/mean": 0.7704100012779236, "step": 875, "step_time": 12.037293515000783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.336438685655594, "epoch": 0.00876, "grad_norm": 0.09994693845510483, "kl": 0.8785628527402878, "learning_rate": 9.999673453290772e-06, "loss": -0.0028, "step": 876, "step_time": 6.9712343709979905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 377.5, "completions/mean_terminated_length": 389.1612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.690333366394043, "epoch": 0.00877, "frac_reward_zero_std": 0.25, "grad_norm": 0.04075988754630089, "kl": 0.8686072379350662, "learning_rate": 9.999672675347131e-06, "loss": -0.0119, "num_tokens": 20616776.0, "reward": 0.9497777223587036, "reward_std": 0.28744739294052124, "rewards/rollout_reward_func/mean": 0.9497777223587036, "rewards/rollout_reward_func/std": 0.597453236579895, "sampling/importance_sampling_ratio/max": 0.543874204158783, "sampling/importance_sampling_ratio/mean": 0.32850974798202515, "sampling/importance_sampling_ratio/min": 2.1003421983405157e-14, "sampling/sampling_logp_difference/max": 3.645066499710083, "sampling/sampling_logp_difference/mean": 0.7280857563018799, "step": 877, "step_time": 11.08846096900379 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.672116845846176, "epoch": 0.00878, "grad_norm": 0.021163204684853554, "kl": 0.8660959899425507, "learning_rate": 9.999671896477973e-06, "loss": -0.0121, "step": 878, "step_time": 6.142022650987201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 586.1875, "completions/mean_terminated_length": 604.5806274414062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.993423968553543, "epoch": 0.00879, "frac_reward_zero_std": 0.0, "grad_norm": 0.014999774284660816, "kl": 0.618561327457428, "learning_rate": 9.999671116683296e-06, "loss": -0.0146, "num_tokens": 20673686.0, "reward": 0.8980450630187988, "reward_std": 0.4201517701148987, "rewards/rollout_reward_func/mean": 0.8980450630187988, "rewards/rollout_reward_func/std": 0.5797138810157776, "sampling/importance_sampling_ratio/max": 0.5443617105484009, "sampling/importance_sampling_ratio/mean": 0.2762049436569214, "sampling/importance_sampling_ratio/min": 3.7174345379753504e-08, "sampling/sampling_logp_difference/max": 4.581626892089844, "sampling/sampling_logp_difference/mean": 0.6177619099617004, "step": 879, "step_time": 11.432849855002132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.977937400341034, "epoch": 0.0088, "grad_norm": 0.01438906230032444, "kl": 0.6187920495867729, "learning_rate": 9.9996703359631e-06, "loss": -0.0146, "step": 880, "step_time": 6.814849676004087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 695.5625, "completions/mean_terminated_length": 717.4838256835938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8553946018218994, "epoch": 0.00881, "frac_reward_zero_std": 0.0, "grad_norm": 0.021366719156503677, "kl": 0.7280021570622921, "learning_rate": 9.999669554317389e-06, "loss": -0.0138, "num_tokens": 20735212.0, "reward": 0.823502779006958, "reward_std": 0.6849095821380615, "rewards/rollout_reward_func/mean": 0.823502779006958, "rewards/rollout_reward_func/std": 0.7894726991653442, "sampling/importance_sampling_ratio/max": 0.5407387018203735, "sampling/importance_sampling_ratio/mean": 0.2236168384552002, "sampling/importance_sampling_ratio/min": 2.164879608912429e-09, "sampling/sampling_logp_difference/max": 3.3874030113220215, "sampling/sampling_logp_difference/mean": 0.5827034711837769, "step": 881, "step_time": 12.24224786200648 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 3.8374691009521484, "epoch": 0.00882, "grad_norm": 0.020626839250326157, "kl": 0.7149551510810852, "learning_rate": 9.999668771746158e-06, "loss": -0.0139, "step": 882, "step_time": 6.3610765599951264 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 460.5, "completions/mean_terminated_length": 460.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.278507262468338, "epoch": 0.00883, "frac_reward_zero_std": 0.0, "grad_norm": 0.07476596534252167, "kl": 0.7426802441477776, "learning_rate": 9.99966798824941e-06, "loss": -0.0016, "num_tokens": 20790527.0, "reward": 1.102715253829956, "reward_std": 0.20970427989959717, "rewards/rollout_reward_func/mean": 1.102715253829956, "rewards/rollout_reward_func/std": 0.3700082302093506, "sampling/importance_sampling_ratio/max": 0.5448822975158691, "sampling/importance_sampling_ratio/mean": 0.3077276945114136, "sampling/importance_sampling_ratio/min": 2.8694117645500228e-05, "sampling/sampling_logp_difference/max": 3.2905797958374023, "sampling/sampling_logp_difference/mean": 0.44193777441978455, "step": 883, "step_time": 10.82008153400966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.2689197957515717, "epoch": 0.00884, "grad_norm": 0.04659262299537659, "kl": 0.7423987649381161, "learning_rate": 9.999667203827144e-06, "loss": -0.0018, "step": 884, "step_time": 6.020292913999583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 657.78125, "completions/mean_terminated_length": 678.4838256835938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9539120495319366, "epoch": 0.00885, "frac_reward_zero_std": 0.0, "grad_norm": 0.03719451278448105, "kl": 0.7078397199511528, "learning_rate": 9.999666418479359e-06, "loss": -0.0102, "num_tokens": 20852438.0, "reward": 0.9871550798416138, "reward_std": 0.4739251136779785, "rewards/rollout_reward_func/mean": 0.9871550798416138, "rewards/rollout_reward_func/std": 0.6499216556549072, "sampling/importance_sampling_ratio/max": 0.5411307215690613, "sampling/importance_sampling_ratio/mean": 0.23551008105278015, "sampling/importance_sampling_ratio/min": 2.7453451668837513e-15, "sampling/sampling_logp_difference/max": 4.303837776184082, "sampling/sampling_logp_difference/mean": 0.8767943382263184, "step": 885, "step_time": 12.158522079997056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.944524437189102, "epoch": 0.00886, "grad_norm": 0.034126151353120804, "kl": 0.7082798853516579, "learning_rate": 9.999665632206059e-06, "loss": -0.0103, "step": 886, "step_time": 6.385602598980768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 439.3125, "completions/mean_terminated_length": 433.83868408203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.118860900402069, "epoch": 0.00887, "frac_reward_zero_std": 0.5, "grad_norm": 0.0683286115527153, "kl": 0.750885434448719, "learning_rate": 9.999664845007243e-06, "loss": 0.0012, "num_tokens": 20905614.0, "reward": 1.046398401260376, "reward_std": 0.1756153702735901, "rewards/rollout_reward_func/mean": 1.046398401260376, "rewards/rollout_reward_func/std": 0.39970874786376953, "sampling/importance_sampling_ratio/max": 0.6697027683258057, "sampling/importance_sampling_ratio/mean": 0.35009437799453735, "sampling/importance_sampling_ratio/min": 9.708689581444663e-12, "sampling/sampling_logp_difference/max": 4.140015602111816, "sampling/sampling_logp_difference/mean": 0.4742482006549835, "step": 887, "step_time": 9.954573459996027 }, { "clip_ratio/high_max": 0.02130681835114956, "clip_ratio/high_mean": 0.01065340917557478, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01065340917557478, "entropy": 3.1096373796463013, "epoch": 0.00888, "grad_norm": 0.016349833458662033, "kl": 0.7522421106696129, "learning_rate": 9.99966405688291e-06, "loss": 0.0013, "step": 888, "step_time": 5.166654411012132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1710.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 895.21875, "completions/mean_terminated_length": 895.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.26615834236145, "epoch": 0.00889, "frac_reward_zero_std": 0.25, "grad_norm": 0.03590855747461319, "kl": 0.6942851543426514, "learning_rate": 9.99966326783306e-06, "loss": -0.0087, "num_tokens": 20975007.0, "reward": 0.5170195698738098, "reward_std": 0.5477789640426636, "rewards/rollout_reward_func/mean": 0.5170195698738098, "rewards/rollout_reward_func/std": 0.7606124877929688, "sampling/importance_sampling_ratio/max": 0.5505340695381165, "sampling/importance_sampling_ratio/mean": 0.1952972710132599, "sampling/importance_sampling_ratio/min": 0.0022392889950424433, "sampling/sampling_logp_difference/max": 2.5504050254821777, "sampling/sampling_logp_difference/mean": 0.574578583240509, "step": 889, "step_time": 12.512432130999514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.2744570672512054, "epoch": 0.0089, "grad_norm": 0.037163522094488144, "kl": 0.693439856171608, "learning_rate": 9.999662477857692e-06, "loss": -0.0087, "step": 890, "step_time": 6.936352023993095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 744.3125, "completions/mean_terminated_length": 744.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.674791634082794, "epoch": 0.00891, "frac_reward_zero_std": 0.0, "grad_norm": 0.10759776830673218, "kl": 0.629101574420929, "learning_rate": 9.99966168695681e-06, "loss": 0.0049, "num_tokens": 21039049.0, "reward": 0.6962877511978149, "reward_std": 0.12728923559188843, "rewards/rollout_reward_func/mean": 0.6962877511978149, "rewards/rollout_reward_func/std": 0.8927145600318909, "sampling/importance_sampling_ratio/max": 0.5512545704841614, "sampling/importance_sampling_ratio/mean": 0.23196280002593994, "sampling/importance_sampling_ratio/min": 0.0002290441916557029, "sampling/sampling_logp_difference/max": 4.902531623840332, "sampling/sampling_logp_difference/mean": 0.4872594177722931, "step": 891, "step_time": 12.224944866000442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.021875000093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000093132257, "entropy": 3.6859176456928253, "epoch": 0.00892, "grad_norm": 0.04947493225336075, "kl": 0.6201998591423035, "learning_rate": 9.999660895130413e-06, "loss": 0.0046, "step": 892, "step_time": 6.457706542001688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1236.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 177.09375, "completions/mean_terminated_length": 177.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1625945568084717, "epoch": 0.00893, "frac_reward_zero_std": 0.75, "grad_norm": 0.054048098623752594, "kl": 0.8160910233855247, "learning_rate": 9.9996601023785e-06, "loss": -0.0033, "num_tokens": 21083203.0, "reward": 0.827621579170227, "reward_std": 0.301574170589447, "rewards/rollout_reward_func/mean": 0.827621579170227, "rewards/rollout_reward_func/std": 0.7327286601066589, "sampling/importance_sampling_ratio/max": 0.5503501296043396, "sampling/importance_sampling_ratio/mean": 0.43282827734947205, "sampling/importance_sampling_ratio/min": 0.0008196581620723009, "sampling/sampling_logp_difference/max": 3.671034336090088, "sampling/sampling_logp_difference/mean": 0.4218992590904236, "step": 893, "step_time": 10.280502926005283 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.1693173348903656, "epoch": 0.00894, "grad_norm": 0.02873270772397518, "kl": 0.8127509579062462, "learning_rate": 9.999659308701071e-06, "loss": -0.0033, "step": 894, "step_time": 5.503357119006978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 674.28125, "completions/mean_terminated_length": 674.28125, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "entropy": 3.184246212244034, "epoch": 0.00895, "frac_reward_zero_std": 0.25, "grad_norm": 0.02255510911345482, "kl": 0.8199687823653221, "learning_rate": 9.999658514098125e-06, "loss": -0.011, "num_tokens": 21145102.0, "reward": 0.6102895140647888, "reward_std": 0.09459929913282394, "rewards/rollout_reward_func/mean": 0.6102895140647888, "rewards/rollout_reward_func/std": 0.9911017417907715, "sampling/importance_sampling_ratio/max": 0.3052484691143036, "sampling/importance_sampling_ratio/mean": 0.2346709817647934, "sampling/importance_sampling_ratio/min": 8.38630148791708e-05, "sampling/sampling_logp_difference/max": 4.5655012130737305, "sampling/sampling_logp_difference/mean": 0.4266622066497803, "step": 895, "step_time": 11.84969157899468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.1788520514965057, "epoch": 0.00896, "grad_norm": 0.023671666160225868, "kl": 0.8300644904375076, "learning_rate": 9.999657718569665e-06, "loss": -0.011, "step": 896, "step_time": 6.375892442993063 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 523.0625, "completions/mean_terminated_length": 523.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4721965193748474, "epoch": 0.00897, "frac_reward_zero_std": 0.5, "grad_norm": 0.056016627699136734, "kl": 0.6752464473247528, "learning_rate": 9.99965692211569e-06, "loss": 0.0071, "num_tokens": 21199055.0, "reward": 0.925065815448761, "reward_std": 0.2451423704624176, "rewards/rollout_reward_func/mean": 0.925065815448761, "rewards/rollout_reward_func/std": 0.6018522381782532, "sampling/importance_sampling_ratio/max": 0.5511050820350647, "sampling/importance_sampling_ratio/mean": 0.3221270740032196, "sampling/importance_sampling_ratio/min": 0.0030940731521695852, "sampling/sampling_logp_difference/max": 1.88163423538208, "sampling/sampling_logp_difference/mean": 0.4289086163043976, "step": 897, "step_time": 11.531617932007066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4581895768642426, "epoch": 0.00898, "grad_norm": 0.05185190588235855, "kl": 0.6726248040795326, "learning_rate": 9.999656124736203e-06, "loss": 0.0071, "step": 898, "step_time": 5.8858003900022595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 386.15625, "completions/mean_terminated_length": 386.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.5023577511310577, "epoch": 0.00899, "frac_reward_zero_std": 0.5, "grad_norm": 0.03770056739449501, "kl": 0.7330816686153412, "learning_rate": 9.9996553264312e-06, "loss": -0.0092, "num_tokens": 21248600.0, "reward": 0.6381405591964722, "reward_std": 0.49544042348861694, "rewards/rollout_reward_func/mean": 0.6381405591964722, "rewards/rollout_reward_func/std": 0.8835453391075134, "sampling/importance_sampling_ratio/max": 0.5494030714035034, "sampling/importance_sampling_ratio/mean": 0.3465766906738281, "sampling/importance_sampling_ratio/min": 2.1614030365675718e-19, "sampling/sampling_logp_difference/max": 4.904896259307861, "sampling/sampling_logp_difference/mean": 0.6080437898635864, "step": 899, "step_time": 11.000647312001092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.500220626592636, "epoch": 0.009, "grad_norm": 0.03950389847159386, "kl": 0.733522042632103, "learning_rate": 9.999654527200682e-06, "loss": -0.0093, "step": 900, "step_time": 6.036460075993091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 384.03125, "completions/mean_terminated_length": 395.9032287597656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.5989753007888794, "epoch": 0.00901, "frac_reward_zero_std": 0.25, "grad_norm": 0.07479818165302277, "kl": 0.766119658946991, "learning_rate": 9.999653727044649e-06, "loss": -0.0113, "num_tokens": 21299313.0, "reward": 0.7439435720443726, "reward_std": 0.4819370210170746, "rewards/rollout_reward_func/mean": 0.7439435720443726, "rewards/rollout_reward_func/std": 0.8411625027656555, "sampling/importance_sampling_ratio/max": 0.5554190278053284, "sampling/importance_sampling_ratio/mean": 0.33838409185409546, "sampling/importance_sampling_ratio/min": 1.7517188871316036e-13, "sampling/sampling_logp_difference/max": 3.9915244579315186, "sampling/sampling_logp_difference/mean": 0.6942647695541382, "step": 901, "step_time": 11.506013917998644 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.6096377968788147, "epoch": 0.00902, "grad_norm": 0.05310950428247452, "kl": 0.7608201876282692, "learning_rate": 9.999652925963103e-06, "loss": -0.0113, "step": 902, "step_time": 6.803168774000369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 293.71875, "completions/mean_terminated_length": 293.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0504801273345947, "epoch": 0.00903, "frac_reward_zero_std": 0.5, "grad_norm": 0.065932497382164, "kl": 0.6643981635570526, "learning_rate": 9.999652123956045e-06, "loss": -0.0095, "num_tokens": 21349811.0, "reward": 0.4007798433303833, "reward_std": 0.435733437538147, "rewards/rollout_reward_func/mean": 0.4007798433303833, "rewards/rollout_reward_func/std": 1.051051378250122, "sampling/importance_sampling_ratio/max": 0.5520442128181458, "sampling/importance_sampling_ratio/mean": 0.39903873205184937, "sampling/importance_sampling_ratio/min": 0.04440218210220337, "sampling/sampling_logp_difference/max": 1.2132800817489624, "sampling/sampling_logp_difference/mean": 0.3677765130996704, "step": 903, "step_time": 8.113109029014595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.055822491645813, "epoch": 0.00904, "grad_norm": 0.06846341490745544, "kl": 0.6619070321321487, "learning_rate": 9.99965132102347e-06, "loss": -0.0098, "step": 904, "step_time": 4.501898985021398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 430.03125, "completions/mean_terminated_length": 457.63336181640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8138362169265747, "epoch": 0.00905, "frac_reward_zero_std": 0.0, "grad_norm": 0.029383603483438492, "kl": 0.6792964339256287, "learning_rate": 9.999650517165385e-06, "loss": -0.0175, "num_tokens": 21400363.0, "reward": 0.8654701709747314, "reward_std": 0.4044342637062073, "rewards/rollout_reward_func/mean": 0.8654701709747314, "rewards/rollout_reward_func/std": 0.7058354020118713, "sampling/importance_sampling_ratio/max": 0.5487860441207886, "sampling/importance_sampling_ratio/mean": 0.30315637588500977, "sampling/importance_sampling_ratio/min": 6.639103156125259e-11, "sampling/sampling_logp_difference/max": 3.54526686668396, "sampling/sampling_logp_difference/mean": 0.6440671682357788, "step": 905, "step_time": 10.740544025997224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.835054337978363, "epoch": 0.00906, "grad_norm": 0.02503383159637451, "kl": 0.6765760108828545, "learning_rate": 9.999649712381786e-06, "loss": -0.0176, "step": 906, "step_time": 5.812692572995729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 351.5625, "completions/mean_terminated_length": 351.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8847760558128357, "epoch": 0.00907, "frac_reward_zero_std": 0.5, "grad_norm": 0.0686517059803009, "kl": 0.7447211593389511, "learning_rate": 9.999648906672674e-06, "loss": 0.0081, "num_tokens": 21450629.0, "reward": 0.6435836553573608, "reward_std": 0.20128755271434784, "rewards/rollout_reward_func/mean": 0.6435836553573608, "rewards/rollout_reward_func/std": 0.8650592565536499, "sampling/importance_sampling_ratio/max": 0.5506734251976013, "sampling/importance_sampling_ratio/mean": 0.39858466386795044, "sampling/importance_sampling_ratio/min": 0.05291745439171791, "sampling/sampling_logp_difference/max": 0.9921818971633911, "sampling/sampling_logp_difference/mean": 0.3314378559589386, "step": 907, "step_time": 10.272489137001685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8944114446640015, "epoch": 0.00908, "grad_norm": 0.06791378557682037, "kl": 0.7436458915472031, "learning_rate": 9.999648100038048e-06, "loss": 0.0081, "step": 908, "step_time": 5.760766894993139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 1000.65625, "completions/mean_terminated_length": 1000.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.039742588996887, "epoch": 0.00909, "frac_reward_zero_std": 0.25, "grad_norm": 0.05023237690329552, "kl": 0.7418134585022926, "learning_rate": 9.999647292477912e-06, "loss": -0.0067, "num_tokens": 21525004.0, "reward": 0.9167497158050537, "reward_std": 0.4250347316265106, "rewards/rollout_reward_func/mean": 0.9167497158050537, "rewards/rollout_reward_func/std": 0.7063905000686646, "sampling/importance_sampling_ratio/max": 0.29515424370765686, "sampling/importance_sampling_ratio/mean": 0.10865576565265656, "sampling/importance_sampling_ratio/min": 4.973836342792026e-10, "sampling/sampling_logp_difference/max": 12.750555038452148, "sampling/sampling_logp_difference/mean": 0.729651689529419, "step": 909, "step_time": 11.910645391995786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.050915777683258, "epoch": 0.0091, "grad_norm": 0.05162777379155159, "kl": 0.728393018245697, "learning_rate": 9.999646483992262e-06, "loss": -0.0068, "step": 910, "step_time": 6.918367414000386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 780.21875, "completions/mean_terminated_length": 780.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.161028355360031, "epoch": 0.00911, "frac_reward_zero_std": 0.25, "grad_norm": 0.031532205641269684, "kl": 0.6169950067996979, "learning_rate": 9.9996456745811e-06, "loss": -0.0077, "num_tokens": 21590940.0, "reward": 0.7608277797698975, "reward_std": 0.5568506121635437, "rewards/rollout_reward_func/mean": 0.7608277797698975, "rewards/rollout_reward_func/std": 0.8393475413322449, "sampling/importance_sampling_ratio/max": 0.553968071937561, "sampling/importance_sampling_ratio/mean": 0.2169400006532669, "sampling/importance_sampling_ratio/min": 4.341632608984014e-11, "sampling/sampling_logp_difference/max": 3.5177974700927734, "sampling/sampling_logp_difference/mean": 0.6530974507331848, "step": 911, "step_time": 11.389666535003926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.14420023560524, "epoch": 0.00912, "grad_norm": 0.0315760038793087, "kl": 0.6206242553889751, "learning_rate": 9.999644864244428e-06, "loss": -0.0078, "step": 912, "step_time": 6.242659273993922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 785.46875, "completions/mean_terminated_length": 785.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.0299268662929535, "epoch": 0.00913, "frac_reward_zero_std": 0.25, "grad_norm": 0.024061469361186028, "kl": 0.6584697887301445, "learning_rate": 9.999644052982243e-06, "loss": -0.0022, "num_tokens": 21656476.0, "reward": 0.9851117730140686, "reward_std": 0.22818921506404877, "rewards/rollout_reward_func/mean": 0.9851117730140686, "rewards/rollout_reward_func/std": 0.3995536267757416, "sampling/importance_sampling_ratio/max": 0.5509371757507324, "sampling/importance_sampling_ratio/mean": 0.21988454461097717, "sampling/importance_sampling_ratio/min": 0.001032628701068461, "sampling/sampling_logp_difference/max": 3.171445608139038, "sampling/sampling_logp_difference/mean": 0.5432288646697998, "step": 913, "step_time": 12.619688030994439 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 4.032615691423416, "epoch": 0.00914, "grad_norm": 0.02406361512839794, "kl": 0.6647718623280525, "learning_rate": 9.999643240794546e-06, "loss": -0.0022, "step": 914, "step_time": 6.707501148994197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 160.15625, "completions/mean_terminated_length": 160.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.102870762348175, "epoch": 0.00915, "frac_reward_zero_std": 0.75, "grad_norm": 0.013489228673279285, "kl": 0.8779696971178055, "learning_rate": 9.999642427681338e-06, "loss": -0.0056, "num_tokens": 21698858.0, "reward": 1.099609375, "reward_std": 0.01190187968313694, "rewards/rollout_reward_func/mean": 1.099609375, "rewards/rollout_reward_func/std": 0.05306467041373253, "sampling/importance_sampling_ratio/max": 0.5524823665618896, "sampling/importance_sampling_ratio/mean": 0.45791861414909363, "sampling/importance_sampling_ratio/min": 2.606714011310629e-10, "sampling/sampling_logp_difference/max": 4.091551303863525, "sampling/sampling_logp_difference/mean": 0.5056567192077637, "step": 915, "step_time": 8.132214638986625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.10023957490921, "epoch": 0.00916, "grad_norm": 0.014793249778449535, "kl": 0.8780802190303802, "learning_rate": 9.99964161364262e-06, "loss": -0.0056, "step": 916, "step_time": 4.617282604012871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 530.625, "completions/mean_terminated_length": 530.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.831135928630829, "epoch": 0.00917, "frac_reward_zero_std": 0.0, "grad_norm": 0.2743200659751892, "kl": 0.6409710627049208, "learning_rate": 9.999640798678389e-06, "loss": -0.0121, "num_tokens": 21756908.0, "reward": 0.541103720664978, "reward_std": 0.8519519567489624, "rewards/rollout_reward_func/mean": 0.541103720664978, "rewards/rollout_reward_func/std": 1.0219253301620483, "sampling/importance_sampling_ratio/max": 0.553547739982605, "sampling/importance_sampling_ratio/mean": 0.23397547006607056, "sampling/importance_sampling_ratio/min": 0.004447081126272678, "sampling/sampling_logp_difference/max": 2.9771780967712402, "sampling/sampling_logp_difference/mean": 0.5048922300338745, "step": 917, "step_time": 11.024944225995569 }, { "clip_ratio/high_max": 0.05208333395421505, "clip_ratio/high_mean": 0.026041666977107525, "clip_ratio/low_mean": 0.07291666697710752, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.09895833395421505, "entropy": 3.7984431087970734, "epoch": 0.00918, "grad_norm": 0.08259648084640503, "kl": 0.6997157391160727, "learning_rate": 9.999639982788647e-06, "loss": -0.0136, "step": 918, "step_time": 6.1638437419969705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 644.5, "completions/mean_terminated_length": 644.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4990070462226868, "epoch": 0.00919, "frac_reward_zero_std": 0.25, "grad_norm": 0.061700593680143356, "kl": 0.6921518258750439, "learning_rate": 9.999639165973397e-06, "loss": -0.0018, "num_tokens": 21818530.0, "reward": 0.9290189146995544, "reward_std": 0.47635239362716675, "rewards/rollout_reward_func/mean": 0.9290189146995544, "rewards/rollout_reward_func/std": 0.7132911086082458, "sampling/importance_sampling_ratio/max": 0.5531511306762695, "sampling/importance_sampling_ratio/mean": 0.25262463092803955, "sampling/importance_sampling_ratio/min": 0.0018729554722085595, "sampling/sampling_logp_difference/max": 4.535905361175537, "sampling/sampling_logp_difference/mean": 0.47866734862327576, "step": 919, "step_time": 11.766821373996208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4730799794197083, "epoch": 0.0092, "grad_norm": 0.05884584039449692, "kl": 0.704191543161869, "learning_rate": 9.999638348232636e-06, "loss": -0.0021, "step": 920, "step_time": 6.387536838003143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 749.65625, "completions/mean_terminated_length": 749.65625, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 3.9561703205108643, "epoch": 0.00921, "frac_reward_zero_std": 0.0, "grad_norm": 0.0768553614616394, "kl": 0.6226825304329395, "learning_rate": 9.999637529566364e-06, "loss": -0.0116, "num_tokens": 21883811.0, "reward": 0.8620809316635132, "reward_std": 0.2528398633003235, "rewards/rollout_reward_func/mean": 0.8620809316635132, "rewards/rollout_reward_func/std": 0.6989247798919678, "sampling/importance_sampling_ratio/max": 0.30328860878944397, "sampling/importance_sampling_ratio/mean": 0.1666671484708786, "sampling/importance_sampling_ratio/min": 1.201088395408334e-10, "sampling/sampling_logp_difference/max": 12.737488746643066, "sampling/sampling_logp_difference/mean": 0.6315834522247314, "step": 921, "step_time": 11.285818221003865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.9471870958805084, "epoch": 0.00922, "grad_norm": 0.06330299377441406, "kl": 0.6257747113704681, "learning_rate": 9.999636709974583e-06, "loss": -0.0121, "step": 922, "step_time": 6.634830774004513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 484.75, "completions/mean_terminated_length": 499.8709411621094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4534088373184204, "epoch": 0.00923, "frac_reward_zero_std": 0.0, "grad_norm": 0.04170434921979904, "kl": 0.7004557140171528, "learning_rate": 9.999635889457293e-06, "loss": -0.0165, "num_tokens": 21938592.0, "reward": 0.9253937005996704, "reward_std": 0.4699001908302307, "rewards/rollout_reward_func/mean": 0.9253937005996704, "rewards/rollout_reward_func/std": 0.6499722003936768, "sampling/importance_sampling_ratio/max": 0.5493537187576294, "sampling/importance_sampling_ratio/mean": 0.3104296922683716, "sampling/importance_sampling_ratio/min": 1.5021985044771924e-10, "sampling/sampling_logp_difference/max": 4.898017883300781, "sampling/sampling_logp_difference/mean": 0.6379365921020508, "step": 923, "step_time": 10.696288952014584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.4499411582946777, "epoch": 0.00924, "grad_norm": 0.024673206731677055, "kl": 0.702115498483181, "learning_rate": 9.999635068014492e-06, "loss": -0.0168, "step": 924, "step_time": 6.093932389994734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 569.53125, "completions/mean_terminated_length": 545.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.001573473215103, "epoch": 0.00925, "frac_reward_zero_std": 0.25, "grad_norm": 0.02129686065018177, "kl": 0.6627734899520874, "learning_rate": 9.999634245646181e-06, "loss": -0.009, "num_tokens": 21996631.0, "reward": 0.8512288331985474, "reward_std": 0.49696648120880127, "rewards/rollout_reward_func/mean": 0.8512288331985474, "rewards/rollout_reward_func/std": 0.7758034467697144, "sampling/importance_sampling_ratio/max": 0.5503402352333069, "sampling/importance_sampling_ratio/mean": 0.24460919201374054, "sampling/importance_sampling_ratio/min": 3.3493734880380875e-25, "sampling/sampling_logp_difference/max": 13.277046203613281, "sampling/sampling_logp_difference/mean": 0.9603396058082581, "step": 925, "step_time": 11.122945075992902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.0104039311409, "epoch": 0.00926, "grad_norm": 0.02097608707845211, "kl": 0.6598291769623756, "learning_rate": 9.999633422352361e-06, "loss": -0.0089, "step": 926, "step_time": 5.9790761999975075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 789.46875, "completions/mean_terminated_length": 789.46875, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 3.5557910799980164, "epoch": 0.00927, "frac_reward_zero_std": 0.0, "grad_norm": 0.0981392189860344, "kl": 0.8584182262420654, "learning_rate": 9.999632598133035e-06, "loss": -0.013, "num_tokens": 22063994.0, "reward": 0.5755701065063477, "reward_std": 0.7516224980354309, "rewards/rollout_reward_func/mean": 0.5755701065063477, "rewards/rollout_reward_func/std": 0.9514103531837463, "sampling/importance_sampling_ratio/max": 0.3034069538116455, "sampling/importance_sampling_ratio/mean": 0.1849723756313324, "sampling/importance_sampling_ratio/min": 0.00488754827529192, "sampling/sampling_logp_difference/max": 2.7242469787597656, "sampling/sampling_logp_difference/mean": 0.453316867351532, "step": 927, "step_time": 11.239784484008851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.570962578058243, "epoch": 0.00928, "grad_norm": 0.07345537096261978, "kl": 0.8509673401713371, "learning_rate": 9.999631772988198e-06, "loss": -0.0131, "step": 928, "step_time": 6.644241770001827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 922.53125, "completions/mean_terminated_length": 929.9354858398438, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 4.8141350746154785, "epoch": 0.00929, "frac_reward_zero_std": 0.25, "grad_norm": 0.02805786021053791, "kl": 0.7623603790998459, "learning_rate": 9.999630946917853e-06, "loss": -0.0026, "num_tokens": 22134307.0, "reward": -0.09136903285980225, "reward_std": 0.6822824478149414, "rewards/rollout_reward_func/mean": -0.09136903285980225, "rewards/rollout_reward_func/std": 1.0160292387008667, "sampling/importance_sampling_ratio/max": 0.2958447337150574, "sampling/importance_sampling_ratio/mean": 0.09493688493967056, "sampling/importance_sampling_ratio/min": 2.9968772204359975e-18, "sampling/sampling_logp_difference/max": 14.0010347366333, "sampling/sampling_logp_difference/mean": 1.03011155128479, "step": 929, "step_time": 12.085112330991251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.848455429077148, "epoch": 0.0093, "grad_norm": 0.026605676859617233, "kl": 0.7542569562792778, "learning_rate": 9.999630119922e-06, "loss": -0.0026, "step": 930, "step_time": 6.5301454970031045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 508.5625, "completions/mean_terminated_length": 508.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.357640862464905, "epoch": 0.00931, "frac_reward_zero_std": 0.25, "grad_norm": 0.1538199484348297, "kl": 0.9158035591244698, "learning_rate": 9.99962929200064e-06, "loss": 0.003, "num_tokens": 22189809.0, "reward": 1.0545631647109985, "reward_std": 0.26094603538513184, "rewards/rollout_reward_func/mean": 1.0545631647109985, "rewards/rollout_reward_func/std": 0.5221700072288513, "sampling/importance_sampling_ratio/max": 0.5497202277183533, "sampling/importance_sampling_ratio/mean": 0.28360968828201294, "sampling/importance_sampling_ratio/min": 0.00025747905601747334, "sampling/sampling_logp_difference/max": 2.619616985321045, "sampling/sampling_logp_difference/mean": 0.46871164441108704, "step": 931, "step_time": 11.161380056997587 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.3771823048591614, "epoch": 0.00932, "grad_norm": 0.0369156114757061, "kl": 0.8788139969110489, "learning_rate": 9.99962846315377e-06, "loss": 0.0026, "step": 932, "step_time": 6.106726098005311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 519.625, "completions/mean_terminated_length": 519.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.018600523471832, "epoch": 0.00933, "frac_reward_zero_std": 0.5, "grad_norm": 0.021091725677251816, "kl": 0.7709114998579025, "learning_rate": 9.999627633381394e-06, "loss": -0.0059, "num_tokens": 22248041.0, "reward": 0.4638264775276184, "reward_std": 0.30440378189086914, "rewards/rollout_reward_func/mean": 0.4638264775276184, "rewards/rollout_reward_func/std": 1.0668201446533203, "sampling/importance_sampling_ratio/max": 0.4401037096977234, "sampling/importance_sampling_ratio/mean": 0.20975413918495178, "sampling/importance_sampling_ratio/min": 8.962296796220142e-17, "sampling/sampling_logp_difference/max": 5.014065265655518, "sampling/sampling_logp_difference/mean": 0.7577472925186157, "step": 933, "step_time": 11.15085154499684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.051321864128113, "epoch": 0.00934, "grad_norm": 0.02252204716205597, "kl": 0.7621016018092632, "learning_rate": 9.99962680268351e-06, "loss": -0.0058, "step": 934, "step_time": 5.860375924006803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 580.625, "completions/mean_terminated_length": 580.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.108651280403137, "epoch": 0.00935, "frac_reward_zero_std": 0.25, "grad_norm": 0.021577704697847366, "kl": 0.8335205540060997, "learning_rate": 9.99962597106012e-06, "loss": -0.008, "num_tokens": 22307538.0, "reward": -0.8142471313476562, "reward_std": 0.28784069418907166, "rewards/rollout_reward_func/mean": -0.8142471313476562, "rewards/rollout_reward_func/std": 0.5400792360305786, "sampling/importance_sampling_ratio/max": 0.5549092888832092, "sampling/importance_sampling_ratio/mean": 0.22775676846504211, "sampling/importance_sampling_ratio/min": 9.827908797888085e-05, "sampling/sampling_logp_difference/max": 2.961111545562744, "sampling/sampling_logp_difference/mean": 0.5968791246414185, "step": 935, "step_time": 10.346678985995823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.105507403612137, "epoch": 0.00936, "grad_norm": 0.020139923319220543, "kl": 0.8242523297667503, "learning_rate": 9.99962513851122e-06, "loss": -0.008, "step": 936, "step_time": 5.915402061007626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 622.625, "completions/mean_terminated_length": 622.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.092209845781326, "epoch": 0.00937, "frac_reward_zero_std": 0.25, "grad_norm": 0.0789606049656868, "kl": 0.7087065167725086, "learning_rate": 9.999624305036816e-06, "loss": -0.0098, "num_tokens": 22368259.0, "reward": 0.44160234928131104, "reward_std": 0.3773210346698761, "rewards/rollout_reward_func/mean": 0.44160234928131104, "rewards/rollout_reward_func/std": 0.979927122592926, "sampling/importance_sampling_ratio/max": 0.5515396595001221, "sampling/importance_sampling_ratio/mean": 0.2293529063463211, "sampling/importance_sampling_ratio/min": 3.433887286519166e-06, "sampling/sampling_logp_difference/max": 4.029043197631836, "sampling/sampling_logp_difference/mean": 0.6223320960998535, "step": 937, "step_time": 11.20324019099644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012620192486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012620192486792803, "entropy": 4.059358358383179, "epoch": 0.00938, "grad_norm": 0.03489702567458153, "kl": 0.7105628624558449, "learning_rate": 9.999623470636904e-06, "loss": -0.01, "step": 938, "step_time": 5.913554566002858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 1086.75, "completions/mean_terminated_length": 1086.75, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "entropy": 4.571664363145828, "epoch": 0.00939, "frac_reward_zero_std": 0.0, "grad_norm": 0.03602788224816322, "kl": 0.7313780784606934, "learning_rate": 9.999622635311485e-06, "loss": 0.0119, "num_tokens": 22445113.0, "reward": 0.017442792654037476, "reward_std": 0.447529137134552, "rewards/rollout_reward_func/mean": 0.017442792654037476, "rewards/rollout_reward_func/std": 0.9691619277000427, "sampling/importance_sampling_ratio/max": 0.29556742310523987, "sampling/importance_sampling_ratio/mean": 0.0919528603553772, "sampling/importance_sampling_ratio/min": 4.104308987942437e-14, "sampling/sampling_logp_difference/max": 10.026341438293457, "sampling/sampling_logp_difference/mean": 0.7697186470031738, "step": 939, "step_time": 12.5742930790002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.546065956354141, "epoch": 0.0094, "grad_norm": 0.040059540420770645, "kl": 0.7308333069086075, "learning_rate": 9.999621799060561e-06, "loss": 0.0119, "step": 940, "step_time": 6.624936660002277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 730.78125, "completions/mean_terminated_length": 730.78125, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "entropy": 3.4978173971176147, "epoch": 0.00941, "frac_reward_zero_std": 0.25, "grad_norm": 0.05346358194947243, "kl": 0.7306614965200424, "learning_rate": 9.999620961884131e-06, "loss": -0.0063, "num_tokens": 22510070.0, "reward": 1.0084388256072998, "reward_std": 0.2663317024707794, "rewards/rollout_reward_func/mean": 1.0084388256072998, "rewards/rollout_reward_func/std": 0.5747501850128174, "sampling/importance_sampling_ratio/max": 0.32770368456840515, "sampling/importance_sampling_ratio/mean": 0.22067591547966003, "sampling/importance_sampling_ratio/min": 2.297359393718082e-15, "sampling/sampling_logp_difference/max": 3.883899211883545, "sampling/sampling_logp_difference/mean": 0.6376301050186157, "step": 941, "step_time": 11.805056532997696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.489283800125122, "epoch": 0.00942, "grad_norm": 0.04352061077952385, "kl": 0.7290462404489517, "learning_rate": 9.999620123782196e-06, "loss": -0.0063, "step": 942, "step_time": 6.578271716993186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 776.5625, "completions/mean_terminated_length": 776.5625, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 3.5735201239585876, "epoch": 0.00943, "frac_reward_zero_std": 0.0, "grad_norm": 0.09971579164266586, "kl": 0.7500312104821205, "learning_rate": 9.999619284754754e-06, "loss": -0.0036, "num_tokens": 22577250.0, "reward": 0.645817756652832, "reward_std": 0.5553750991821289, "rewards/rollout_reward_func/mean": 0.645817756652832, "rewards/rollout_reward_func/std": 0.9332349896430969, "sampling/importance_sampling_ratio/max": 0.37927573919296265, "sampling/importance_sampling_ratio/mean": 0.2175585925579071, "sampling/importance_sampling_ratio/min": 0.004095663316547871, "sampling/sampling_logp_difference/max": 1.7639042139053345, "sampling/sampling_logp_difference/mean": 0.46124327182769775, "step": 943, "step_time": 11.923763263010187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5732797384262085, "epoch": 0.00944, "grad_norm": 0.09927833080291748, "kl": 0.7515649423003197, "learning_rate": 9.999618444801806e-06, "loss": -0.0039, "step": 944, "step_time": 6.436732680005662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 478.09375, "completions/mean_terminated_length": 475.6773986816406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.816296339035034, "epoch": 0.00945, "frac_reward_zero_std": 0.25, "grad_norm": 0.21843570470809937, "kl": 0.6623215451836586, "learning_rate": 9.999617603923354e-06, "loss": -0.0084, "num_tokens": 22631044.0, "reward": 0.7349539995193481, "reward_std": 0.25481829047203064, "rewards/rollout_reward_func/mean": 0.7349539995193481, "rewards/rollout_reward_func/std": 0.8618153929710388, "sampling/importance_sampling_ratio/max": 0.5550148487091064, "sampling/importance_sampling_ratio/mean": 0.2760593295097351, "sampling/importance_sampling_ratio/min": 4.710022927234547e-10, "sampling/sampling_logp_difference/max": 3.624277114868164, "sampling/sampling_logp_difference/mean": 0.5692020654678345, "step": 945, "step_time": 10.91914958198322 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 3.820192039012909, "epoch": 0.00946, "grad_norm": 0.12124331295490265, "kl": 0.6635278649628162, "learning_rate": 9.999616762119397e-06, "loss": -0.0087, "step": 946, "step_time": 5.681791579001583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 277.65625, "completions/mean_terminated_length": 277.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.875456839799881, "epoch": 0.00947, "frac_reward_zero_std": 0.5, "grad_norm": 0.08424906432628632, "kl": 0.7873030230402946, "learning_rate": 9.999615919389935e-06, "loss": -0.0014, "num_tokens": 22678150.0, "reward": 1.0713716745376587, "reward_std": 0.19489607214927673, "rewards/rollout_reward_func/mean": 1.0713716745376587, "rewards/rollout_reward_func/std": 0.39053788781166077, "sampling/importance_sampling_ratio/max": 0.5535350441932678, "sampling/importance_sampling_ratio/mean": 0.4023309350013733, "sampling/importance_sampling_ratio/min": 0.12762027978897095, "sampling/sampling_logp_difference/max": 1.0605636835098267, "sampling/sampling_logp_difference/mean": 0.3318920135498047, "step": 947, "step_time": 8.160544009995647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 2.9539271891117096, "epoch": 0.00948, "grad_norm": 0.07878883183002472, "kl": 0.7698107734322548, "learning_rate": 9.999615075734968e-06, "loss": -0.0019, "step": 948, "step_time": 4.920650033011043 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 352.40625, "completions/mean_terminated_length": 352.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4436485171318054, "epoch": 0.00949, "frac_reward_zero_std": 0.5, "grad_norm": 0.08189689368009567, "kl": 0.6649428308010101, "learning_rate": 9.999614231154497e-06, "loss": -0.0047, "num_tokens": 22729165.0, "reward": 1.0210397243499756, "reward_std": 0.23726533353328705, "rewards/rollout_reward_func/mean": 1.0210397243499756, "rewards/rollout_reward_func/std": 0.4941025376319885, "sampling/importance_sampling_ratio/max": 0.5500928163528442, "sampling/importance_sampling_ratio/mean": 0.34545111656188965, "sampling/importance_sampling_ratio/min": 2.4224246740478517e-12, "sampling/sampling_logp_difference/max": 14.635427474975586, "sampling/sampling_logp_difference/mean": 0.7040596008300781, "step": 949, "step_time": 10.452708443015581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.033854166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033854166977107525, "entropy": 3.5491029024124146, "epoch": 0.0095, "grad_norm": 0.02803112007677555, "kl": 0.6486968696117401, "learning_rate": 9.999613385648523e-06, "loss": -0.005, "step": 950, "step_time": 5.832582660994376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 175.8125, "completions/mean_terminated_length": 175.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.452557682991028, "epoch": 0.00951, "frac_reward_zero_std": 0.75, "grad_norm": 0.028370115906000137, "kl": 0.7127286419272423, "learning_rate": 9.999612539217044e-06, "loss": -0.0005, "num_tokens": 22771383.0, "reward": 1.0641448497772217, "reward_std": 0.17914064228534698, "rewards/rollout_reward_func/mean": 1.0641448497772217, "rewards/rollout_reward_func/std": 0.346671462059021, "sampling/importance_sampling_ratio/max": 0.5509682893753052, "sampling/importance_sampling_ratio/mean": 0.3929736316204071, "sampling/importance_sampling_ratio/min": 0.009075653739273548, "sampling/sampling_logp_difference/max": 1.8585349321365356, "sampling/sampling_logp_difference/mean": 0.4541240632534027, "step": 951, "step_time": 10.075319103001675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.589779168367386, "epoch": 0.00952, "grad_norm": 0.031322378665208817, "kl": 0.6953304558992386, "learning_rate": 9.999611691860062e-06, "loss": -0.0005, "step": 952, "step_time": 5.395811879003304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 415.53125, "completions/mean_terminated_length": 415.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.042195022106171, "epoch": 0.00953, "frac_reward_zero_std": 0.25, "grad_norm": 0.10861092805862427, "kl": 0.7158765532076359, "learning_rate": 9.999610843577577e-06, "loss": 0.0013, "num_tokens": 22823531.0, "reward": 0.4797612130641937, "reward_std": 0.7166582345962524, "rewards/rollout_reward_func/mean": 0.4797612130641937, "rewards/rollout_reward_func/std": 0.9546231627464294, "sampling/importance_sampling_ratio/max": 0.5501277446746826, "sampling/importance_sampling_ratio/mean": 0.2014792561531067, "sampling/importance_sampling_ratio/min": 4.6308023909891975e-15, "sampling/sampling_logp_difference/max": 4.012730121612549, "sampling/sampling_logp_difference/mean": 1.054319977760315, "step": 953, "step_time": 10.272469629991974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.12355899810791, "epoch": 0.00954, "grad_norm": 0.10539785772562027, "kl": 0.7089306712150574, "learning_rate": 9.999609994369586e-06, "loss": 0.0011, "step": 954, "step_time": 6.205366476999188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "completions/clipped_ratio": 0.03125, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 824.5, "completions/mean_terminated_length": 830.806396484375, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "entropy": 4.873484492301941, "epoch": 0.00955, "frac_reward_zero_std": 0.0, "grad_norm": 0.04701424390077591, "kl": 0.5485302172601223, "learning_rate": 9.999609144236094e-06, "loss": 0.0008, "num_tokens": 22892615.0, "reward": 0.8856679797172546, "reward_std": 0.5345062613487244, "rewards/rollout_reward_func/mean": 0.8856679797172546, "rewards/rollout_reward_func/std": 0.7665066719055176, "sampling/importance_sampling_ratio/max": 0.17956383526325226, "sampling/importance_sampling_ratio/mean": 0.05644465237855911, "sampling/importance_sampling_ratio/min": 8.074128964431857e-15, "sampling/sampling_logp_difference/max": 14.510169982910156, "sampling/sampling_logp_difference/mean": 1.0830450057983398, "step": 955, "step_time": 11.454990125988843 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.013749999925494194, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02781250001862645, "entropy": 4.956103205680847, "epoch": 0.00956, "grad_norm": 0.03821816295385361, "kl": 0.5380986742675304, "learning_rate": 9.999608293177099e-06, "loss": 0.0007, "step": 956, "step_time": 6.673302663002687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 255.60000610351562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.043659090995789, "epoch": 0.00957, "frac_reward_zero_std": 0.25, "grad_norm": 0.12083621323108673, "kl": 0.7754873558878899, "learning_rate": 9.9996074411926e-06, "loss": -0.0009, "num_tokens": 22939411.0, "reward": -0.006917774677276611, "reward_std": 0.5557221174240112, "rewards/rollout_reward_func/mean": -0.006917774677276611, "rewards/rollout_reward_func/std": 1.0694139003753662, "sampling/importance_sampling_ratio/max": 0.540764570236206, "sampling/importance_sampling_ratio/mean": 0.1960860639810562, "sampling/importance_sampling_ratio/min": 8.08419891562529e-17, "sampling/sampling_logp_difference/max": 4.455624580383301, "sampling/sampling_logp_difference/mean": 1.082715630531311, "step": 957, "step_time": 10.209506034007063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.053564965724945, "epoch": 0.00958, "grad_norm": 0.12116611003875732, "kl": 0.7720189765095711, "learning_rate": 9.9996065882826e-06, "loss": -0.0012, "step": 958, "step_time": 5.73824918599712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 450.65625, "completions/mean_terminated_length": 442.3548278808594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.858855813741684, "epoch": 0.00959, "frac_reward_zero_std": 0.5, "grad_norm": 0.018139397725462914, "kl": 0.5971951112151146, "learning_rate": 9.999605734447097e-06, "loss": -0.0002, "num_tokens": 22992036.0, "reward": 0.8907556533813477, "reward_std": 0.31311020255088806, "rewards/rollout_reward_func/mean": 0.8907556533813477, "rewards/rollout_reward_func/std": 0.5692058801651001, "sampling/importance_sampling_ratio/max": 0.5478548407554626, "sampling/importance_sampling_ratio/mean": 0.20095543563365936, "sampling/importance_sampling_ratio/min": 1.2682716963467245e-16, "sampling/sampling_logp_difference/max": 3.4894447326660156, "sampling/sampling_logp_difference/mean": 1.0204156637191772, "step": 959, "step_time": 10.88797721798619 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.8125185668468475, "epoch": 0.0096, "grad_norm": 0.014486213214695454, "kl": 0.6000241748988628, "learning_rate": 9.999604879686092e-06, "loss": -0.0003, "step": 960, "step_time": 6.509022513993841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 612.28125, "completions/mean_terminated_length": 611.258056640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.777412116527557, "epoch": 0.00961, "frac_reward_zero_std": 0.25, "grad_norm": 0.015663791447877884, "kl": 0.7067319825291634, "learning_rate": 9.999604023999587e-06, "loss": 0.0011, "num_tokens": 23049727.0, "reward": 0.6988149285316467, "reward_std": 0.40751200914382935, "rewards/rollout_reward_func/mean": 0.6988149285316467, "rewards/rollout_reward_func/std": 0.8430649638175964, "sampling/importance_sampling_ratio/max": 0.541477382183075, "sampling/importance_sampling_ratio/mean": 0.12464139610528946, "sampling/importance_sampling_ratio/min": 9.20760256234443e-12, "sampling/sampling_logp_difference/max": 14.256279945373535, "sampling/sampling_logp_difference/mean": 1.1258395910263062, "step": 961, "step_time": 11.15532535300008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 5.779981315135956, "epoch": 0.00962, "grad_norm": 0.015573974698781967, "kl": 0.6990835964679718, "learning_rate": 9.999603167387578e-06, "loss": 0.0011, "step": 962, "step_time": 6.337896675991942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1565.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 679.28125, "completions/mean_terminated_length": 675.6774291992188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.825135171413422, "epoch": 0.00963, "frac_reward_zero_std": 0.25, "grad_norm": 0.014453604817390442, "kl": 0.6104420721530914, "learning_rate": 9.999602309850068e-06, "loss": -0.002, "num_tokens": 23112551.0, "reward": 0.699454128742218, "reward_std": 0.3849160075187683, "rewards/rollout_reward_func/mean": 0.699454128742218, "rewards/rollout_reward_func/std": 0.799207329750061, "sampling/importance_sampling_ratio/max": 0.5527753829956055, "sampling/importance_sampling_ratio/mean": 0.16589370369911194, "sampling/importance_sampling_ratio/min": 1.4834444023800205e-16, "sampling/sampling_logp_difference/max": 3.8086252212524414, "sampling/sampling_logp_difference/mean": 0.8530404567718506, "step": 963, "step_time": 12.291303920021164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.821809947490692, "epoch": 0.00964, "grad_norm": 0.013152669183909893, "kl": 0.6069780737161636, "learning_rate": 9.999601451387057e-06, "loss": -0.002, "step": 964, "step_time": 6.600034424984187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1277.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 555.51611328125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 4.641388952732086, "epoch": 0.00965, "frac_reward_zero_std": 0.0, "grad_norm": 0.05389261990785599, "kl": 0.6762375384569168, "learning_rate": 9.999600591998547e-06, "loss": 0.0037, "num_tokens": 23173219.0, "reward": 0.9912129044532776, "reward_std": 0.42668086290359497, "rewards/rollout_reward_func/mean": 0.9912129044532776, "rewards/rollout_reward_func/std": 0.6140649318695068, "sampling/importance_sampling_ratio/max": 0.3006269335746765, "sampling/importance_sampling_ratio/mean": 0.08354897797107697, "sampling/importance_sampling_ratio/min": 4.112235291869881e-12, "sampling/sampling_logp_difference/max": 12.753003120422363, "sampling/sampling_logp_difference/mean": 0.8583312630653381, "step": 965, "step_time": 11.328580929999589 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 4.568196713924408, "epoch": 0.00966, "grad_norm": 0.04860686883330345, "kl": 0.677258126437664, "learning_rate": 9.999599731684533e-06, "loss": 0.0034, "step": 966, "step_time": 5.791432997990341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 779.34375, "completions/mean_terminated_length": 779.34375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "entropy": 5.311013102531433, "epoch": 0.00967, "frac_reward_zero_std": 0.0, "grad_norm": 0.03481242433190346, "kl": 0.6907995976507664, "learning_rate": 9.99959887044502e-06, "loss": -0.0062, "num_tokens": 23239924.0, "reward": 0.11245100200176239, "reward_std": 1.1198699474334717, "rewards/rollout_reward_func/mean": 0.11245100200176239, "rewards/rollout_reward_func/std": 1.092604398727417, "sampling/importance_sampling_ratio/max": 0.2972244918346405, "sampling/importance_sampling_ratio/mean": 0.05577573925256729, "sampling/importance_sampling_ratio/min": 1.1996556092269683e-13, "sampling/sampling_logp_difference/max": 4.392571449279785, "sampling/sampling_logp_difference/mean": 0.89178466796875, "step": 967, "step_time": 12.017086905005272 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.216181397438049, "epoch": 0.00968, "grad_norm": 0.02370266430079937, "kl": 0.6961629763245583, "learning_rate": 9.999598008280007e-06, "loss": -0.0063, "step": 968, "step_time": 6.986920174000261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 389.90625, "completions/mean_terminated_length": 383.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9752757847309113, "epoch": 0.00969, "frac_reward_zero_std": 0.25, "grad_norm": 0.026512950658798218, "kl": 0.6881235912442207, "learning_rate": 9.999597145189494e-06, "loss": -0.0072, "num_tokens": 23291571.0, "reward": 1.0974844694137573, "reward_std": 0.21513430774211884, "rewards/rollout_reward_func/mean": 1.0974844694137573, "rewards/rollout_reward_func/std": 0.4124569892883301, "sampling/importance_sampling_ratio/max": 0.5491604208946228, "sampling/importance_sampling_ratio/mean": 0.25361353158950806, "sampling/importance_sampling_ratio/min": 3.1564200038414203e-12, "sampling/sampling_logp_difference/max": 9.279319763183594, "sampling/sampling_logp_difference/mean": 0.8469645977020264, "step": 969, "step_time": 8.650700197991682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.913731336593628, "epoch": 0.0097, "grad_norm": 0.026359520852565765, "kl": 0.6971999183297157, "learning_rate": 9.999596281173482e-06, "loss": -0.0072, "step": 970, "step_time": 4.575416960004077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 640.34375, "completions/mean_terminated_length": 617.258056640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.283310890197754, "epoch": 0.00971, "frac_reward_zero_std": 0.25, "grad_norm": 0.04000231623649597, "kl": 0.7240562438964844, "learning_rate": 9.999595416231968e-06, "loss": -0.0008, "num_tokens": 23352272.0, "reward": 0.7350499033927917, "reward_std": 0.20749299228191376, "rewards/rollout_reward_func/mean": 0.7350499033927917, "rewards/rollout_reward_func/std": 0.8555464744567871, "sampling/importance_sampling_ratio/max": 0.546396791934967, "sampling/importance_sampling_ratio/mean": 0.18949942290782928, "sampling/importance_sampling_ratio/min": 1.0605907428337669e-12, "sampling/sampling_logp_difference/max": 4.036533355712891, "sampling/sampling_logp_difference/mean": 0.6488973498344421, "step": 971, "step_time": 11.814209482996375 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 4.25919783115387, "epoch": 0.00972, "grad_norm": 0.040390707552433014, "kl": 0.7268297001719475, "learning_rate": 9.999594550364955e-06, "loss": -0.0008, "step": 972, "step_time": 6.111696231993847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 967.21875, "completions/mean_terminated_length": 967.21875, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "entropy": 5.000036656856537, "epoch": 0.00973, "frac_reward_zero_std": 0.0, "grad_norm": 0.06556100398302078, "kl": 0.6068676933646202, "learning_rate": 9.999593683572444e-06, "loss": -0.0095, "num_tokens": 23425827.0, "reward": 0.37182748317718506, "reward_std": 0.90622878074646, "rewards/rollout_reward_func/mean": 0.37182748317718506, "rewards/rollout_reward_func/std": 0.9619581699371338, "sampling/importance_sampling_ratio/max": 0.2976311147212982, "sampling/importance_sampling_ratio/mean": 0.08905059099197388, "sampling/importance_sampling_ratio/min": 9.379683661714378e-12, "sampling/sampling_logp_difference/max": 4.355246543884277, "sampling/sampling_logp_difference/mean": 0.7424236536026001, "step": 973, "step_time": 12.816057418982382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.960120320320129, "epoch": 0.00974, "grad_norm": 0.05774989724159241, "kl": 0.607686672359705, "learning_rate": 9.999592815854433e-06, "loss": -0.0098, "step": 974, "step_time": 6.545029619010165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 570.625, "completions/mean_terminated_length": 570.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.1959854662418365, "epoch": 0.00975, "frac_reward_zero_std": 0.25, "grad_norm": 0.04172062501311302, "kl": 0.7348470389842987, "learning_rate": 9.999591947210923e-06, "loss": -0.0003, "num_tokens": 23484635.0, "reward": 0.952018141746521, "reward_std": 0.25795984268188477, "rewards/rollout_reward_func/mean": 0.952018141746521, "rewards/rollout_reward_func/std": 0.6233644485473633, "sampling/importance_sampling_ratio/max": 0.5505441427230835, "sampling/importance_sampling_ratio/mean": 0.24132469296455383, "sampling/importance_sampling_ratio/min": 2.010455135703637e-10, "sampling/sampling_logp_difference/max": 11.899221420288086, "sampling/sampling_logp_difference/mean": 0.6796488761901855, "step": 975, "step_time": 10.888217476996942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.18396520614624, "epoch": 0.00976, "grad_norm": 0.037317126989364624, "kl": 0.7373814731836319, "learning_rate": 9.999591077641915e-06, "loss": -0.0003, "step": 976, "step_time": 6.041234903008444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 424.46875, "completions/mean_terminated_length": 424.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.3529422879219055, "epoch": 0.00977, "frac_reward_zero_std": 0.25, "grad_norm": 0.11640133708715439, "kl": 0.6750902682542801, "learning_rate": 9.999590207147407e-06, "loss": -0.0069, "num_tokens": 23537390.0, "reward": 0.423724889755249, "reward_std": 0.4692642390727997, "rewards/rollout_reward_func/mean": 0.423724889755249, "rewards/rollout_reward_func/std": 0.980891227722168, "sampling/importance_sampling_ratio/max": 0.5485250949859619, "sampling/importance_sampling_ratio/mean": 0.22764188051223755, "sampling/importance_sampling_ratio/min": 7.524637659653738e-19, "sampling/sampling_logp_difference/max": 13.832743644714355, "sampling/sampling_logp_difference/mean": 0.8429832458496094, "step": 977, "step_time": 9.509690471997601 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.367637366056442, "epoch": 0.00978, "grad_norm": 0.034426137804985046, "kl": 0.6628688797354698, "learning_rate": 9.999589335727404e-06, "loss": -0.0073, "step": 978, "step_time": 4.837850839998282 }, { "clip_ratio/high_max": 0.017708333674818277, "clip_ratio/high_mean": 0.008854166837409139, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008854166837409139, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 822.53125, "completions/mean_terminated_length": 822.53125, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "entropy": 4.580320835113525, "epoch": 0.00979, "frac_reward_zero_std": 0.0, "grad_norm": 0.047514017671346664, "kl": 0.7386870756745338, "learning_rate": 9.9995884633819e-06, "loss": 0.0005, "num_tokens": 23606957.0, "reward": 0.2920800447463989, "reward_std": 0.6823729872703552, "rewards/rollout_reward_func/mean": 0.2920800447463989, "rewards/rollout_reward_func/std": 1.086691975593567, "sampling/importance_sampling_ratio/max": 0.2891578674316406, "sampling/importance_sampling_ratio/mean": 0.1132071241736412, "sampling/importance_sampling_ratio/min": 4.657125379781135e-12, "sampling/sampling_logp_difference/max": 3.7220358848571777, "sampling/sampling_logp_difference/mean": 0.6732253432273865, "step": 979, "step_time": 11.752268241994898 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 4.598024249076843, "epoch": 0.0098, "grad_norm": 0.049051105976104736, "kl": 0.7384000271558762, "learning_rate": 9.9995875901109e-06, "loss": 0.0004, "step": 980, "step_time": 6.148695844996837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 632.625, "completions/mean_terminated_length": 632.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8785711526870728, "epoch": 0.00981, "frac_reward_zero_std": 0.25, "grad_norm": 0.012054949067533016, "kl": 0.6959945186972618, "learning_rate": 9.999586715914402e-06, "loss": 0.008, "num_tokens": 23667047.0, "reward": 1.0518594980239868, "reward_std": 0.2386079579591751, "rewards/rollout_reward_func/mean": 1.0518594980239868, "rewards/rollout_reward_func/std": 0.4908691346645355, "sampling/importance_sampling_ratio/max": 0.5514014363288879, "sampling/importance_sampling_ratio/mean": 0.24650728702545166, "sampling/importance_sampling_ratio/min": 5.13275089097931e-10, "sampling/sampling_logp_difference/max": 3.4484877586364746, "sampling/sampling_logp_difference/mean": 0.5564125776290894, "step": 981, "step_time": 11.574967648994061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8684687316417694, "epoch": 0.00982, "grad_norm": 0.0112694026902318, "kl": 0.6952349655330181, "learning_rate": 9.999585840792405e-06, "loss": 0.008, "step": 982, "step_time": 6.137606370008143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 366.46875, "completions/mean_terminated_length": 377.774169921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4478474259376526, "epoch": 0.00983, "frac_reward_zero_std": 0.25, "grad_norm": 0.02646995149552822, "kl": 0.7314128801226616, "learning_rate": 9.999584964744914e-06, "loss": -0.0066, "num_tokens": 23717300.0, "reward": 1.078500747680664, "reward_std": 0.2230343222618103, "rewards/rollout_reward_func/mean": 1.078500747680664, "rewards/rollout_reward_func/std": 0.42312145233154297, "sampling/importance_sampling_ratio/max": 0.5501765012741089, "sampling/importance_sampling_ratio/mean": 0.3445752263069153, "sampling/importance_sampling_ratio/min": 2.34424817335821e-08, "sampling/sampling_logp_difference/max": 2.896688461303711, "sampling/sampling_logp_difference/mean": 0.5539132356643677, "step": 983, "step_time": 11.104640922974795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.443566530942917, "epoch": 0.00984, "grad_norm": 0.02737184427678585, "kl": 0.7318117618560791, "learning_rate": 9.999584087771923e-06, "loss": -0.0066, "step": 984, "step_time": 5.948298301998875 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "completions/clipped_ratio": 0.03125, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 677.34375, "completions/mean_terminated_length": 678.1290283203125, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 4.515717774629593, "epoch": 0.00985, "frac_reward_zero_std": 0.25, "grad_norm": 0.06250448524951935, "kl": 0.6423786953091621, "learning_rate": 9.999583209873438e-06, "loss": -0.0044, "num_tokens": 23779803.0, "reward": 0.6552852988243103, "reward_std": 0.5103462934494019, "rewards/rollout_reward_func/mean": 0.6552852988243103, "rewards/rollout_reward_func/std": 0.8581331372261047, "sampling/importance_sampling_ratio/max": 0.301928848028183, "sampling/importance_sampling_ratio/mean": 0.15511086583137512, "sampling/importance_sampling_ratio/min": 1.5525949666542194e-12, "sampling/sampling_logp_difference/max": 4.313401222229004, "sampling/sampling_logp_difference/mean": 0.7200040817260742, "step": 985, "step_time": 11.201680667996698 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 4.523561596870422, "epoch": 0.00986, "grad_norm": 0.06010211259126663, "kl": 0.6374649256467819, "learning_rate": 9.999582331049455e-06, "loss": -0.0045, "step": 986, "step_time": 5.831420118985989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 254.34375, "completions/mean_terminated_length": 262.0322570800781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.541371166706085, "epoch": 0.00987, "frac_reward_zero_std": 0.5, "grad_norm": 0.04155676066875458, "kl": 0.7012351006269455, "learning_rate": 9.999581451299976e-06, "loss": -0.0074, "num_tokens": 23825276.0, "reward": 0.4422667920589447, "reward_std": 0.26758167147636414, "rewards/rollout_reward_func/mean": 0.4422667920589447, "rewards/rollout_reward_func/std": 1.0378878116607666, "sampling/importance_sampling_ratio/max": 0.5507044196128845, "sampling/importance_sampling_ratio/mean": 0.41820091009140015, "sampling/importance_sampling_ratio/min": 1.7293503055352405e-13, "sampling/sampling_logp_difference/max": 4.619731903076172, "sampling/sampling_logp_difference/mean": 0.6739413738250732, "step": 987, "step_time": 10.4872786479973 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 3.540975898504257, "epoch": 0.00988, "grad_norm": 0.022405270487070084, "kl": 0.7004428952932358, "learning_rate": 9.999580570625e-06, "loss": -0.0074, "step": 988, "step_time": 6.02943217199936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 587.6875, "completions/mean_terminated_length": 587.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.830281913280487, "epoch": 0.00989, "frac_reward_zero_std": 0.5, "grad_norm": 0.01851630210876465, "kl": 0.6575273908674717, "learning_rate": 9.99957968902453e-06, "loss": -0.0061, "num_tokens": 23883234.0, "reward": 0.8881833553314209, "reward_std": 0.38109052181243896, "rewards/rollout_reward_func/mean": 0.8881833553314209, "rewards/rollout_reward_func/std": 0.7165822386741638, "sampling/importance_sampling_ratio/max": 0.5413999557495117, "sampling/importance_sampling_ratio/mean": 0.2547625005245209, "sampling/importance_sampling_ratio/min": 7.535061984087488e-12, "sampling/sampling_logp_difference/max": 3.499765157699585, "sampling/sampling_logp_difference/mean": 0.6155714392662048, "step": 989, "step_time": 12.139230427994335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8313040137290955, "epoch": 0.0099, "grad_norm": 0.019380757585167885, "kl": 0.6594627238810062, "learning_rate": 9.999578806498565e-06, "loss": -0.0061, "step": 990, "step_time": 6.2851990569979534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1391.0, "completions/max_terminated_length": 1391.0, "completions/mean_length": 645.1875, "completions/mean_terminated_length": 645.1875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 4.557336181402206, "epoch": 0.00991, "frac_reward_zero_std": 0.0, "grad_norm": 0.04504440352320671, "kl": 0.6799027398228645, "learning_rate": 9.999577923047102e-06, "loss": -0.0092, "num_tokens": 23945676.0, "reward": 1.1791322231292725, "reward_std": 0.042779386043548584, "rewards/rollout_reward_func/mean": 1.1791322231292725, "rewards/rollout_reward_func/std": 0.04709625989198685, "sampling/importance_sampling_ratio/max": 0.30318453907966614, "sampling/importance_sampling_ratio/mean": 0.1569148600101471, "sampling/importance_sampling_ratio/min": 1.0126224495684298e-21, "sampling/sampling_logp_difference/max": 11.35831069946289, "sampling/sampling_logp_difference/mean": 1.0365245342254639, "step": 991, "step_time": 11.55647997300548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.533345639705658, "epoch": 0.00992, "grad_norm": 0.0454535186290741, "kl": 0.6810985505580902, "learning_rate": 9.999577038670144e-06, "loss": -0.0092, "step": 992, "step_time": 6.051136581991159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 555.75, "completions/mean_terminated_length": 555.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8084298074245453, "epoch": 0.00993, "frac_reward_zero_std": 0.25, "grad_norm": 0.039017196744680405, "kl": 0.4802627004683018, "learning_rate": 9.999576153367693e-06, "loss": -0.0107, "num_tokens": 24003526.0, "reward": 0.5950574278831482, "reward_std": 0.7197784781455994, "rewards/rollout_reward_func/mean": 0.5950574278831482, "rewards/rollout_reward_func/std": 0.9638783931732178, "sampling/importance_sampling_ratio/max": 0.551956832408905, "sampling/importance_sampling_ratio/mean": 0.2586813271045685, "sampling/importance_sampling_ratio/min": 0.0014478731900453568, "sampling/sampling_logp_difference/max": 2.124206066131592, "sampling/sampling_logp_difference/mean": 0.5126742124557495, "step": 993, "step_time": 11.362349848015583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7913385927677155, "epoch": 0.00994, "grad_norm": 0.051527608186006546, "kl": 0.4817902073264122, "learning_rate": 9.999575267139748e-06, "loss": -0.0108, "step": 994, "step_time": 6.282914354000241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 603.34375, "completions/mean_terminated_length": 603.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7402433156967163, "epoch": 0.00995, "frac_reward_zero_std": 0.25, "grad_norm": 0.01476216223090887, "kl": 0.6623366251587868, "learning_rate": 9.999574379986306e-06, "loss": -0.0108, "num_tokens": 24063279.0, "reward": 0.8537921905517578, "reward_std": 0.5097861886024475, "rewards/rollout_reward_func/mean": 0.8537921905517578, "rewards/rollout_reward_func/std": 0.8019309043884277, "sampling/importance_sampling_ratio/max": 0.545613706111908, "sampling/importance_sampling_ratio/mean": 0.24160926043987274, "sampling/importance_sampling_ratio/min": 0.0008832769235596061, "sampling/sampling_logp_difference/max": 4.236098766326904, "sampling/sampling_logp_difference/mean": 0.5239253044128418, "step": 995, "step_time": 11.576721673976863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.738920360803604, "epoch": 0.00996, "grad_norm": 0.01500903069972992, "kl": 0.6606502123177052, "learning_rate": 9.99957349190737e-06, "loss": -0.0108, "step": 996, "step_time": 6.2205043339927215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 325.9375, "completions/mean_terminated_length": 332.16668701171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.873122423887253, "epoch": 0.00997, "frac_reward_zero_std": 0.25, "grad_norm": 0.037762612104415894, "kl": 0.7267659455537796, "learning_rate": 9.99957260290294e-06, "loss": -0.0146, "num_tokens": 24113132.0, "reward": 1.0411605834960938, "reward_std": 0.20320191979408264, "rewards/rollout_reward_func/mean": 1.0411605834960938, "rewards/rollout_reward_func/std": 0.3801014721393585, "sampling/importance_sampling_ratio/max": 0.5527579188346863, "sampling/importance_sampling_ratio/mean": 0.3375300168991089, "sampling/importance_sampling_ratio/min": 2.2298276167675123e-14, "sampling/sampling_logp_difference/max": 4.954793930053711, "sampling/sampling_logp_difference/mean": 0.7946081757545471, "step": 997, "step_time": 11.294603582995478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.863828420639038, "epoch": 0.00998, "grad_norm": 0.034559641033411026, "kl": 0.7285930290818214, "learning_rate": 9.999571712973018e-06, "loss": -0.0147, "step": 998, "step_time": 5.974524504003057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 278.84375, "completions/mean_terminated_length": 278.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.860451817512512, "epoch": 0.00999, "frac_reward_zero_std": 0.75, "grad_norm": 0.0023496069479733706, "kl": 0.8382352441549301, "learning_rate": 9.9995708221176e-06, "loss": 0.0015, "num_tokens": 24157698.0, "reward": 0.6296960711479187, "reward_std": 0.1122015118598938, "rewards/rollout_reward_func/mean": 0.6296960711479187, "rewards/rollout_reward_func/std": 0.8581942915916443, "sampling/importance_sampling_ratio/max": 0.553615927696228, "sampling/importance_sampling_ratio/mean": 0.40776923298835754, "sampling/importance_sampling_ratio/min": 0.0007384553318843246, "sampling/sampling_logp_difference/max": 2.1785287857055664, "sampling/sampling_logp_difference/mean": 0.5823751091957092, "step": 999, "step_time": 10.392267750998144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.864382952451706, "epoch": 0.01, "grad_norm": 0.002353657502681017, "kl": 0.8372042402625084, "learning_rate": 9.99956993033669e-06, "loss": 0.0015, "step": 1000, "step_time": 6.2344839770084945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1507.0, "completions/max_terminated_length": 1507.0, "completions/mean_length": 772.0, "completions/mean_terminated_length": 772.0, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 4.320258229970932, "epoch": 0.01001, "frac_reward_zero_std": 0.0, "grad_norm": 0.03833257406949997, "kl": 0.7714856937527657, "learning_rate": 9.999569037630288e-06, "loss": -0.0159, "num_tokens": 24224320.0, "reward": 0.8221930861473083, "reward_std": 0.5352391004562378, "rewards/rollout_reward_func/mean": 0.8221930861473083, "rewards/rollout_reward_func/std": 0.8164379596710205, "sampling/importance_sampling_ratio/max": 0.305854469537735, "sampling/importance_sampling_ratio/mean": 0.16174450516700745, "sampling/importance_sampling_ratio/min": 8.037157744889001e-12, "sampling/sampling_logp_difference/max": 4.2294511795043945, "sampling/sampling_logp_difference/mean": 0.6953935623168945, "step": 1001, "step_time": 11.789504747997853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.3271342515945435, "epoch": 0.01002, "grad_norm": 0.03802679851651192, "kl": 0.7698334157466888, "learning_rate": 9.99956814399839e-06, "loss": -0.0159, "step": 1002, "step_time": 6.695228971017059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 775.34375, "completions/mean_terminated_length": 775.34375, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 3.956084281206131, "epoch": 0.01003, "frac_reward_zero_std": 0.25, "grad_norm": 0.021655065938830376, "kl": 0.6435342878103256, "learning_rate": 9.999567249441e-06, "loss": -0.0128, "num_tokens": 24290569.0, "reward": 0.8141170144081116, "reward_std": 0.4738766551017761, "rewards/rollout_reward_func/mean": 0.8141170144081116, "rewards/rollout_reward_func/std": 0.7359580397605896, "sampling/importance_sampling_ratio/max": 0.307858407497406, "sampling/importance_sampling_ratio/mean": 0.17211441695690155, "sampling/importance_sampling_ratio/min": 4.3143216316821054e-05, "sampling/sampling_logp_difference/max": 5.084605693817139, "sampling/sampling_logp_difference/mean": 0.5627256631851196, "step": 1003, "step_time": 11.815280444003292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9575119614601135, "epoch": 0.01004, "grad_norm": 0.02132711000740528, "kl": 0.6417199857532978, "learning_rate": 9.999566353958118e-06, "loss": -0.0129, "step": 1004, "step_time": 6.451344005996361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 379.71875, "completions/mean_terminated_length": 391.45159912109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3677240014076233, "epoch": 0.01005, "frac_reward_zero_std": 0.5, "grad_norm": 0.05939796194434166, "kl": 0.6725525856018066, "learning_rate": 9.999565457549745e-06, "loss": 0.0009, "num_tokens": 24341357.0, "reward": 1.1182734966278076, "reward_std": 0.02189493365585804, "rewards/rollout_reward_func/mean": 1.1182734966278076, "rewards/rollout_reward_func/std": 0.12907308340072632, "sampling/importance_sampling_ratio/max": 0.5505664944648743, "sampling/importance_sampling_ratio/mean": 0.34911975264549255, "sampling/importance_sampling_ratio/min": 1.8570657164818094e-11, "sampling/sampling_logp_difference/max": 2.6924827098846436, "sampling/sampling_logp_difference/mean": 0.5028355121612549, "step": 1005, "step_time": 10.634539398015477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.378414809703827, "epoch": 0.01006, "grad_norm": 0.03228818252682686, "kl": 0.6726972311735153, "learning_rate": 9.999564560215878e-06, "loss": 0.0008, "step": 1006, "step_time": 6.322411455003021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 725.34375, "completions/mean_terminated_length": 725.34375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "entropy": 4.5604943335056305, "epoch": 0.01007, "frac_reward_zero_std": 0.25, "grad_norm": 0.03457186743617058, "kl": 0.7125952877104282, "learning_rate": 9.999563661956521e-06, "loss": -0.0044, "num_tokens": 24404966.0, "reward": 0.35546088218688965, "reward_std": 0.698154866695404, "rewards/rollout_reward_func/mean": 0.35546088218688965, "rewards/rollout_reward_func/std": 1.0184643268585205, "sampling/importance_sampling_ratio/max": 0.30441054701805115, "sampling/importance_sampling_ratio/mean": 0.17248937487602234, "sampling/importance_sampling_ratio/min": 0.00023882991808932275, "sampling/sampling_logp_difference/max": 2.3175463676452637, "sampling/sampling_logp_difference/mean": 0.6945215463638306, "step": 1007, "step_time": 10.962814246013295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.566156834363937, "epoch": 0.01008, "grad_norm": 0.017302200198173523, "kl": 0.7155376970767975, "learning_rate": 9.999562762771671e-06, "loss": -0.0044, "step": 1008, "step_time": 6.286445513011131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.6881998479366302, "epoch": 0.01009, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002516929234843701, "kl": 0.8386494442820549, "learning_rate": 9.999561862661328e-06, "loss": 0.001, "num_tokens": 24439762.0, "reward": 1.1019694805145264, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.1019694805145264, "rewards/rollout_reward_func/std": 0.003151739714667201, "sampling/importance_sampling_ratio/max": 0.5536758303642273, "sampling/importance_sampling_ratio/mean": 0.5441502928733826, "sampling/importance_sampling_ratio/min": 0.5321888327598572, "sampling/sampling_logp_difference/max": 0.6298795938491821, "sampling/sampling_logp_difference/mean": 0.30428993701934814, "step": 1009, "step_time": 4.900091969007917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6834003627300262, "epoch": 0.0101, "grad_norm": 0.00025685926084406674, "kl": 0.839431993663311, "learning_rate": 9.999560961625496e-06, "loss": 0.001, "step": 1010, "step_time": 2.610620902989467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 676.0, "completions/mean_terminated_length": 676.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.320781588554382, "epoch": 0.01011, "frac_reward_zero_std": 0.25, "grad_norm": 0.061148274689912796, "kl": 0.5296310968697071, "learning_rate": 9.999560059664175e-06, "loss": -0.0067, "num_tokens": 24501904.0, "reward": 0.45766177773475647, "reward_std": 0.7177019119262695, "rewards/rollout_reward_func/mean": 0.45766177773475647, "rewards/rollout_reward_func/std": 0.9404799342155457, "sampling/importance_sampling_ratio/max": 0.5547874569892883, "sampling/importance_sampling_ratio/mean": 0.2265568971633911, "sampling/importance_sampling_ratio/min": 3.2527832032353546e-17, "sampling/sampling_logp_difference/max": 3.8880791664123535, "sampling/sampling_logp_difference/mean": 0.745362401008606, "step": 1011, "step_time": 11.685776790982345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.321219176054001, "epoch": 0.01012, "grad_norm": 0.06330855935811996, "kl": 0.5314729437232018, "learning_rate": 9.999559156777358e-06, "loss": -0.0068, "step": 1012, "step_time": 6.895532885988359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 513.875, "completions/mean_terminated_length": 529.9354858398438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.721266329288483, "epoch": 0.01013, "frac_reward_zero_std": 0.25, "grad_norm": 0.0402718149125576, "kl": 0.712926834821701, "learning_rate": 9.999558252965055e-06, "loss": 0.0032, "num_tokens": 24555878.0, "reward": 0.1656104326248169, "reward_std": 0.5648130774497986, "rewards/rollout_reward_func/mean": 0.1656104326248169, "rewards/rollout_reward_func/std": 1.0891296863555908, "sampling/importance_sampling_ratio/max": 0.5505205988883972, "sampling/importance_sampling_ratio/mean": 0.27395185828208923, "sampling/importance_sampling_ratio/min": 2.411047164870006e-13, "sampling/sampling_logp_difference/max": 4.423028469085693, "sampling/sampling_logp_difference/mean": 0.9477685689926147, "step": 1013, "step_time": 10.910316916990269 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.742702513933182, "epoch": 0.01014, "grad_norm": 0.013910000212490559, "kl": 0.7135629542171955, "learning_rate": 9.99955734822726e-06, "loss": 0.0031, "step": 1014, "step_time": 6.485865349997766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 495.0625, "completions/mean_terminated_length": 495.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.395742356777191, "epoch": 0.01015, "frac_reward_zero_std": 0.25, "grad_norm": 0.030310133472085, "kl": 0.7143898606300354, "learning_rate": 9.999556442563976e-06, "loss": -0.0059, "num_tokens": 24612036.0, "reward": 1.0092322826385498, "reward_std": 0.26441699266433716, "rewards/rollout_reward_func/mean": 1.0092322826385498, "rewards/rollout_reward_func/std": 0.5787513256072998, "sampling/importance_sampling_ratio/max": 0.5478367209434509, "sampling/importance_sampling_ratio/mean": 0.3012220859527588, "sampling/importance_sampling_ratio/min": 0.00025138340424746275, "sampling/sampling_logp_difference/max": 2.5183629989624023, "sampling/sampling_logp_difference/mean": 0.4639310836791992, "step": 1015, "step_time": 11.395134374994086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.404837489128113, "epoch": 0.01016, "grad_norm": 0.030500415712594986, "kl": 0.7141380868852139, "learning_rate": 9.9995555359752e-06, "loss": -0.006, "step": 1016, "step_time": 6.209970696996606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1617.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 357.46875, "completions/mean_terminated_length": 357.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.359949439764023, "epoch": 0.01017, "frac_reward_zero_std": 0.5, "grad_norm": 0.01392175629734993, "kl": 0.6419147104024887, "learning_rate": 9.999554628460936e-06, "loss": -0.0033, "num_tokens": 24662592.0, "reward": -0.12130655348300934, "reward_std": 0.2032841444015503, "rewards/rollout_reward_func/mean": -0.12130655348300934, "rewards/rollout_reward_func/std": 1.021456241607666, "sampling/importance_sampling_ratio/max": 0.5524564981460571, "sampling/importance_sampling_ratio/mean": 0.37686824798583984, "sampling/importance_sampling_ratio/min": 5.550881837513527e-19, "sampling/sampling_logp_difference/max": 4.427207946777344, "sampling/sampling_logp_difference/mean": 0.9118939638137817, "step": 1017, "step_time": 11.691977120004594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.377725452184677, "epoch": 0.01018, "grad_norm": 0.013598811812698841, "kl": 0.6300220228731632, "learning_rate": 9.999553720021182e-06, "loss": -0.0033, "step": 1018, "step_time": 6.917577698011883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 617.03125, "completions/mean_terminated_length": 617.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.6914574801921844, "epoch": 0.01019, "frac_reward_zero_std": 0.25, "grad_norm": 0.02624981850385666, "kl": 0.6002871207892895, "learning_rate": 9.999552810655939e-06, "loss": -0.0012, "num_tokens": 24723246.0, "reward": 0.6770861148834229, "reward_std": 0.49599361419677734, "rewards/rollout_reward_func/mean": 0.6770861148834229, "rewards/rollout_reward_func/std": 0.8258283734321594, "sampling/importance_sampling_ratio/max": 0.5512887239456177, "sampling/importance_sampling_ratio/mean": 0.23025217652320862, "sampling/importance_sampling_ratio/min": 4.9030387802367296e-15, "sampling/sampling_logp_difference/max": 12.28420639038086, "sampling/sampling_logp_difference/mean": 0.9253937602043152, "step": 1019, "step_time": 11.581428350989881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.705299407243729, "epoch": 0.0102, "grad_norm": 0.0268723051995039, "kl": 0.5978711619973183, "learning_rate": 9.999551900365207e-06, "loss": -0.0012, "step": 1020, "step_time": 6.644435677008005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 782.875, "completions/mean_terminated_length": 782.875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 4.529172897338867, "epoch": 0.01021, "frac_reward_zero_std": 0.0, "grad_norm": 0.08566246181726456, "kl": 0.5309905186295509, "learning_rate": 9.999550989148985e-06, "loss": -0.0046, "num_tokens": 24790160.0, "reward": 0.3241449296474457, "reward_std": 0.8944140672683716, "rewards/rollout_reward_func/mean": 0.3241449296474457, "rewards/rollout_reward_func/std": 1.1006196737289429, "sampling/importance_sampling_ratio/max": 0.3060303330421448, "sampling/importance_sampling_ratio/mean": 0.14218854904174805, "sampling/importance_sampling_ratio/min": 4.3720915527956095e-06, "sampling/sampling_logp_difference/max": 4.393390655517578, "sampling/sampling_logp_difference/mean": 0.7215535640716553, "step": 1021, "step_time": 11.48300823300815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.559604287147522, "epoch": 0.01022, "grad_norm": 0.09068978577852249, "kl": 0.526032280176878, "learning_rate": 9.999550077007277e-06, "loss": -0.0048, "step": 1022, "step_time": 6.2081753429883975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 549.09375, "completions/mean_terminated_length": 549.09375, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 3.5752796828746796, "epoch": 0.01023, "frac_reward_zero_std": 0.0, "grad_norm": 0.09235069155693054, "kl": 0.6304505504667759, "learning_rate": 9.999549163940078e-06, "loss": -0.0095, "num_tokens": 24849527.0, "reward": 1.0052635669708252, "reward_std": 0.4396611452102661, "rewards/rollout_reward_func/mean": 1.0052635669708252, "rewards/rollout_reward_func/std": 0.6377554535865784, "sampling/importance_sampling_ratio/max": 0.3085218071937561, "sampling/importance_sampling_ratio/mean": 0.224301278591156, "sampling/importance_sampling_ratio/min": 3.870422593532652e-10, "sampling/sampling_logp_difference/max": 9.454436302185059, "sampling/sampling_logp_difference/mean": 0.5551091432571411, "step": 1023, "step_time": 10.30152972999349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 3.6470492780208588, "epoch": 0.01024, "grad_norm": 0.09544817358255386, "kl": 0.616082739084959, "learning_rate": 9.999548249947393e-06, "loss": -0.0099, "step": 1024, "step_time": 6.227001680992544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 985.0, "completions/mean_terminated_length": 985.0, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 5.591999650001526, "epoch": 0.01025, "frac_reward_zero_std": 0.0, "grad_norm": 0.047996893525123596, "kl": 0.6562155671417713, "learning_rate": 9.999547335029218e-06, "loss": -0.0026, "num_tokens": 24922819.0, "reward": 0.5181870460510254, "reward_std": 1.0879014730453491, "rewards/rollout_reward_func/mean": 0.5181870460510254, "rewards/rollout_reward_func/std": 1.097529649734497, "sampling/importance_sampling_ratio/max": 0.30235108733177185, "sampling/importance_sampling_ratio/mean": 0.09555739909410477, "sampling/importance_sampling_ratio/min": 2.1360762654760583e-18, "sampling/sampling_logp_difference/max": 13.517428398132324, "sampling/sampling_logp_difference/mean": 1.0766173601150513, "step": 1025, "step_time": 14.47120615998574 }, { "clip_ratio/high_max": 0.03705357201397419, "clip_ratio/high_mean": 0.018526786006987095, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026339286006987095, "entropy": 5.65596342086792, "epoch": 0.01026, "grad_norm": 0.023338790982961655, "kl": 0.641275878995657, "learning_rate": 9.999546419185557e-06, "loss": -0.0028, "step": 1026, "step_time": 7.7857103959831875 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.03125, "completions/max_length": 810.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 151.0625, "completions/mean_terminated_length": 129.8064422607422, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1305853128433228, "epoch": 0.01027, "frac_reward_zero_std": 0.75, "grad_norm": 0.03914261609315872, "kl": 0.7541664130985737, "learning_rate": 9.99954550241641e-06, "loss": 0.0047, "num_tokens": 24965355.0, "reward": 0.7065185308456421, "reward_std": 0.25022971630096436, "rewards/rollout_reward_func/mean": 0.7065185308456421, "rewards/rollout_reward_func/std": 0.8244343996047974, "sampling/importance_sampling_ratio/max": 0.5529090166091919, "sampling/importance_sampling_ratio/mean": 0.4422188997268677, "sampling/importance_sampling_ratio/min": 1.6333998389961302e-11, "sampling/sampling_logp_difference/max": 3.957094192504883, "sampling/sampling_logp_difference/mean": 0.4971124231815338, "step": 1027, "step_time": 8.534991103006178 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.1656832695007324, "epoch": 0.01028, "grad_norm": 0.039249833673238754, "kl": 0.7500411421060562, "learning_rate": 9.999544584721775e-06, "loss": 0.0046, "step": 1028, "step_time": 4.659153545013396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 672.84375, "completions/mean_terminated_length": 672.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.728755980730057, "epoch": 0.01029, "frac_reward_zero_std": 0.0, "grad_norm": 0.029930420219898224, "kl": 0.5063529312610626, "learning_rate": 9.99954366610165e-06, "loss": -0.0046, "num_tokens": 25029282.0, "reward": 0.7790826559066772, "reward_std": 0.510993242263794, "rewards/rollout_reward_func/mean": 0.7790826559066772, "rewards/rollout_reward_func/std": 0.8527136445045471, "sampling/importance_sampling_ratio/max": 0.30104896426200867, "sampling/importance_sampling_ratio/mean": 0.1326015293598175, "sampling/importance_sampling_ratio/min": 1.0610841094660373e-09, "sampling/sampling_logp_difference/max": 11.420570373535156, "sampling/sampling_logp_difference/mean": 0.7760703563690186, "step": 1029, "step_time": 11.08072875799553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.759628623723984, "epoch": 0.0103, "grad_norm": 0.030473923310637474, "kl": 0.503642562776804, "learning_rate": 9.999542746556042e-06, "loss": -0.0045, "step": 1030, "step_time": 6.62545380499796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.03125, "completions/max_length": 1080.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 300.774169921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.451302140951157, "epoch": 0.01031, "frac_reward_zero_std": 0.5, "grad_norm": 0.05567794665694237, "kl": 0.668168444186449, "learning_rate": 9.999541826084947e-06, "loss": 0.001, "num_tokens": 25076776.0, "reward": 0.4052836298942566, "reward_std": 0.2870924174785614, "rewards/rollout_reward_func/mean": 0.4052836298942566, "rewards/rollout_reward_func/std": 1.0612505674362183, "sampling/importance_sampling_ratio/max": 0.5448439717292786, "sampling/importance_sampling_ratio/mean": 0.26586055755615234, "sampling/importance_sampling_ratio/min": 3.197852401921864e-15, "sampling/sampling_logp_difference/max": 3.9818434715270996, "sampling/sampling_logp_difference/mean": 0.9162989854812622, "step": 1031, "step_time": 9.58731192000414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.473619878292084, "epoch": 0.01032, "grad_norm": 0.050992753356695175, "kl": 0.6654686704277992, "learning_rate": 9.999540904688363e-06, "loss": 0.0009, "step": 1032, "step_time": 5.189514588004386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1367.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 498.375, "completions/mean_terminated_length": 498.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8276524245738983, "epoch": 0.01033, "frac_reward_zero_std": 0.25, "grad_norm": 0.08054932951927185, "kl": 0.48530082032084465, "learning_rate": 9.999539982366296e-06, "loss": -0.0082, "num_tokens": 25134523.0, "reward": 0.9995776414871216, "reward_std": 0.3615216612815857, "rewards/rollout_reward_func/mean": 0.9995776414871216, "rewards/rollout_reward_func/std": 0.5351185202598572, "sampling/importance_sampling_ratio/max": 0.5503594875335693, "sampling/importance_sampling_ratio/mean": 0.24128466844558716, "sampling/importance_sampling_ratio/min": 0.0009260592050850391, "sampling/sampling_logp_difference/max": 2.603914976119995, "sampling/sampling_logp_difference/mean": 0.5201395750045776, "step": 1033, "step_time": 11.010333972997614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.8746362924575806, "epoch": 0.01034, "grad_norm": 0.04838310182094574, "kl": 0.4856616444885731, "learning_rate": 9.99953905911874e-06, "loss": -0.0083, "step": 1034, "step_time": 6.030993909997051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2230.0, "completions/max_terminated_length": 2230.0, "completions/mean_length": 1021.46875, "completions/mean_terminated_length": 1021.46875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 5.550190687179565, "epoch": 0.01035, "frac_reward_zero_std": 0.0, "grad_norm": 0.04598653316497803, "kl": 0.5416033565998077, "learning_rate": 9.999538134945701e-06, "loss": -0.0038, "num_tokens": 25209420.0, "reward": 1.018578052520752, "reward_std": 0.461384654045105, "rewards/rollout_reward_func/mean": 1.018578052520752, "rewards/rollout_reward_func/std": 0.6250579953193665, "sampling/importance_sampling_ratio/max": 0.29996684193611145, "sampling/importance_sampling_ratio/mean": 0.10153479874134064, "sampling/importance_sampling_ratio/min": 3.347646467911559e-19, "sampling/sampling_logp_difference/max": 11.782835960388184, "sampling/sampling_logp_difference/mean": 1.0699148178100586, "step": 1035, "step_time": 14.642759574991942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.556042850017548, "epoch": 0.01036, "grad_norm": 0.04634781926870346, "kl": 0.5368504002690315, "learning_rate": 9.999537209847177e-06, "loss": -0.0038, "step": 1036, "step_time": 7.926616626995383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 746.96875, "completions/mean_terminated_length": 746.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.820212036371231, "epoch": 0.01037, "frac_reward_zero_std": 0.25, "grad_norm": 0.06530660390853882, "kl": 0.5914548635482788, "learning_rate": 9.999536283823168e-06, "loss": 0.0033, "num_tokens": 25274220.0, "reward": 0.9111982583999634, "reward_std": 0.32386481761932373, "rewards/rollout_reward_func/mean": 0.9111982583999634, "rewards/rollout_reward_func/std": 0.7519922852516174, "sampling/importance_sampling_ratio/max": 0.551983654499054, "sampling/importance_sampling_ratio/mean": 0.15915067493915558, "sampling/importance_sampling_ratio/min": 9.129172244548567e-12, "sampling/sampling_logp_difference/max": 12.509172439575195, "sampling/sampling_logp_difference/mean": 0.8153478503227234, "step": 1037, "step_time": 11.915868279007555 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "entropy": 4.864973545074463, "epoch": 0.01038, "grad_norm": 0.06561736762523651, "kl": 0.5860785618424416, "learning_rate": 9.999535356873673e-06, "loss": 0.0032, "step": 1038, "step_time": 6.194830085012654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 589.0625, "completions/mean_terminated_length": 589.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.888054609298706, "epoch": 0.01039, "frac_reward_zero_std": 0.5, "grad_norm": 0.027497727423906326, "kl": 0.5400012135505676, "learning_rate": 9.999534428998694e-06, "loss": -0.0066, "num_tokens": 25332130.0, "reward": 0.8196384906768799, "reward_std": 0.5527935028076172, "rewards/rollout_reward_func/mean": 0.8196384906768799, "rewards/rollout_reward_func/std": 0.8270880579948425, "sampling/importance_sampling_ratio/max": 0.5414133071899414, "sampling/importance_sampling_ratio/mean": 0.22690114378929138, "sampling/importance_sampling_ratio/min": 1.2608435879762187e-15, "sampling/sampling_logp_difference/max": 4.06583833694458, "sampling/sampling_logp_difference/mean": 0.8975342512130737, "step": 1039, "step_time": 12.88091157598683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.9394590854644775, "epoch": 0.0104, "grad_norm": 0.02789594791829586, "kl": 0.5342586673796177, "learning_rate": 9.999533500198229e-06, "loss": -0.0066, "step": 1040, "step_time": 7.260426685999846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1712.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 527.21875, "completions/mean_terminated_length": 527.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.971460342407227, "epoch": 0.01041, "frac_reward_zero_std": 0.25, "grad_norm": 0.04106605798006058, "kl": 0.6933506950736046, "learning_rate": 9.999532570472281e-06, "loss": -0.0041, "num_tokens": 25388497.0, "reward": 0.7963317632675171, "reward_std": 0.7215906381607056, "rewards/rollout_reward_func/mean": 0.7963317632675171, "rewards/rollout_reward_func/std": 0.8599229454994202, "sampling/importance_sampling_ratio/max": 0.5271613597869873, "sampling/importance_sampling_ratio/mean": 0.18455016613006592, "sampling/importance_sampling_ratio/min": 2.589203536903367e-13, "sampling/sampling_logp_difference/max": 3.582059383392334, "sampling/sampling_logp_difference/mean": 0.8794147968292236, "step": 1041, "step_time": 12.56226800200966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.962617129087448, "epoch": 0.01042, "grad_norm": 0.04334601014852524, "kl": 0.6949352994561195, "learning_rate": 9.99953163982085e-06, "loss": -0.0041, "step": 1042, "step_time": 6.8576044459987315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 766.5, "completions/mean_terminated_length": 766.5, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 4.977444142103195, "epoch": 0.01043, "frac_reward_zero_std": 0.0, "grad_norm": 0.09308309108018875, "kl": 0.3753646593540907, "learning_rate": 9.999530708243934e-06, "loss": -0.001, "num_tokens": 25454571.0, "reward": 0.67226243019104, "reward_std": 0.72996586561203, "rewards/rollout_reward_func/mean": 0.67226243019104, "rewards/rollout_reward_func/std": 0.9444065690040588, "sampling/importance_sampling_ratio/max": 0.3067317605018616, "sampling/importance_sampling_ratio/mean": 0.11937634646892548, "sampling/importance_sampling_ratio/min": 2.9920586407800043e-13, "sampling/sampling_logp_difference/max": 3.4378137588500977, "sampling/sampling_logp_difference/mean": 0.832266092300415, "step": 1043, "step_time": 11.209824703000777 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 4.862116575241089, "epoch": 0.01044, "grad_norm": 0.08831577748060226, "kl": 0.3819267489016056, "learning_rate": 9.999529775741534e-06, "loss": -0.0013, "step": 1044, "step_time": 6.175025324999297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1183.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 540.625, "completions/mean_terminated_length": 540.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.504600942134857, "epoch": 0.01045, "frac_reward_zero_std": 0.25, "grad_norm": 0.05342355743050575, "kl": 0.5972879379987717, "learning_rate": 9.999528842313652e-06, "loss": -0.0008, "num_tokens": 25511683.0, "reward": 0.1037360429763794, "reward_std": 0.03832197189331055, "rewards/rollout_reward_func/mean": 0.1037360429763794, "rewards/rollout_reward_func/std": 1.0575379133224487, "sampling/importance_sampling_ratio/max": 0.5487291812896729, "sampling/importance_sampling_ratio/mean": 0.20981019735336304, "sampling/importance_sampling_ratio/min": 9.652777225710452e-05, "sampling/sampling_logp_difference/max": 2.4976301193237305, "sampling/sampling_logp_difference/mean": 0.6333364248275757, "step": 1045, "step_time": 10.562854440002411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.381782740354538, "epoch": 0.01046, "grad_norm": 0.03776256740093231, "kl": 0.6027568429708481, "learning_rate": 9.999527907960287e-06, "loss": -0.001, "step": 1046, "step_time": 5.738917253984255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 247.6875, "completions/mean_terminated_length": 247.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4445791244506836, "epoch": 0.01047, "frac_reward_zero_std": 0.75, "grad_norm": 0.014525948092341423, "kl": 0.6891071945428848, "learning_rate": 9.999526972681438e-06, "loss": -0.0008, "num_tokens": 25555335.0, "reward": 0.9124147891998291, "reward_std": 0.30116006731987, "rewards/rollout_reward_func/mean": 0.9124147891998291, "rewards/rollout_reward_func/std": 0.664121150970459, "sampling/importance_sampling_ratio/max": 0.5531890392303467, "sampling/importance_sampling_ratio/mean": 0.41298162937164307, "sampling/importance_sampling_ratio/min": 0.005324002355337143, "sampling/sampling_logp_difference/max": 1.8816885948181152, "sampling/sampling_logp_difference/mean": 0.45271480083465576, "step": 1047, "step_time": 10.610623346001375 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.389825314283371, "epoch": 0.01048, "grad_norm": 0.011522998102009296, "kl": 0.6927279978990555, "learning_rate": 9.999526036477107e-06, "loss": -0.0008, "step": 1048, "step_time": 5.83977680499811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1381.0, "completions/max_terminated_length": 1381.0, "completions/mean_length": 602.34375, "completions/mean_terminated_length": 602.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.0719955265522, "epoch": 0.01049, "frac_reward_zero_std": 0.25, "grad_norm": 0.034175071865320206, "kl": 0.6995933800935745, "learning_rate": 9.999525099347293e-06, "loss": -0.0026, "num_tokens": 25615824.0, "reward": 0.6797000765800476, "reward_std": 0.02440585196018219, "rewards/rollout_reward_func/mean": 0.6797000765800476, "rewards/rollout_reward_func/std": 0.9397779107093811, "sampling/importance_sampling_ratio/max": 0.5405997633934021, "sampling/importance_sampling_ratio/mean": 0.2237962782382965, "sampling/importance_sampling_ratio/min": 0.005434779915958643, "sampling/sampling_logp_difference/max": 3.322289228439331, "sampling/sampling_logp_difference/mean": 0.5174392461776733, "step": 1049, "step_time": 11.04574620600033 }, { "clip_ratio/high_max": 0.037500000558793545, "clip_ratio/high_mean": 0.018750000279396772, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "entropy": 3.959404468536377, "epoch": 0.0105, "grad_norm": 0.025269271805882454, "kl": 0.7099743075668812, "learning_rate": 9.999524161291997e-06, "loss": -0.0027, "step": 1050, "step_time": 6.135734422001406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 566.65625, "completions/mean_terminated_length": 562.0322265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7922723591327667, "epoch": 0.01051, "frac_reward_zero_std": 0.25, "grad_norm": 0.05978448688983917, "kl": 0.5607864782214165, "learning_rate": 9.99952322231122e-06, "loss": -0.0097, "num_tokens": 25674225.0, "reward": 0.8863339424133301, "reward_std": 0.4809461832046509, "rewards/rollout_reward_func/mean": 0.8863339424133301, "rewards/rollout_reward_func/std": 0.7405045032501221, "sampling/importance_sampling_ratio/max": 0.5581222176551819, "sampling/importance_sampling_ratio/mean": 0.28131115436553955, "sampling/importance_sampling_ratio/min": 7.946578561009128e-17, "sampling/sampling_logp_difference/max": 3.6559629440307617, "sampling/sampling_logp_difference/mean": 0.6898416876792908, "step": 1051, "step_time": 10.679668030999892 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.739591062068939, "epoch": 0.01052, "grad_norm": 0.052436064928770065, "kl": 0.5643653385341167, "learning_rate": 9.99952228240496e-06, "loss": -0.0099, "step": 1052, "step_time": 6.537937793997116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 744.3125, "completions/mean_terminated_length": 744.3125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 4.078800916671753, "epoch": 0.01053, "frac_reward_zero_std": 0.0, "grad_norm": 0.06671728193759918, "kl": 0.5488265790045261, "learning_rate": 9.999521341573221e-06, "loss": -0.0129, "num_tokens": 25739671.0, "reward": 0.8018462061882019, "reward_std": 0.8814876675605774, "rewards/rollout_reward_func/mean": 0.8018462061882019, "rewards/rollout_reward_func/std": 0.8799902200698853, "sampling/importance_sampling_ratio/max": 0.30570435523986816, "sampling/importance_sampling_ratio/mean": 0.19048956036567688, "sampling/importance_sampling_ratio/min": 8.682094575601898e-22, "sampling/sampling_logp_difference/max": 10.970651626586914, "sampling/sampling_logp_difference/mean": 0.7355945110321045, "step": 1053, "step_time": 13.609246706990234 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 4.028005868196487, "epoch": 0.01054, "grad_norm": 0.06463883817195892, "kl": 0.5575513876974583, "learning_rate": 9.999520399815998e-06, "loss": -0.0129, "step": 1054, "step_time": 8.102156984983594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 817.09375, "completions/mean_terminated_length": 842.9354858398438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.739003747701645, "epoch": 0.01055, "frac_reward_zero_std": 0.25, "grad_norm": 0.01176211517304182, "kl": 0.5881996154785156, "learning_rate": 9.999519457133295e-06, "loss": -0.0097, "num_tokens": 25805283.0, "reward": 0.8128964304924011, "reward_std": 0.5337049961090088, "rewards/rollout_reward_func/mean": 0.8128964304924011, "rewards/rollout_reward_func/std": 0.8017608523368835, "sampling/importance_sampling_ratio/max": 0.5477339029312134, "sampling/importance_sampling_ratio/mean": 0.22528736293315887, "sampling/importance_sampling_ratio/min": 7.570434893116457e-14, "sampling/sampling_logp_difference/max": 3.421675443649292, "sampling/sampling_logp_difference/mean": 0.8229086399078369, "step": 1055, "step_time": 13.010881876987696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.722516179084778, "epoch": 0.01056, "grad_norm": 0.011036389507353306, "kl": 0.5912976041436195, "learning_rate": 9.999518513525112e-06, "loss": -0.0097, "step": 1056, "step_time": 7.471165534996544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2327.0, "completions/max_terminated_length": 2327.0, "completions/mean_length": 972.90625, "completions/mean_terminated_length": 972.90625, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "entropy": 3.9856348037719727, "epoch": 0.01057, "frac_reward_zero_std": 0.0, "grad_norm": 0.026641665026545525, "kl": 0.6648684851825237, "learning_rate": 9.999517568991448e-06, "loss": -0.0034, "num_tokens": 25877852.0, "reward": 0.7140668630599976, "reward_std": 0.46829813718795776, "rewards/rollout_reward_func/mean": 0.7140668630599976, "rewards/rollout_reward_func/std": 0.8218598365783691, "sampling/importance_sampling_ratio/max": 0.2947962284088135, "sampling/importance_sampling_ratio/mean": 0.17257186770439148, "sampling/importance_sampling_ratio/min": 1.8290707837209652e-10, "sampling/sampling_logp_difference/max": 11.571012496948242, "sampling/sampling_logp_difference/mean": 0.6456868648529053, "step": 1057, "step_time": 14.576181814009033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9565739035606384, "epoch": 0.01058, "grad_norm": 0.021450255066156387, "kl": 0.6679041758179665, "learning_rate": 9.999516623532303e-06, "loss": -0.0035, "step": 1058, "step_time": 8.198530587993446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1259.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 501.6875, "completions/mean_terminated_length": 501.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.60075381398201, "epoch": 0.01059, "frac_reward_zero_std": 0.25, "grad_norm": 0.054699476808309555, "kl": 0.7468784190714359, "learning_rate": 9.99951567714768e-06, "loss": 0.002, "num_tokens": 25934074.0, "reward": -0.24788403511047363, "reward_std": 0.31091806292533875, "rewards/rollout_reward_func/mean": -0.24788403511047363, "rewards/rollout_reward_func/std": 1.0722310543060303, "sampling/importance_sampling_ratio/max": 0.5534303784370422, "sampling/importance_sampling_ratio/mean": 0.28316783905029297, "sampling/importance_sampling_ratio/min": 0.009340609423816204, "sampling/sampling_logp_difference/max": 2.2044708728790283, "sampling/sampling_logp_difference/mean": 0.45913827419281006, "step": 1059, "step_time": 10.290746816019237 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.5760727524757385, "epoch": 0.0106, "grad_norm": 0.04353693872690201, "kl": 0.7492297515273094, "learning_rate": 9.999514729837577e-06, "loss": 0.0019, "step": 1060, "step_time": 6.10096113000327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2206.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 686.0, "completions/mean_terminated_length": 688.2257690429688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9909788072109222, "epoch": 0.01061, "frac_reward_zero_std": 0.25, "grad_norm": 0.18192502856254578, "kl": 0.6046500243246555, "learning_rate": 9.999513781601992e-06, "loss": -0.004, "num_tokens": 25995959.0, "reward": 0.8134256601333618, "reward_std": 0.6388046741485596, "rewards/rollout_reward_func/mean": 0.8134256601333618, "rewards/rollout_reward_func/std": 0.7715610265731812, "sampling/importance_sampling_ratio/max": 0.5529170632362366, "sampling/importance_sampling_ratio/mean": 0.25156545639038086, "sampling/importance_sampling_ratio/min": 2.2784482633930736e-10, "sampling/sampling_logp_difference/max": 4.064633369445801, "sampling/sampling_logp_difference/mean": 0.6391501426696777, "step": 1061, "step_time": 13.76093883098656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.9721734821796417, "epoch": 0.01062, "grad_norm": 0.06032123044133186, "kl": 0.6087890304625034, "learning_rate": 9.99951283244093e-06, "loss": -0.0048, "step": 1062, "step_time": 7.5395271529923775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 491.25, "completions/mean_terminated_length": 491.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0621531307697296, "epoch": 0.01063, "frac_reward_zero_std": 0.5, "grad_norm": 0.060449108481407166, "kl": 0.727036826312542, "learning_rate": 9.999511882354389e-06, "loss": 0.0014, "num_tokens": 26050034.0, "reward": 1.1002600193023682, "reward_std": 0.22186657786369324, "rewards/rollout_reward_func/mean": 1.1002600193023682, "rewards/rollout_reward_func/std": 0.4125767946243286, "sampling/importance_sampling_ratio/max": 0.5546656847000122, "sampling/importance_sampling_ratio/mean": 0.34787845611572266, "sampling/importance_sampling_ratio/min": 0.002637215657159686, "sampling/sampling_logp_difference/max": 3.6517081260681152, "sampling/sampling_logp_difference/mean": 0.3706575632095337, "step": 1063, "step_time": 11.598691175000567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0767181515693665, "epoch": 0.01064, "grad_norm": 0.06728089600801468, "kl": 0.7247865870594978, "learning_rate": 9.999510931342367e-06, "loss": 0.0013, "step": 1064, "step_time": 6.704310576002172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1955.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 626.78125, "completions/mean_terminated_length": 646.4838256835938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.542897135019302, "epoch": 0.01065, "frac_reward_zero_std": 0.25, "grad_norm": 0.029909098520874977, "kl": 0.7484886646270752, "learning_rate": 9.999509979404867e-06, "loss": -0.0041, "num_tokens": 26109026.0, "reward": 0.2629876136779785, "reward_std": 0.47295162081718445, "rewards/rollout_reward_func/mean": 0.2629876136779785, "rewards/rollout_reward_func/std": 1.086465835571289, "sampling/importance_sampling_ratio/max": 0.5544455051422119, "sampling/importance_sampling_ratio/mean": 0.2481609433889389, "sampling/importance_sampling_ratio/min": 3.478289298296591e-14, "sampling/sampling_logp_difference/max": 3.95489239692688, "sampling/sampling_logp_difference/mean": 0.7128568887710571, "step": 1065, "step_time": 12.665996997995535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.548022091388702, "epoch": 0.01066, "grad_norm": 0.030365876853466034, "kl": 0.7498941272497177, "learning_rate": 9.999509026541889e-06, "loss": -0.0041, "step": 1066, "step_time": 7.732990934004192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2501.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 823.84375, "completions/mean_terminated_length": 849.9031982421875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.0500006675720215, "epoch": 0.01067, "frac_reward_zero_std": 0.0, "grad_norm": 0.03833025321364403, "kl": 0.72388856112957, "learning_rate": 9.999508072753433e-06, "loss": -0.0085, "num_tokens": 26175967.0, "reward": 0.5092016458511353, "reward_std": 0.688133955001831, "rewards/rollout_reward_func/mean": 0.5092016458511353, "rewards/rollout_reward_func/std": 0.9623427391052246, "sampling/importance_sampling_ratio/max": 0.5530078411102295, "sampling/importance_sampling_ratio/mean": 0.24246399104595184, "sampling/importance_sampling_ratio/min": 4.5143336985242755e-11, "sampling/sampling_logp_difference/max": 3.9595580101013184, "sampling/sampling_logp_difference/mean": 0.6385789513587952, "step": 1067, "step_time": 14.324690089000796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.05137500166893, "epoch": 0.01068, "grad_norm": 0.03820168972015381, "kl": 0.7241230085492134, "learning_rate": 9.999507118039498e-06, "loss": -0.0085, "step": 1068, "step_time": 8.118026545002067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 1109.0, "completions/mean_terminated_length": 1109.0, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "entropy": 4.014167904853821, "epoch": 0.01069, "frac_reward_zero_std": 0.0, "grad_norm": 0.03062097541987896, "kl": 0.8848347328603268, "learning_rate": 9.999506162400088e-06, "loss": -0.0029, "num_tokens": 26254307.0, "reward": 0.36360055208206177, "reward_std": 0.9355719089508057, "rewards/rollout_reward_func/mean": 0.36360055208206177, "rewards/rollout_reward_func/std": 1.069339394569397, "sampling/importance_sampling_ratio/max": 0.3050198554992676, "sampling/importance_sampling_ratio/mean": 0.13403278589248657, "sampling/importance_sampling_ratio/min": 0.00012651519500650465, "sampling/sampling_logp_difference/max": 2.672262191772461, "sampling/sampling_logp_difference/mean": 0.5390782952308655, "step": 1069, "step_time": 14.582252041989705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.02396634221077, "epoch": 0.0107, "grad_norm": 0.026172611862421036, "kl": 0.8707437291741371, "learning_rate": 9.999505205835198e-06, "loss": -0.0029, "step": 1070, "step_time": 8.536914336997143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 968.40625, "completions/mean_terminated_length": 968.40625, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "entropy": 3.719958007335663, "epoch": 0.01071, "frac_reward_zero_std": 0.0, "grad_norm": 0.031597089022397995, "kl": 0.5999252237379551, "learning_rate": 9.999504248344831e-06, "loss": -0.0074, "num_tokens": 26326952.0, "reward": 0.34981369972229004, "reward_std": 0.43856021761894226, "rewards/rollout_reward_func/mean": 0.34981369972229004, "rewards/rollout_reward_func/std": 1.0440902709960938, "sampling/importance_sampling_ratio/max": 0.30514639616012573, "sampling/importance_sampling_ratio/mean": 0.16490396857261658, "sampling/importance_sampling_ratio/min": 9.123671436406244e-12, "sampling/sampling_logp_difference/max": 12.79852294921875, "sampling/sampling_logp_difference/mean": 0.6427367925643921, "step": 1071, "step_time": 13.630435414997919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.71403107047081, "epoch": 0.01072, "grad_norm": 0.030860716477036476, "kl": 0.602145541459322, "learning_rate": 9.99950328992899e-06, "loss": -0.0074, "step": 1072, "step_time": 7.425518184005341 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2119.0, "completions/max_terminated_length": 2119.0, "completions/mean_length": 687.0625, "completions/mean_terminated_length": 687.0625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 3.1765688359737396, "epoch": 0.01073, "frac_reward_zero_std": 0.5, "grad_norm": 0.06941226869821548, "kl": 0.7691433653235435, "learning_rate": 9.99950233058767e-06, "loss": 0.0052, "num_tokens": 26389710.0, "reward": 0.8416248559951782, "reward_std": 0.2962298095226288, "rewards/rollout_reward_func/mean": 0.8416248559951782, "rewards/rollout_reward_func/std": 0.7966400384902954, "sampling/importance_sampling_ratio/max": 0.3391980826854706, "sampling/importance_sampling_ratio/mean": 0.2389087677001953, "sampling/importance_sampling_ratio/min": 4.3471670558539906e-15, "sampling/sampling_logp_difference/max": 3.3596749305725098, "sampling/sampling_logp_difference/mean": 0.4891839325428009, "step": 1073, "step_time": 13.269507401004375 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.187870144844055, "epoch": 0.01074, "grad_norm": 0.06210951507091522, "kl": 0.7654987424612045, "learning_rate": 9.999501370320872e-06, "loss": 0.005, "step": 1074, "step_time": 7.383651075986563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 704.40625, "completions/mean_terminated_length": 704.40625, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "entropy": 3.665571540594101, "epoch": 0.01075, "frac_reward_zero_std": 0.25, "grad_norm": 0.022352810949087143, "kl": 0.7372833117842674, "learning_rate": 9.999500409128599e-06, "loss": -0.0006, "num_tokens": 26454293.0, "reward": 0.11863769590854645, "reward_std": 0.39109405875205994, "rewards/rollout_reward_func/mean": 0.11863769590854645, "rewards/rollout_reward_func/std": 1.1260255575180054, "sampling/importance_sampling_ratio/max": 0.3080194294452667, "sampling/importance_sampling_ratio/mean": 0.19739238917827606, "sampling/importance_sampling_ratio/min": 0.019123472273349762, "sampling/sampling_logp_difference/max": 2.2961266040802, "sampling/sampling_logp_difference/mean": 0.46557047963142395, "step": 1075, "step_time": 11.49842600399279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6744439601898193, "epoch": 0.01076, "grad_norm": 0.02347937412559986, "kl": 0.737735528498888, "learning_rate": 9.99949944701085e-06, "loss": -0.0006, "step": 1076, "step_time": 6.672511538992694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1574.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 707.53125, "completions/mean_terminated_length": 707.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.215282380580902, "epoch": 0.01077, "frac_reward_zero_std": 0.25, "grad_norm": 0.04572855308651924, "kl": 0.6756180562078953, "learning_rate": 9.999498483967625e-06, "loss": -0.0046, "num_tokens": 26516271.0, "reward": 0.8236799240112305, "reward_std": 0.5028373003005981, "rewards/rollout_reward_func/mean": 0.8236799240112305, "rewards/rollout_reward_func/std": 0.8055359721183777, "sampling/importance_sampling_ratio/max": 0.5522487759590149, "sampling/importance_sampling_ratio/mean": 0.27742791175842285, "sampling/importance_sampling_ratio/min": 2.1465865380782684e-13, "sampling/sampling_logp_difference/max": 3.3561315536499023, "sampling/sampling_logp_difference/mean": 0.47750183939933777, "step": 1077, "step_time": 11.88477135100402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.217511087656021, "epoch": 0.01078, "grad_norm": 0.04134608060121536, "kl": 0.6756375804543495, "learning_rate": 9.999497519998923e-06, "loss": -0.0048, "step": 1078, "step_time": 6.356267424998805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 692.625, "completions/mean_terminated_length": 692.3547973632812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3134100139141083, "epoch": 0.01079, "frac_reward_zero_std": 0.25, "grad_norm": 0.01712828129529953, "kl": 0.633019745349884, "learning_rate": 9.99949655510475e-06, "loss": -0.0003, "num_tokens": 26578321.0, "reward": 1.0813632011413574, "reward_std": 0.27343088388442993, "rewards/rollout_reward_func/mean": 1.0813632011413574, "rewards/rollout_reward_func/std": 0.5315710306167603, "sampling/importance_sampling_ratio/max": 0.5520837903022766, "sampling/importance_sampling_ratio/mean": 0.26901715993881226, "sampling/importance_sampling_ratio/min": 2.8358866361981316e-15, "sampling/sampling_logp_difference/max": 4.192270278930664, "sampling/sampling_logp_difference/mean": 0.5254329442977905, "step": 1079, "step_time": 11.26954471701174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3142412304878235, "epoch": 0.0108, "grad_norm": 0.01793287694454193, "kl": 0.6360182166099548, "learning_rate": 9.9994955892851e-06, "loss": -0.0003, "step": 1080, "step_time": 6.224627003008209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1896.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 817.90625, "completions/mean_terminated_length": 817.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.076970756053925, "epoch": 0.01081, "frac_reward_zero_std": 0.25, "grad_norm": 0.032178282737731934, "kl": 0.6889081075787544, "learning_rate": 9.999494622539973e-06, "loss": -0.0038, "num_tokens": 26644885.0, "reward": 0.7979466915130615, "reward_std": 0.5774991512298584, "rewards/rollout_reward_func/mean": 0.7979466915130615, "rewards/rollout_reward_func/std": 0.8347436785697937, "sampling/importance_sampling_ratio/max": 0.5487461090087891, "sampling/importance_sampling_ratio/mean": 0.22094249725341797, "sampling/importance_sampling_ratio/min": 4.599691743884486e-23, "sampling/sampling_logp_difference/max": 13.54800033569336, "sampling/sampling_logp_difference/mean": 0.8254491686820984, "step": 1081, "step_time": 13.683158025989542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.09425088763237, "epoch": 0.01082, "grad_norm": 0.024675384163856506, "kl": 0.6905715577304363, "learning_rate": 9.999493654869373e-06, "loss": -0.0038, "step": 1082, "step_time": 7.594454903999576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 864.78125, "completions/mean_terminated_length": 836.0333862304688, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "entropy": 3.7802258133888245, "epoch": 0.01083, "frac_reward_zero_std": 0.25, "grad_norm": 0.01599167473614216, "kl": 0.6066873408854008, "learning_rate": 9.999492686273298e-06, "loss": 0.0106, "num_tokens": 26713778.0, "reward": 0.594771683216095, "reward_std": 0.2117389440536499, "rewards/rollout_reward_func/mean": 0.594771683216095, "rewards/rollout_reward_func/std": 0.9965864419937134, "sampling/importance_sampling_ratio/max": 0.3004372715950012, "sampling/importance_sampling_ratio/mean": 0.1984369307756424, "sampling/importance_sampling_ratio/min": 6.253425079803397e-16, "sampling/sampling_logp_difference/max": 4.621401786804199, "sampling/sampling_logp_difference/mean": 0.6806453466415405, "step": 1083, "step_time": 13.488251457987644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7940398454666138, "epoch": 0.01084, "grad_norm": 0.016381463035941124, "kl": 0.6069911867380142, "learning_rate": 9.99949171675175e-06, "loss": 0.0106, "step": 1084, "step_time": 7.475125561017194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0625, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 919.15625, "completions/mean_terminated_length": 942.6333618164062, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 4.740765631198883, "epoch": 0.01085, "frac_reward_zero_std": 0.0, "grad_norm": 0.03067781589925289, "kl": 0.6604723073542118, "learning_rate": 9.999490746304727e-06, "loss": -0.0132, "num_tokens": 26784949.0, "reward": 0.6264994144439697, "reward_std": 0.9428136348724365, "rewards/rollout_reward_func/mean": 0.6264994144439697, "rewards/rollout_reward_func/std": 0.9565773606300354, "sampling/importance_sampling_ratio/max": 0.30440014600753784, "sampling/importance_sampling_ratio/mean": 0.11273488402366638, "sampling/importance_sampling_ratio/min": 1.9223482181085494e-11, "sampling/sampling_logp_difference/max": 11.688701629638672, "sampling/sampling_logp_difference/mean": 0.8036290407180786, "step": 1085, "step_time": 11.552544527003192 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 4.7613056898117065, "epoch": 0.01086, "grad_norm": 0.027858542278409004, "kl": 0.6655140444636345, "learning_rate": 9.999489774932232e-06, "loss": -0.0132, "step": 1086, "step_time": 6.138699575996725 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 632.78125, "completions/mean_terminated_length": 632.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6630208790302277, "epoch": 0.01087, "frac_reward_zero_std": 0.25, "grad_norm": 0.04100068658590317, "kl": 0.6756198368966579, "learning_rate": 9.999488802634262e-06, "loss": -0.0056, "num_tokens": 26847290.0, "reward": 0.42395830154418945, "reward_std": 0.44595465064048767, "rewards/rollout_reward_func/mean": 0.42395830154418945, "rewards/rollout_reward_func/std": 1.0804308652877808, "sampling/importance_sampling_ratio/max": 0.5559871196746826, "sampling/importance_sampling_ratio/mean": 0.2558334469795227, "sampling/importance_sampling_ratio/min": 3.0080786018515937e-05, "sampling/sampling_logp_difference/max": 3.13061261177063, "sampling/sampling_logp_difference/mean": 0.5221539735794067, "step": 1087, "step_time": 13.243648215990106 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.666400760412216, "epoch": 0.01088, "grad_norm": 0.04584572836756706, "kl": 0.6791002303361893, "learning_rate": 9.999487829410819e-06, "loss": -0.0058, "step": 1088, "step_time": 7.637476103001973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 695.625, "completions/mean_terminated_length": 695.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8172796070575714, "epoch": 0.01089, "frac_reward_zero_std": 0.5, "grad_norm": 0.012274743057787418, "kl": 0.7540586516261101, "learning_rate": 9.999486855261904e-06, "loss": 0.0033, "num_tokens": 26909521.0, "reward": 0.7368971109390259, "reward_std": 0.44137662649154663, "rewards/rollout_reward_func/mean": 0.7368971109390259, "rewards/rollout_reward_func/std": 0.8661063313484192, "sampling/importance_sampling_ratio/max": 0.5475568175315857, "sampling/importance_sampling_ratio/mean": 0.24894820153713226, "sampling/importance_sampling_ratio/min": 1.9218806233955998e-13, "sampling/sampling_logp_difference/max": 2.99630069732666, "sampling/sampling_logp_difference/mean": 0.5628497004508972, "step": 1089, "step_time": 12.523428241001966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8176711201667786, "epoch": 0.0109, "grad_norm": 0.01211884617805481, "kl": 0.754688061773777, "learning_rate": 9.999485880187515e-06, "loss": 0.0033, "step": 1090, "step_time": 6.902751930996601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 325.75, "completions/mean_terminated_length": 325.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0937472879886627, "epoch": 0.01091, "frac_reward_zero_std": 0.5, "grad_norm": 0.08118791878223419, "kl": 0.8039654195308685, "learning_rate": 9.999484904187655e-06, "loss": -0.0039, "num_tokens": 26957808.0, "reward": 1.138397455215454, "reward_std": 0.02274748496711254, "rewards/rollout_reward_func/mean": 1.138397455215454, "rewards/rollout_reward_func/std": 0.07342702150344849, "sampling/importance_sampling_ratio/max": 0.5490531325340271, "sampling/importance_sampling_ratio/mean": 0.3759253919124603, "sampling/importance_sampling_ratio/min": 9.989245874919561e-09, "sampling/sampling_logp_difference/max": 10.637663841247559, "sampling/sampling_logp_difference/mean": 0.49504631757736206, "step": 1091, "step_time": 9.899879195982066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.116311728954315, "epoch": 0.01092, "grad_norm": 0.06792215257883072, "kl": 0.8019489049911499, "learning_rate": 9.99948392726232e-06, "loss": -0.0041, "step": 1092, "step_time": 5.729620227000851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 630.90625, "completions/mean_terminated_length": 630.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.684675633907318, "epoch": 0.01093, "frac_reward_zero_std": 0.25, "grad_norm": 0.02378474734723568, "kl": 0.769803948700428, "learning_rate": 9.999482949411516e-06, "loss": -0.0087, "num_tokens": 27018264.0, "reward": 0.10417535901069641, "reward_std": 0.725614070892334, "rewards/rollout_reward_func/mean": 0.10417535901069641, "rewards/rollout_reward_func/std": 1.117267370223999, "sampling/importance_sampling_ratio/max": 0.550246000289917, "sampling/importance_sampling_ratio/mean": 0.21611091494560242, "sampling/importance_sampling_ratio/min": 1.3469826853906852e-07, "sampling/sampling_logp_difference/max": 4.357213973999023, "sampling/sampling_logp_difference/mean": 0.8208245038986206, "step": 1093, "step_time": 12.86589880799147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.704837322235107, "epoch": 0.01094, "grad_norm": 0.024078601971268654, "kl": 0.7671393677592278, "learning_rate": 9.99948197063524e-06, "loss": -0.0087, "step": 1094, "step_time": 7.2909858920029365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 681.875, "completions/mean_terminated_length": 681.875, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 3.665563315153122, "epoch": 0.01095, "frac_reward_zero_std": 0.0, "grad_norm": 0.08857525140047073, "kl": 0.7299753464758396, "learning_rate": 9.999480990933493e-06, "loss": -0.0107, "num_tokens": 27081180.0, "reward": 0.5196670293807983, "reward_std": 0.5652835369110107, "rewards/rollout_reward_func/mean": 0.5196670293807983, "rewards/rollout_reward_func/std": 1.040029764175415, "sampling/importance_sampling_ratio/max": 0.3040343225002289, "sampling/importance_sampling_ratio/mean": 0.18960340321063995, "sampling/importance_sampling_ratio/min": 5.4173918684341515e-09, "sampling/sampling_logp_difference/max": 4.75364875793457, "sampling/sampling_logp_difference/mean": 0.5306105613708496, "step": 1095, "step_time": 12.751328881007794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.718174546957016, "epoch": 0.01096, "grad_norm": 0.09468680620193481, "kl": 0.7226775884628296, "learning_rate": 9.999480010306273e-06, "loss": -0.0111, "step": 1096, "step_time": 7.219878668984165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2067.0, "completions/max_terminated_length": 2067.0, "completions/mean_length": 715.5625, "completions/mean_terminated_length": 715.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.398089945316315, "epoch": 0.01097, "frac_reward_zero_std": 0.25, "grad_norm": 0.0321333184838295, "kl": 0.59485087916255, "learning_rate": 9.999479028753583e-06, "loss": -0.0087, "num_tokens": 27144001.0, "reward": 0.935232400894165, "reward_std": 0.5677123665809631, "rewards/rollout_reward_func/mean": 0.935232400894165, "rewards/rollout_reward_func/std": 0.696830689907074, "sampling/importance_sampling_ratio/max": 0.5526922941207886, "sampling/importance_sampling_ratio/mean": 0.2280263602733612, "sampling/importance_sampling_ratio/min": 1.6339317168605677e-16, "sampling/sampling_logp_difference/max": 9.615973472595215, "sampling/sampling_logp_difference/mean": 0.804852306842804, "step": 1097, "step_time": 13.414518249010143 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 4.460204243659973, "epoch": 0.01098, "grad_norm": 0.0242720078676939, "kl": 0.5815465450286865, "learning_rate": 9.999478046275422e-06, "loss": -0.0088, "step": 1098, "step_time": 7.452722570007609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 510.0625, "completions/mean_terminated_length": 510.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7651880383491516, "epoch": 0.01099, "frac_reward_zero_std": 0.25, "grad_norm": 0.1177113726735115, "kl": 0.8136527836322784, "learning_rate": 9.99947706287179e-06, "loss": -0.0026, "num_tokens": 27199457.0, "reward": 0.8373656272888184, "reward_std": 0.524480938911438, "rewards/rollout_reward_func/mean": 0.8373656272888184, "rewards/rollout_reward_func/std": 0.7852200269699097, "sampling/importance_sampling_ratio/max": 0.5433353185653687, "sampling/importance_sampling_ratio/mean": 0.2321738451719284, "sampling/importance_sampling_ratio/min": 6.170897126622776e-13, "sampling/sampling_logp_difference/max": 3.9489879608154297, "sampling/sampling_logp_difference/mean": 0.6416085958480835, "step": 1099, "step_time": 10.967963013004919 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 3.7828968167304993, "epoch": 0.011, "grad_norm": 0.04674230143427849, "kl": 0.822065994143486, "learning_rate": 9.999476078542688e-06, "loss": -0.0028, "step": 1100, "step_time": 6.304222580009082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 795.375, "completions/mean_terminated_length": 795.375, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "entropy": 4.925518155097961, "epoch": 0.01101, "frac_reward_zero_std": 0.0, "grad_norm": 0.024751799181103706, "kl": 0.6251841485500336, "learning_rate": 9.999475093288116e-06, "loss": -0.022, "num_tokens": 27267531.0, "reward": 0.28219741582870483, "reward_std": 1.08160400390625, "rewards/rollout_reward_func/mean": 0.28219741582870483, "rewards/rollout_reward_func/std": 1.0725538730621338, "sampling/importance_sampling_ratio/max": 0.3083089292049408, "sampling/importance_sampling_ratio/mean": 0.11028968542814255, "sampling/importance_sampling_ratio/min": 1.7746745642043676e-13, "sampling/sampling_logp_difference/max": 3.785745143890381, "sampling/sampling_logp_difference/mean": 0.8613319396972656, "step": 1101, "step_time": 11.345596163002483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.932945519685745, "epoch": 0.01102, "grad_norm": 0.02151988446712494, "kl": 0.6182377636432648, "learning_rate": 9.999474107108074e-06, "loss": -0.022, "step": 1102, "step_time": 6.113090892002219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2106.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 740.25, "completions/mean_terminated_length": 740.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.971364200115204, "epoch": 0.01103, "frac_reward_zero_std": 0.0, "grad_norm": 0.07887359708547592, "kl": 0.6550860367715359, "learning_rate": 9.999473120002564e-06, "loss": -0.0024, "num_tokens": 27332483.0, "reward": -0.22960332036018372, "reward_std": 0.6438595652580261, "rewards/rollout_reward_func/mean": -0.22960332036018372, "rewards/rollout_reward_func/std": 0.9883885979652405, "sampling/importance_sampling_ratio/max": 0.5459045767784119, "sampling/importance_sampling_ratio/mean": 0.16598914563655853, "sampling/importance_sampling_ratio/min": 2.2169050280143165e-15, "sampling/sampling_logp_difference/max": 4.329117774963379, "sampling/sampling_logp_difference/mean": 0.9561574459075928, "step": 1103, "step_time": 13.68095746399922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.991263419389725, "epoch": 0.01104, "grad_norm": 0.07369791716337204, "kl": 0.6534667313098907, "learning_rate": 9.999472131971582e-06, "loss": -0.0024, "step": 1104, "step_time": 7.638228440009698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 516.59375, "completions/mean_terminated_length": 516.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.373591899871826, "epoch": 0.01105, "frac_reward_zero_std": 0.0, "grad_norm": 0.061281245201826096, "kl": 0.703276876360178, "learning_rate": 9.999471143015132e-06, "loss": -0.0029, "num_tokens": 27389854.0, "reward": 0.4845708906650543, "reward_std": 0.4147566854953766, "rewards/rollout_reward_func/mean": 0.4845708906650543, "rewards/rollout_reward_func/std": 1.0618455410003662, "sampling/importance_sampling_ratio/max": 0.5532063841819763, "sampling/importance_sampling_ratio/mean": 0.21389582753181458, "sampling/importance_sampling_ratio/min": 2.7667293606701334e-15, "sampling/sampling_logp_difference/max": 4.875144004821777, "sampling/sampling_logp_difference/mean": 0.859816312789917, "step": 1105, "step_time": 11.648948336987814 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 4.443784952163696, "epoch": 0.01106, "grad_norm": 0.03612392395734787, "kl": 0.6987685300409794, "learning_rate": 9.999470153133216e-06, "loss": -0.0031, "step": 1106, "step_time": 6.355801319012244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 353.0, "completions/mean_terminated_length": 363.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9310325980186462, "epoch": 0.01107, "frac_reward_zero_std": 0.25, "grad_norm": 0.046413008123636246, "kl": 0.8938448876142502, "learning_rate": 9.999469162325828e-06, "loss": -0.0106, "num_tokens": 27440204.0, "reward": 0.4896203875541687, "reward_std": 0.386188268661499, "rewards/rollout_reward_func/mean": 0.4896203875541687, "rewards/rollout_reward_func/std": 1.007868766784668, "sampling/importance_sampling_ratio/max": 0.547310471534729, "sampling/importance_sampling_ratio/mean": 0.26084330677986145, "sampling/importance_sampling_ratio/min": 8.642087045984681e-09, "sampling/sampling_logp_difference/max": 3.0682127475738525, "sampling/sampling_logp_difference/mean": 0.6499993801116943, "step": 1107, "step_time": 11.197833902006096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9627852737903595, "epoch": 0.01108, "grad_norm": 0.05246968939900398, "kl": 0.8904408812522888, "learning_rate": 9.999468170592971e-06, "loss": -0.0105, "step": 1108, "step_time": 6.1505380860035075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2113.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 852.09375, "completions/mean_terminated_length": 856.3870849609375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 5.225654602050781, "epoch": 0.01109, "frac_reward_zero_std": 0.0, "grad_norm": 0.04485287517309189, "kl": 0.6780040115118027, "learning_rate": 9.999467177934649e-06, "loss": -0.0034, "num_tokens": 27508809.0, "reward": 0.5704819560050964, "reward_std": 0.554434597492218, "rewards/rollout_reward_func/mean": 0.5704819560050964, "rewards/rollout_reward_func/std": 1.0147672891616821, "sampling/importance_sampling_ratio/max": 0.2561897039413452, "sampling/importance_sampling_ratio/mean": 0.07088504731655121, "sampling/importance_sampling_ratio/min": 1.0995829755904686e-18, "sampling/sampling_logp_difference/max": 4.705512046813965, "sampling/sampling_logp_difference/mean": 1.0142810344696045, "step": 1109, "step_time": 13.824531665013637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.196021616458893, "epoch": 0.0111, "grad_norm": 0.04095577076077461, "kl": 0.6751914545893669, "learning_rate": 9.999466184350858e-06, "loss": -0.0035, "step": 1110, "step_time": 7.597952568008623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 277.59375, "completions/mean_terminated_length": 277.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.121375918388367, "epoch": 0.01111, "frac_reward_zero_std": 0.5, "grad_norm": 0.016769349575042725, "kl": 0.723046250641346, "learning_rate": 9.999465189841599e-06, "loss": 0.0014, "num_tokens": 27555042.0, "reward": 0.07017077505588531, "reward_std": 0.04199586436152458, "rewards/rollout_reward_func/mean": 0.07017077505588531, "rewards/rollout_reward_func/std": 1.1081881523132324, "sampling/importance_sampling_ratio/max": 0.5573220252990723, "sampling/importance_sampling_ratio/mean": 0.24025961756706238, "sampling/importance_sampling_ratio/min": 0.0027678704354912043, "sampling/sampling_logp_difference/max": 3.1327269077301025, "sampling/sampling_logp_difference/mean": 0.7022111415863037, "step": 1111, "step_time": 9.10329360999458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.115625619888306, "epoch": 0.01112, "grad_norm": 0.014490512199699879, "kl": 0.7273128926753998, "learning_rate": 9.999464194406873e-06, "loss": 0.0014, "step": 1112, "step_time": 4.87835383201309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 299.25, "completions/mean_terminated_length": 294.70001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.006084829568863, "epoch": 0.01113, "frac_reward_zero_std": 0.5, "grad_norm": 0.026473643258213997, "kl": 0.7325413897633553, "learning_rate": 9.99946319804668e-06, "loss": -0.0061, "num_tokens": 27604041.0, "reward": 0.5037553310394287, "reward_std": 0.2475237101316452, "rewards/rollout_reward_func/mean": 0.5037553310394287, "rewards/rollout_reward_func/std": 0.9782460927963257, "sampling/importance_sampling_ratio/max": 0.5517081022262573, "sampling/importance_sampling_ratio/mean": 0.25634443759918213, "sampling/importance_sampling_ratio/min": 4.944591285895028e-13, "sampling/sampling_logp_difference/max": 4.293819427490234, "sampling/sampling_logp_difference/mean": 0.8081156015396118, "step": 1113, "step_time": 8.460328452994872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.986738681793213, "epoch": 0.01114, "grad_norm": 0.025891881436109543, "kl": 0.7343566492199898, "learning_rate": 9.999462200761019e-06, "loss": -0.0062, "step": 1114, "step_time": 4.5290895590005675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1991.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1089.15625, "completions/mean_terminated_length": 1089.15625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 5.079625308513641, "epoch": 0.01115, "frac_reward_zero_std": 0.0, "grad_norm": 0.0429086908698082, "kl": 0.6119425892829895, "learning_rate": 9.999461202549894e-06, "loss": -0.008, "num_tokens": 27680202.0, "reward": 0.4232938289642334, "reward_std": 0.8835432529449463, "rewards/rollout_reward_func/mean": 0.4232938289642334, "rewards/rollout_reward_func/std": 1.089979648590088, "sampling/importance_sampling_ratio/max": 0.3069210648536682, "sampling/importance_sampling_ratio/mean": 0.05470661818981171, "sampling/importance_sampling_ratio/min": 1.0200907354374067e-06, "sampling/sampling_logp_difference/max": 4.75225830078125, "sampling/sampling_logp_difference/mean": 0.8469244837760925, "step": 1115, "step_time": 13.710635732015362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.03052020072937, "epoch": 0.01116, "grad_norm": 0.03864658623933792, "kl": 0.6101716011762619, "learning_rate": 9.9994602034133e-06, "loss": -0.0081, "step": 1116, "step_time": 7.895505003005383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 729.75, "completions/mean_terminated_length": 734.8386840820312, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "entropy": 4.403641611337662, "epoch": 0.01117, "frac_reward_zero_std": 0.25, "grad_norm": 0.05201779678463936, "kl": 0.5646483078598976, "learning_rate": 9.999459203351241e-06, "loss": -0.0048, "num_tokens": 27745838.0, "reward": 0.7309807538986206, "reward_std": 0.7121310234069824, "rewards/rollout_reward_func/mean": 0.7309807538986206, "rewards/rollout_reward_func/std": 0.8907268643379211, "sampling/importance_sampling_ratio/max": 0.3096608519554138, "sampling/importance_sampling_ratio/mean": 0.1566011607646942, "sampling/importance_sampling_ratio/min": 6.460944975507149e-16, "sampling/sampling_logp_difference/max": 4.193825721740723, "sampling/sampling_logp_difference/mean": 0.8983026146888733, "step": 1117, "step_time": 13.208850707997044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.389255732297897, "epoch": 0.01118, "grad_norm": 0.04544927179813385, "kl": 0.5666244477033615, "learning_rate": 9.999458202363715e-06, "loss": -0.0049, "step": 1118, "step_time": 7.191251763004402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2167.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 614.84375, "completions/mean_terminated_length": 614.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.4941626489162445, "epoch": 0.01119, "frac_reward_zero_std": 0.25, "grad_norm": 0.022532841190695763, "kl": 0.6848932057619095, "learning_rate": 9.999457200450725e-06, "loss": -0.0019, "num_tokens": 27805740.0, "reward": 1.052018404006958, "reward_std": 0.3218718469142914, "rewards/rollout_reward_func/mean": 1.052018404006958, "rewards/rollout_reward_func/std": 0.4684761166572571, "sampling/importance_sampling_ratio/max": 0.5583498477935791, "sampling/importance_sampling_ratio/mean": 0.2435728907585144, "sampling/importance_sampling_ratio/min": 3.56870353959168e-21, "sampling/sampling_logp_difference/max": 13.759223937988281, "sampling/sampling_logp_difference/mean": 0.9200906753540039, "step": 1119, "step_time": 13.144201347997296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.459460109472275, "epoch": 0.0112, "grad_norm": 0.018880341202020645, "kl": 0.6895486190915108, "learning_rate": 9.99945619761227e-06, "loss": -0.002, "step": 1120, "step_time": 7.507324361002247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1990.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 593.75, "completions/mean_terminated_length": 593.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.076577067375183, "epoch": 0.01121, "frac_reward_zero_std": 0.25, "grad_norm": 0.03590729087591171, "kl": 0.5689575597643852, "learning_rate": 9.999455193848349e-06, "loss": -0.0087, "num_tokens": 27865339.0, "reward": 0.8294470310211182, "reward_std": 0.5719634294509888, "rewards/rollout_reward_func/mean": 0.8294470310211182, "rewards/rollout_reward_func/std": 0.8214303255081177, "sampling/importance_sampling_ratio/max": 0.5482642650604248, "sampling/importance_sampling_ratio/mean": 0.22893892228603363, "sampling/importance_sampling_ratio/min": 2.5832416260209464e-19, "sampling/sampling_logp_difference/max": 13.81050968170166, "sampling/sampling_logp_difference/mean": 0.7407287955284119, "step": 1121, "step_time": 12.922000062004372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.0532238483428955, "epoch": 0.01122, "grad_norm": 0.0366685725748539, "kl": 0.5707559362053871, "learning_rate": 9.999454189158961e-06, "loss": -0.0087, "step": 1122, "step_time": 7.84941398997762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 533.6875, "completions/mean_terminated_length": 533.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.063687741756439, "epoch": 0.01123, "frac_reward_zero_std": 0.5, "grad_norm": 0.030699104070663452, "kl": 0.7925672829151154, "learning_rate": 9.999453183544113e-06, "loss": -0.0054, "num_tokens": 27921895.0, "reward": 0.7377948760986328, "reward_std": 0.5218265056610107, "rewards/rollout_reward_func/mean": 0.7377948760986328, "rewards/rollout_reward_func/std": 0.8171179294586182, "sampling/importance_sampling_ratio/max": 0.5583005547523499, "sampling/importance_sampling_ratio/mean": 0.3067784905433655, "sampling/importance_sampling_ratio/min": 1.0670504063670094e-10, "sampling/sampling_logp_difference/max": 3.683384418487549, "sampling/sampling_logp_difference/mean": 0.637290358543396, "step": 1123, "step_time": 11.830813305990887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.061972260475159, "epoch": 0.01124, "grad_norm": 0.03172287344932556, "kl": 0.7949206605553627, "learning_rate": 9.999452177003797e-06, "loss": -0.0054, "step": 1124, "step_time": 6.32813800398435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1405.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 1013.5625, "completions/mean_terminated_length": 1013.5625, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "entropy": 4.456166326999664, "epoch": 0.01125, "frac_reward_zero_std": 0.0, "grad_norm": 0.015063649043440819, "kl": 0.6623352691531181, "learning_rate": 9.999451169538017e-06, "loss": -0.0001, "num_tokens": 27997147.0, "reward": 1.112522006034851, "reward_std": 0.41991478204727173, "rewards/rollout_reward_func/mean": 1.112522006034851, "rewards/rollout_reward_func/std": 0.5308091640472412, "sampling/importance_sampling_ratio/max": 0.2982046604156494, "sampling/importance_sampling_ratio/mean": 0.1016124039888382, "sampling/importance_sampling_ratio/min": 2.750303457199621e-11, "sampling/sampling_logp_difference/max": 13.270076751708984, "sampling/sampling_logp_difference/mean": 0.7021688222885132, "step": 1125, "step_time": 11.516869609011337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.48015570640564, "epoch": 0.01126, "grad_norm": 0.015030714683234692, "kl": 0.658710166811943, "learning_rate": 9.999450161146776e-06, "loss": -0.0001, "step": 1126, "step_time": 6.283901371003594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 543.90625, "completions/mean_terminated_length": 543.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.248491495847702, "epoch": 0.01127, "frac_reward_zero_std": 0.25, "grad_norm": 0.01477739680558443, "kl": 0.7372622638940811, "learning_rate": 9.999449151830068e-06, "loss": -0.006, "num_tokens": 28053477.0, "reward": -0.04558371752500534, "reward_std": 0.49620354175567627, "rewards/rollout_reward_func/mean": -0.04558371752500534, "rewards/rollout_reward_func/std": 1.0847086906433105, "sampling/importance_sampling_ratio/max": 0.5521236062049866, "sampling/importance_sampling_ratio/mean": 0.2458038628101349, "sampling/importance_sampling_ratio/min": 3.6098413754709346e-13, "sampling/sampling_logp_difference/max": 11.909332275390625, "sampling/sampling_logp_difference/mean": 0.7801800966262817, "step": 1127, "step_time": 10.936565964999318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.270810544490814, "epoch": 0.01128, "grad_norm": 0.01563863269984722, "kl": 0.7361334264278412, "learning_rate": 9.999448141587897e-06, "loss": -0.006, "step": 1128, "step_time": 6.280774111008213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 702.5, "completions/mean_terminated_length": 702.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.803616851568222, "epoch": 0.01129, "frac_reward_zero_std": 0.25, "grad_norm": 0.013267895206809044, "kl": 0.6355480924248695, "learning_rate": 9.999447130420266e-06, "loss": -0.0014, "num_tokens": 28117267.0, "reward": 0.5707682967185974, "reward_std": 0.21708622574806213, "rewards/rollout_reward_func/mean": 0.5707682967185974, "rewards/rollout_reward_func/std": 1.0344003438949585, "sampling/importance_sampling_ratio/max": 0.55272376537323, "sampling/importance_sampling_ratio/mean": 0.267406702041626, "sampling/importance_sampling_ratio/min": 1.3155137569553776e-11, "sampling/sampling_logp_difference/max": 2.877164363861084, "sampling/sampling_logp_difference/mean": 0.5866169333457947, "step": 1129, "step_time": 13.647962882001593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.833321273326874, "epoch": 0.0113, "grad_norm": 0.014109769836068153, "kl": 0.6333872601389885, "learning_rate": 9.99944611832717e-06, "loss": -0.0014, "step": 1130, "step_time": 7.645972703008738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 437.90625, "completions/mean_terminated_length": 451.51611328125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8470359146595, "epoch": 0.01131, "frac_reward_zero_std": 0.25, "grad_norm": 0.015314877964556217, "kl": 0.5618612691760063, "learning_rate": 9.99944510530861e-06, "loss": -0.0118, "num_tokens": 28168631.0, "reward": 0.8600362539291382, "reward_std": 0.5178484916687012, "rewards/rollout_reward_func/mean": 0.8600362539291382, "rewards/rollout_reward_func/std": 0.7647567987442017, "sampling/importance_sampling_ratio/max": 0.554744303226471, "sampling/importance_sampling_ratio/mean": 0.32018131017684937, "sampling/importance_sampling_ratio/min": 1.5699411548553144e-08, "sampling/sampling_logp_difference/max": 4.335840225219727, "sampling/sampling_logp_difference/mean": 0.6447570323944092, "step": 1131, "step_time": 9.976283030984632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8579556345939636, "epoch": 0.01132, "grad_norm": 0.015672652050852776, "kl": 0.5581599622964859, "learning_rate": 9.99944409136459e-06, "loss": -0.0118, "step": 1132, "step_time": 5.699249411991332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 669.5625, "completions/mean_terminated_length": 690.6451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.980571210384369, "epoch": 0.01133, "frac_reward_zero_std": 0.0, "grad_norm": 0.043411366641521454, "kl": 0.600745502859354, "learning_rate": 9.999443076495105e-06, "loss": -0.0118, "num_tokens": 28229955.0, "reward": 0.7720841765403748, "reward_std": 0.7192803621292114, "rewards/rollout_reward_func/mean": 0.7720841765403748, "rewards/rollout_reward_func/std": 0.8824067711830139, "sampling/importance_sampling_ratio/max": 0.5517204999923706, "sampling/importance_sampling_ratio/mean": 0.2211419939994812, "sampling/importance_sampling_ratio/min": 6.185144840742396e-09, "sampling/sampling_logp_difference/max": 3.1275482177734375, "sampling/sampling_logp_difference/mean": 0.6177222728729248, "step": 1133, "step_time": 11.392776100998162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.95760515332222, "epoch": 0.01134, "grad_norm": 0.0413399301469326, "kl": 0.6056193001568317, "learning_rate": 9.999442060700163e-06, "loss": -0.0118, "step": 1134, "step_time": 6.63109294098831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1901.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 815.0, "completions/mean_terminated_length": 815.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.277816861867905, "epoch": 0.01135, "frac_reward_zero_std": 0.25, "grad_norm": 0.07084735482931137, "kl": 0.6568891108036041, "learning_rate": 9.999441043979755e-06, "loss": -0.0008, "num_tokens": 28296488.0, "reward": 0.637856125831604, "reward_std": 0.8473033905029297, "rewards/rollout_reward_func/mean": 0.637856125831604, "rewards/rollout_reward_func/std": 0.9712716341018677, "sampling/importance_sampling_ratio/max": 0.551771879196167, "sampling/importance_sampling_ratio/mean": 0.16519223153591156, "sampling/importance_sampling_ratio/min": 0.00032286549685522914, "sampling/sampling_logp_difference/max": 2.4968268871307373, "sampling/sampling_logp_difference/mean": 0.6192754507064819, "step": 1135, "step_time": 13.936857410008088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.257284373044968, "epoch": 0.01136, "grad_norm": 0.06395617872476578, "kl": 0.627831369638443, "learning_rate": 9.999440026333887e-06, "loss": -0.0009, "step": 1136, "step_time": 7.237717868003529 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.0, "completions/max_length": 2044.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 916.375, "completions/mean_terminated_length": 916.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.710913300514221, "epoch": 0.01137, "frac_reward_zero_std": 0.0, "grad_norm": 0.1784217655658722, "kl": 0.6465548500418663, "learning_rate": 9.999439007762558e-06, "loss": -0.0056, "num_tokens": 28366458.0, "reward": 0.8553228378295898, "reward_std": 0.8181700706481934, "rewards/rollout_reward_func/mean": 0.8553228378295898, "rewards/rollout_reward_func/std": 0.7943786978721619, "sampling/importance_sampling_ratio/max": 0.30953165888786316, "sampling/importance_sampling_ratio/mean": 0.11867612600326538, "sampling/importance_sampling_ratio/min": 2.5783791102185205e-07, "sampling/sampling_logp_difference/max": 4.8999924659729, "sampling/sampling_logp_difference/mean": 0.7497850656509399, "step": 1137, "step_time": 13.438664713001344 }, { "clip_ratio/high_max": 0.01854395680129528, "clip_ratio/high_mean": 0.00927197840064764, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03010531235486269, "entropy": 4.7144321501255035, "epoch": 0.01138, "grad_norm": 0.051411230117082596, "kl": 0.6335557699203491, "learning_rate": 9.999437988265768e-06, "loss": -0.0063, "step": 1138, "step_time": 7.312424900010228 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.03125, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 792.0, "completions/mean_terminated_length": 800.258056640625, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 4.64762806892395, "epoch": 0.01139, "frac_reward_zero_std": 0.0, "grad_norm": 0.021984362974762917, "kl": 0.5154727660119534, "learning_rate": 9.999436967843518e-06, "loss": -0.0048, "num_tokens": 28435090.0, "reward": 0.631308913230896, "reward_std": 0.6457587480545044, "rewards/rollout_reward_func/mean": 0.631308913230896, "rewards/rollout_reward_func/std": 0.9350817799568176, "sampling/importance_sampling_ratio/max": 0.30743539333343506, "sampling/importance_sampling_ratio/mean": 0.12700846791267395, "sampling/importance_sampling_ratio/min": 8.469746681782908e-09, "sampling/sampling_logp_difference/max": 4.342970848083496, "sampling/sampling_logp_difference/mean": 0.753542959690094, "step": 1139, "step_time": 12.882115867010725 }, { "clip_ratio/high_max": 0.016098485328257084, "clip_ratio/high_mean": 0.008049242664128542, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008049242664128542, "entropy": 4.655754029750824, "epoch": 0.0114, "grad_norm": 0.00988929532468319, "kl": 0.5161697138100863, "learning_rate": 9.999435946495807e-06, "loss": -0.0049, "step": 1140, "step_time": 7.08479730199906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 454.34375, "completions/mean_terminated_length": 454.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7524009943008423, "epoch": 0.01141, "frac_reward_zero_std": 0.5, "grad_norm": 0.02441931702196598, "kl": 0.7625422850251198, "learning_rate": 9.999434924222635e-06, "loss": -0.007, "num_tokens": 28488170.0, "reward": 0.9010801315307617, "reward_std": 0.47277751564979553, "rewards/rollout_reward_func/mean": 0.9010801315307617, "rewards/rollout_reward_func/std": 0.703346848487854, "sampling/importance_sampling_ratio/max": 0.5564303398132324, "sampling/importance_sampling_ratio/mean": 0.3316947817802429, "sampling/importance_sampling_ratio/min": 0.004035382065922022, "sampling/sampling_logp_difference/max": 2.3311960697174072, "sampling/sampling_logp_difference/mean": 0.47881269454956055, "step": 1141, "step_time": 11.200732961005997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.750037968158722, "epoch": 0.01142, "grad_norm": 0.025613881647586823, "kl": 0.7631163001060486, "learning_rate": 9.999433901024004e-06, "loss": -0.007, "step": 1142, "step_time": 6.237309798008937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 396.6875, "completions/mean_terminated_length": 396.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3216218650341034, "epoch": 0.01143, "frac_reward_zero_std": 0.5, "grad_norm": 0.08380814641714096, "kl": 0.9490982368588448, "learning_rate": 9.999432876899914e-06, "loss": -0.0015, "num_tokens": 28541471.0, "reward": 1.1081372499465942, "reward_std": 0.20403532683849335, "rewards/rollout_reward_func/mean": 1.1081372499465942, "rewards/rollout_reward_func/std": 0.40569907426834106, "sampling/importance_sampling_ratio/max": 0.5553972721099854, "sampling/importance_sampling_ratio/mean": 0.3119501769542694, "sampling/importance_sampling_ratio/min": 4.107217682758346e-05, "sampling/sampling_logp_difference/max": 2.996798276901245, "sampling/sampling_logp_difference/mean": 0.4486807584762573, "step": 1143, "step_time": 8.463364392024232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3322396278381348, "epoch": 0.01144, "grad_norm": 0.08618035912513733, "kl": 0.9610205814242363, "learning_rate": 9.999431851850363e-06, "loss": -0.0016, "step": 1144, "step_time": 4.580381859006593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 700.625, "completions/mean_terminated_length": 700.625, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "entropy": 3.7480228543281555, "epoch": 0.01145, "frac_reward_zero_std": 0.0, "grad_norm": 0.11657878756523132, "kl": 1.344167836010456, "learning_rate": 9.999430825875353e-06, "loss": -0.0161, "num_tokens": 28606555.0, "reward": 0.7847898006439209, "reward_std": 0.7457026243209839, "rewards/rollout_reward_func/mean": 0.7847898006439209, "rewards/rollout_reward_func/std": 0.908831000328064, "sampling/importance_sampling_ratio/max": 0.3079541325569153, "sampling/importance_sampling_ratio/mean": 0.18746553361415863, "sampling/importance_sampling_ratio/min": 9.407504109049114e-09, "sampling/sampling_logp_difference/max": 4.598202705383301, "sampling/sampling_logp_difference/mean": 0.5630203485488892, "step": 1145, "step_time": 11.391063108989329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.756817191839218, "epoch": 0.01146, "grad_norm": 0.08055538684129715, "kl": 1.1590019464492798, "learning_rate": 9.999429798974887e-06, "loss": -0.0165, "step": 1146, "step_time": 6.658211067020602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 515.09375, "completions/mean_terminated_length": 515.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.366278886795044, "epoch": 0.01147, "frac_reward_zero_std": 0.5, "grad_norm": 0.020305993035435677, "kl": 0.7469972893595695, "learning_rate": 9.99942877114896e-06, "loss": 0.005, "num_tokens": 28662626.0, "reward": 0.10471732914447784, "reward_std": 0.5954486131668091, "rewards/rollout_reward_func/mean": 0.10471732914447784, "rewards/rollout_reward_func/std": 1.0965386629104614, "sampling/importance_sampling_ratio/max": 0.554079532623291, "sampling/importance_sampling_ratio/mean": 0.2625787854194641, "sampling/importance_sampling_ratio/min": 1.7214800765441574e-11, "sampling/sampling_logp_difference/max": 13.093769073486328, "sampling/sampling_logp_difference/mean": 0.7518768310546875, "step": 1147, "step_time": 13.382038729992928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.395840108394623, "epoch": 0.01148, "grad_norm": 0.022164400666952133, "kl": 0.7455001026391983, "learning_rate": 9.999427742397575e-06, "loss": 0.005, "step": 1148, "step_time": 7.379824875999475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 262.5, "completions/mean_terminated_length": 261.20001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3521187007427216, "epoch": 0.01149, "frac_reward_zero_std": 0.5, "grad_norm": 0.02119198627769947, "kl": 0.7929122373461723, "learning_rate": 9.999426712720733e-06, "loss": -0.0089, "num_tokens": 28706923.0, "reward": 1.1422815322875977, "reward_std": 0.017724037170410156, "rewards/rollout_reward_func/mean": 1.1422815322875977, "rewards/rollout_reward_func/std": 0.05162676051259041, "sampling/importance_sampling_ratio/max": 0.5485890507698059, "sampling/importance_sampling_ratio/mean": 0.35614916682243347, "sampling/importance_sampling_ratio/min": 2.3503993890061636e-12, "sampling/sampling_logp_difference/max": 4.924840450286865, "sampling/sampling_logp_difference/mean": 0.6259078979492188, "step": 1149, "step_time": 8.086999466002453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3461305797100067, "epoch": 0.0115, "grad_norm": 0.019567318260669708, "kl": 0.7925777062773705, "learning_rate": 9.99942568211843e-06, "loss": -0.0089, "step": 1150, "step_time": 4.32651752899983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 703.84375, "completions/mean_terminated_length": 703.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.2495986223220825, "epoch": 0.01151, "frac_reward_zero_std": 0.25, "grad_norm": 0.03568367660045624, "kl": 0.7660563215613365, "learning_rate": 9.999424650590674e-06, "loss": -0.0096, "num_tokens": 28769514.0, "reward": 0.573007345199585, "reward_std": 0.7540313005447388, "rewards/rollout_reward_func/mean": 0.573007345199585, "rewards/rollout_reward_func/std": 0.957791268825531, "sampling/importance_sampling_ratio/max": 0.5508859753608704, "sampling/importance_sampling_ratio/mean": 0.20449283719062805, "sampling/importance_sampling_ratio/min": 0.0032096118666231632, "sampling/sampling_logp_difference/max": 2.6486332416534424, "sampling/sampling_logp_difference/mean": 0.6164183616638184, "step": 1151, "step_time": 13.258133203999023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.210729330778122, "epoch": 0.01152, "grad_norm": 0.02998196892440319, "kl": 0.7689434364438057, "learning_rate": 9.999423618137458e-06, "loss": -0.0097, "step": 1152, "step_time": 7.143642578994331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1463.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 641.6875, "completions/mean_terminated_length": 641.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6377866566181183, "epoch": 0.01153, "frac_reward_zero_std": 0.5, "grad_norm": 0.02246466465294361, "kl": 0.6494592949748039, "learning_rate": 9.999422584758785e-06, "loss": -0.0042, "num_tokens": 28830894.0, "reward": 0.4954870641231537, "reward_std": 0.2887594997882843, "rewards/rollout_reward_func/mean": 0.4954870641231537, "rewards/rollout_reward_func/std": 1.0267335176467896, "sampling/importance_sampling_ratio/max": 0.5544461011886597, "sampling/importance_sampling_ratio/mean": 0.20683765411376953, "sampling/importance_sampling_ratio/min": 8.756158623011599e-16, "sampling/sampling_logp_difference/max": 12.066323280334473, "sampling/sampling_logp_difference/mean": 0.6774777173995972, "step": 1153, "step_time": 11.462277670012554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6324128806591034, "epoch": 0.01154, "grad_norm": 0.022523535415530205, "kl": 0.6496663503348827, "learning_rate": 9.999421550454654e-06, "loss": -0.0042, "step": 1154, "step_time": 6.234861075980007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 368.21875, "completions/mean_terminated_length": 368.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.498078227043152, "epoch": 0.01155, "frac_reward_zero_std": 0.75, "grad_norm": 0.00414688978344202, "kl": 0.8355140164494514, "learning_rate": 9.999420515225069e-06, "loss": 0.0012, "num_tokens": 28880506.0, "reward": 0.46738412976264954, "reward_std": 0.24838458001613617, "rewards/rollout_reward_func/mean": 0.46738412976264954, "rewards/rollout_reward_func/std": 1.0358221530914307, "sampling/importance_sampling_ratio/max": 0.5548367500305176, "sampling/importance_sampling_ratio/mean": 0.3906072974205017, "sampling/importance_sampling_ratio/min": 0.0076221260242164135, "sampling/sampling_logp_difference/max": 2.35247540473938, "sampling/sampling_logp_difference/mean": 0.4665607810020447, "step": 1155, "step_time": 12.36365885399573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5001751482486725, "epoch": 0.01156, "grad_norm": 0.00403267377987504, "kl": 0.8361012488603592, "learning_rate": 9.999419479070025e-06, "loss": 0.0012, "step": 1156, "step_time": 7.109610482009884 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 588.53125, "completions/mean_terminated_length": 588.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2169149518013, "epoch": 0.01157, "frac_reward_zero_std": 0.5, "grad_norm": 0.023259973153471947, "kl": 0.6422353610396385, "learning_rate": 9.999418441989527e-06, "loss": 0.0079, "num_tokens": 28940084.0, "reward": 0.6489825248718262, "reward_std": 0.017493491992354393, "rewards/rollout_reward_func/mean": 0.6489825248718262, "rewards/rollout_reward_func/std": 0.9411237239837646, "sampling/importance_sampling_ratio/max": 0.5534095764160156, "sampling/importance_sampling_ratio/mean": 0.28915607929229736, "sampling/importance_sampling_ratio/min": 0.006468330509960651, "sampling/sampling_logp_difference/max": 2.403876304626465, "sampling/sampling_logp_difference/mean": 0.3975120782852173, "step": 1157, "step_time": 12.084604440991825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.206136018037796, "epoch": 0.01158, "grad_norm": 0.022832348942756653, "kl": 0.645740520209074, "learning_rate": 9.999417403983573e-06, "loss": 0.0079, "step": 1158, "step_time": 6.463078031003533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2167.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 656.40625, "completions/mean_terminated_length": 656.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.222318142652512, "epoch": 0.01159, "frac_reward_zero_std": 0.5, "grad_norm": 0.020479716360569, "kl": 0.6407258957624435, "learning_rate": 9.999416365052164e-06, "loss": 0.0005, "num_tokens": 29001055.0, "reward": 0.07322083413600922, "reward_std": 0.4087384343147278, "rewards/rollout_reward_func/mean": 0.07322083413600922, "rewards/rollout_reward_func/std": 1.1095848083496094, "sampling/importance_sampling_ratio/max": 0.5553845167160034, "sampling/importance_sampling_ratio/mean": 0.2556365430355072, "sampling/importance_sampling_ratio/min": 4.349495065980591e-05, "sampling/sampling_logp_difference/max": 2.395353317260742, "sampling/sampling_logp_difference/mean": 0.6024338006973267, "step": 1159, "step_time": 13.955699007005023 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 4.188133537769318, "epoch": 0.0116, "grad_norm": 0.015693485736846924, "kl": 0.6434964053332806, "learning_rate": 9.999415325195299e-06, "loss": 0.0004, "step": 1160, "step_time": 7.585338759010483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 829.8125, "completions/mean_terminated_length": 829.8125, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "entropy": 4.401195138692856, "epoch": 0.01161, "frac_reward_zero_std": 0.0, "grad_norm": 0.02195357345044613, "kl": 0.6975430101156235, "learning_rate": 9.999414284412979e-06, "loss": 0.004, "num_tokens": 29068707.0, "reward": 0.6023259162902832, "reward_std": 0.852357029914856, "rewards/rollout_reward_func/mean": 0.6023259162902832, "rewards/rollout_reward_func/std": 1.0145810842514038, "sampling/importance_sampling_ratio/max": 0.30193236470222473, "sampling/importance_sampling_ratio/mean": 0.15646684169769287, "sampling/importance_sampling_ratio/min": 1.5178819826885221e-16, "sampling/sampling_logp_difference/max": 13.317293167114258, "sampling/sampling_logp_difference/mean": 0.8834775686264038, "step": 1161, "step_time": 11.591204524011118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.389650791883469, "epoch": 0.01162, "grad_norm": 0.021389123052358627, "kl": 0.6974843516945839, "learning_rate": 9.999413242705202e-06, "loss": 0.004, "step": 1162, "step_time": 6.395840227000008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2135.0, "completions/max_terminated_length": 2135.0, "completions/mean_length": 935.40625, "completions/mean_terminated_length": 919.1612548828125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 4.287213534116745, "epoch": 0.01163, "frac_reward_zero_std": 0.0, "grad_norm": 0.0633123591542244, "kl": 0.6066344566643238, "learning_rate": 9.999412200071973e-06, "loss": 0.0086, "num_tokens": 29140864.0, "reward": 0.48411816358566284, "reward_std": 0.5143802165985107, "rewards/rollout_reward_func/mean": 0.48411816358566284, "rewards/rollout_reward_func/std": 1.0493288040161133, "sampling/importance_sampling_ratio/max": 0.29817530512809753, "sampling/importance_sampling_ratio/mean": 0.13101263344287872, "sampling/importance_sampling_ratio/min": 4.731619895040562e-23, "sampling/sampling_logp_difference/max": 10.91096305847168, "sampling/sampling_logp_difference/mean": 0.8262296319007874, "step": 1163, "step_time": 14.420054001006065 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 4.300116717815399, "epoch": 0.01164, "grad_norm": 0.06170226261019707, "kl": 0.6061569340527058, "learning_rate": 9.999411156513289e-06, "loss": 0.0085, "step": 1164, "step_time": 7.678127779989154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 139.3125, "completions/mean_terminated_length": 139.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.913672983646393, "epoch": 0.01165, "frac_reward_zero_std": 0.5, "grad_norm": 0.035599347203969955, "kl": 0.8558192551136017, "learning_rate": 9.999410112029152e-06, "loss": -0.0032, "num_tokens": 29183405.0, "reward": 0.5660710334777832, "reward_std": 0.009657107293605804, "rewards/rollout_reward_func/mean": 0.5660710334777832, "rewards/rollout_reward_func/std": 0.9703691601753235, "sampling/importance_sampling_ratio/max": 0.549401581287384, "sampling/importance_sampling_ratio/mean": 0.4520419239997864, "sampling/importance_sampling_ratio/min": 0.00025361418374814093, "sampling/sampling_logp_difference/max": 4.800114631652832, "sampling/sampling_logp_difference/mean": 0.4127007722854614, "step": 1165, "step_time": 7.824135695998848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9175350964069366, "epoch": 0.01166, "grad_norm": 0.035813283175230026, "kl": 0.8556576929986477, "learning_rate": 9.99940906661956e-06, "loss": -0.0034, "step": 1166, "step_time": 4.279682102998777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 596.53125, "completions/mean_terminated_length": 596.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9917852878570557, "epoch": 0.01167, "frac_reward_zero_std": 0.25, "grad_norm": 0.04558698832988739, "kl": 0.7651761099696159, "learning_rate": 9.999408020284516e-06, "loss": -0.0096, "num_tokens": 29241938.0, "reward": 0.8365844488143921, "reward_std": 0.7012677192687988, "rewards/rollout_reward_func/mean": 0.8365844488143921, "rewards/rollout_reward_func/std": 0.8057352304458618, "sampling/importance_sampling_ratio/max": 0.5460261702537537, "sampling/importance_sampling_ratio/mean": 0.24499762058258057, "sampling/importance_sampling_ratio/min": 5.0282798012683295e-15, "sampling/sampling_logp_difference/max": 4.209883689880371, "sampling/sampling_logp_difference/mean": 0.7092711925506592, "step": 1167, "step_time": 10.91390239399334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.012731999158859, "epoch": 0.01168, "grad_norm": 0.04761815443634987, "kl": 0.7601410858333111, "learning_rate": 9.999406973024017e-06, "loss": -0.0096, "step": 1168, "step_time": 6.0641503460210515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2107.0, "completions/max_terminated_length": 2107.0, "completions/mean_length": 821.84375, "completions/mean_terminated_length": 825.54833984375, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "entropy": 5.41104257106781, "epoch": 0.01169, "frac_reward_zero_std": 0.0, "grad_norm": 0.023600470274686813, "kl": 0.6080426946282387, "learning_rate": 9.999405924838066e-06, "loss": -0.0065, "num_tokens": 29309851.0, "reward": 0.7237809896469116, "reward_std": 0.9200333952903748, "rewards/rollout_reward_func/mean": 0.7237809896469116, "rewards/rollout_reward_func/std": 0.9081416726112366, "sampling/importance_sampling_ratio/max": 0.30399322509765625, "sampling/importance_sampling_ratio/mean": 0.06646985560655594, "sampling/importance_sampling_ratio/min": 2.5032972859862467e-14, "sampling/sampling_logp_difference/max": 12.929925918579102, "sampling/sampling_logp_difference/mean": 1.0496455430984497, "step": 1169, "step_time": 13.898801432995242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.392885625362396, "epoch": 0.0117, "grad_norm": 0.02354644425213337, "kl": 0.6075602695345879, "learning_rate": 9.999404875726661e-06, "loss": -0.0066, "step": 1170, "step_time": 7.514081934008573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 878.53125, "completions/mean_terminated_length": 884.7418823242188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.624173641204834, "epoch": 0.01171, "frac_reward_zero_std": 0.25, "grad_norm": 0.01437433436512947, "kl": 0.6648056209087372, "learning_rate": 9.999403825689805e-06, "loss": -0.0056, "num_tokens": 29377808.0, "reward": 0.7425444722175598, "reward_std": 0.7462916970252991, "rewards/rollout_reward_func/mean": 0.7425444722175598, "rewards/rollout_reward_func/std": 0.9101906418800354, "sampling/importance_sampling_ratio/max": 0.5506696105003357, "sampling/importance_sampling_ratio/mean": 0.17482516169548035, "sampling/importance_sampling_ratio/min": 3.4483767956324267e-15, "sampling/sampling_logp_difference/max": 11.260679244995117, "sampling/sampling_logp_difference/mean": 0.8387491703033447, "step": 1171, "step_time": 11.989817252004286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.602476060390472, "epoch": 0.01172, "grad_norm": 0.013858374208211899, "kl": 0.6621780097484589, "learning_rate": 9.999402774727496e-06, "loss": -0.0056, "step": 1172, "step_time": 6.566929665998032 }, { "clip_ratio/high_max": 0.0029761905316263437, "clip_ratio/high_mean": 0.0014880952658131719, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014880952658131719, "completions/clipped_ratio": 0.03125, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 1039.875, "completions/mean_terminated_length": 1027.4515380859375, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 4.98326849937439, "epoch": 0.01173, "frac_reward_zero_std": 0.0, "grad_norm": 0.02421947568655014, "kl": 0.5374710410833359, "learning_rate": 9.999401722839737e-06, "loss": -0.0132, "num_tokens": 29452934.0, "reward": 0.6994469165802002, "reward_std": 0.6899313926696777, "rewards/rollout_reward_func/mean": 0.6994469165802002, "rewards/rollout_reward_func/std": 0.8341144323348999, "sampling/importance_sampling_ratio/max": 0.3069517910480499, "sampling/importance_sampling_ratio/mean": 0.14329074323177338, "sampling/importance_sampling_ratio/min": 1.4934140463436526e-17, "sampling/sampling_logp_difference/max": 11.590923309326172, "sampling/sampling_logp_difference/mean": 0.9590178728103638, "step": 1173, "step_time": 15.164810930000385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.968954086303711, "epoch": 0.01174, "grad_norm": 0.025180328637361526, "kl": 0.5382145866751671, "learning_rate": 9.999400670026525e-06, "loss": -0.0132, "step": 1174, "step_time": 8.529027558979578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2156.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 524.53125, "completions/mean_terminated_length": 524.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.156528860330582, "epoch": 0.01175, "frac_reward_zero_std": 0.25, "grad_norm": 0.013960408046841621, "kl": 0.7507270127534866, "learning_rate": 9.999399616287859e-06, "loss": -0.0075, "num_tokens": 29508006.0, "reward": 1.0040769577026367, "reward_std": 0.3047698140144348, "rewards/rollout_reward_func/mean": 1.0040769577026367, "rewards/rollout_reward_func/std": 0.5821760296821594, "sampling/importance_sampling_ratio/max": 0.5491205453872681, "sampling/importance_sampling_ratio/mean": 0.309270977973938, "sampling/importance_sampling_ratio/min": 3.711489882790575e-12, "sampling/sampling_logp_difference/max": 4.418179988861084, "sampling/sampling_logp_difference/mean": 0.7137956619262695, "step": 1175, "step_time": 13.45221696299268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.149057447910309, "epoch": 0.01176, "grad_norm": 0.013732394203543663, "kl": 0.7474665716290474, "learning_rate": 9.999398561623746e-06, "loss": -0.0076, "step": 1176, "step_time": 7.496885863998614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1424.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 786.59375, "completions/mean_terminated_length": 786.59375, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "entropy": 3.7828021347522736, "epoch": 0.01177, "frac_reward_zero_std": 0.25, "grad_norm": 0.028208404779434204, "kl": 0.6139672286808491, "learning_rate": 9.999397506034179e-06, "loss": -0.0066, "num_tokens": 29575323.0, "reward": 0.530142068862915, "reward_std": 0.2827269434928894, "rewards/rollout_reward_func/mean": 0.530142068862915, "rewards/rollout_reward_func/std": 1.0186480283737183, "sampling/importance_sampling_ratio/max": 0.30505335330963135, "sampling/importance_sampling_ratio/mean": 0.18073292076587677, "sampling/importance_sampling_ratio/min": 8.541353388458184e-13, "sampling/sampling_logp_difference/max": 4.659256458282471, "sampling/sampling_logp_difference/mean": 0.5777888894081116, "step": 1177, "step_time": 11.834239803007222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7763617038726807, "epoch": 0.01178, "grad_norm": 0.027048630639910698, "kl": 0.6132254153490067, "learning_rate": 9.999396449519164e-06, "loss": -0.0066, "step": 1178, "step_time": 6.170535191988165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 570.6875, "completions/mean_terminated_length": 565.0322265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.6081531047821045, "epoch": 0.01179, "frac_reward_zero_std": 0.0, "grad_norm": 0.0768357589840889, "kl": 0.6424271948635578, "learning_rate": 9.999395392078698e-06, "loss": -0.0032, "num_tokens": 29634115.0, "reward": -0.09856799244880676, "reward_std": 1.0255146026611328, "rewards/rollout_reward_func/mean": -0.09856799244880676, "rewards/rollout_reward_func/std": 1.0644817352294922, "sampling/importance_sampling_ratio/max": 0.5387330651283264, "sampling/importance_sampling_ratio/mean": 0.18710148334503174, "sampling/importance_sampling_ratio/min": 3.6154850224789925e-09, "sampling/sampling_logp_difference/max": 4.251482963562012, "sampling/sampling_logp_difference/mean": 0.7545411586761475, "step": 1179, "step_time": 13.033458640995377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.622577011585236, "epoch": 0.0118, "grad_norm": 0.076233871281147, "kl": 0.6382213085889816, "learning_rate": 9.999394333712782e-06, "loss": -0.0035, "step": 1180, "step_time": 8.042907799994282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1502.0, "completions/max_terminated_length": 1502.0, "completions/mean_length": 351.15625, "completions/mean_terminated_length": 361.9677429199219, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.369881361722946, "epoch": 0.01181, "frac_reward_zero_std": 0.0, "grad_norm": 0.07623565196990967, "kl": 0.6115757897496223, "learning_rate": 9.999393274421414e-06, "loss": -0.0046, "num_tokens": 29684224.0, "reward": 0.45143255591392517, "reward_std": 0.5333921313285828, "rewards/rollout_reward_func/mean": 0.45143255591392517, "rewards/rollout_reward_func/std": 1.0045255422592163, "sampling/importance_sampling_ratio/max": 0.5525492429733276, "sampling/importance_sampling_ratio/mean": 0.2815926969051361, "sampling/importance_sampling_ratio/min": 1.5340244189599394e-15, "sampling/sampling_logp_difference/max": 4.5130157470703125, "sampling/sampling_logp_difference/mean": 0.8568512201309204, "step": 1181, "step_time": 10.948057564994087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.406517773866653, "epoch": 0.01182, "grad_norm": 0.07251615822315216, "kl": 0.6022755727171898, "learning_rate": 9.999392214204598e-06, "loss": -0.0048, "step": 1182, "step_time": 6.058601202981663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 589.15625, "completions/mean_terminated_length": 589.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7233189940452576, "epoch": 0.01183, "frac_reward_zero_std": 0.0, "grad_norm": 0.03718230500817299, "kl": 0.5998472645878792, "learning_rate": 9.999391153062331e-06, "loss": -0.0171, "num_tokens": 29743931.0, "reward": 0.9077519774436951, "reward_std": 0.6770281791687012, "rewards/rollout_reward_func/mean": 0.9077519774436951, "rewards/rollout_reward_func/std": 0.7606123685836792, "sampling/importance_sampling_ratio/max": 0.5501376986503601, "sampling/importance_sampling_ratio/mean": 0.2621666491031647, "sampling/importance_sampling_ratio/min": 1.4179389040691603e-07, "sampling/sampling_logp_difference/max": 3.988513708114624, "sampling/sampling_logp_difference/mean": 0.5449155569076538, "step": 1183, "step_time": 10.979380094999215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.74275603890419, "epoch": 0.01184, "grad_norm": 0.04094843193888664, "kl": 0.6016741953790188, "learning_rate": 9.999390090994617e-06, "loss": -0.0171, "step": 1184, "step_time": 6.201887701994565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 788.875, "completions/mean_terminated_length": 793.0967407226562, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "entropy": 4.879124969244003, "epoch": 0.01185, "frac_reward_zero_std": 0.0, "grad_norm": 0.027556156739592552, "kl": 0.6218471340835094, "learning_rate": 9.999389028001453e-06, "loss": -0.0107, "num_tokens": 29810565.0, "reward": 0.3880161643028259, "reward_std": 0.5567321181297302, "rewards/rollout_reward_func/mean": 0.3880161643028259, "rewards/rollout_reward_func/std": 1.0904836654663086, "sampling/importance_sampling_ratio/max": 0.3064819276332855, "sampling/importance_sampling_ratio/mean": 0.14606717228889465, "sampling/importance_sampling_ratio/min": 1.3338648234959798e-18, "sampling/sampling_logp_difference/max": 4.074289321899414, "sampling/sampling_logp_difference/mean": 0.9897100329399109, "step": 1185, "step_time": 13.64805468499253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.879574418067932, "epoch": 0.01186, "grad_norm": 0.027539264410734177, "kl": 0.6235926151275635, "learning_rate": 9.999387964082844e-06, "loss": -0.0108, "step": 1186, "step_time": 7.821306805009954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 767.8125, "completions/mean_terminated_length": 772.290283203125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 3.7931149899959564, "epoch": 0.01187, "frac_reward_zero_std": 0.25, "grad_norm": 0.03610456734895706, "kl": 0.8279913291335106, "learning_rate": 9.999386899238782e-06, "loss": -0.0084, "num_tokens": 29876571.0, "reward": 1.078007459640503, "reward_std": 0.42852306365966797, "rewards/rollout_reward_func/mean": 1.078007459640503, "rewards/rollout_reward_func/std": 0.5747509002685547, "sampling/importance_sampling_ratio/max": 0.30732202529907227, "sampling/importance_sampling_ratio/mean": 0.19286438822746277, "sampling/importance_sampling_ratio/min": 2.9843053158961103e-18, "sampling/sampling_logp_difference/max": 10.921175956726074, "sampling/sampling_logp_difference/mean": 0.6863760352134705, "step": 1187, "step_time": 11.25676138500421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.800636738538742, "epoch": 0.01188, "grad_norm": 0.03339582309126854, "kl": 0.7885494828224182, "learning_rate": 9.999385833469273e-06, "loss": -0.0085, "step": 1188, "step_time": 6.06656139199913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1363.0, "completions/max_terminated_length": 1363.0, "completions/mean_length": 624.625, "completions/mean_terminated_length": 624.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.95987668633461, "epoch": 0.01189, "frac_reward_zero_std": 0.25, "grad_norm": 0.014236122369766235, "kl": 0.6203224547207355, "learning_rate": 9.999384766774318e-06, "loss": -0.0093, "num_tokens": 29936815.0, "reward": 0.5746361017227173, "reward_std": 0.21448971331119537, "rewards/rollout_reward_func/mean": 0.5746361017227173, "rewards/rollout_reward_func/std": 0.9313077330589294, "sampling/importance_sampling_ratio/max": 0.5499590039253235, "sampling/importance_sampling_ratio/mean": 0.26309776306152344, "sampling/importance_sampling_ratio/min": 4.764210392132238e-20, "sampling/sampling_logp_difference/max": 11.726264953613281, "sampling/sampling_logp_difference/mean": 0.7256892919540405, "step": 1189, "step_time": 11.309350462004659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9539278149604797, "epoch": 0.0119, "grad_norm": 0.014255235902965069, "kl": 0.6211099177598953, "learning_rate": 9.999383699153913e-06, "loss": -0.0093, "step": 1190, "step_time": 6.099894928993308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 348.28125, "completions/mean_terminated_length": 359.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.658942312002182, "epoch": 0.01191, "frac_reward_zero_std": 0.25, "grad_norm": 0.019592875614762306, "kl": 0.598168820142746, "learning_rate": 9.999382630608064e-06, "loss": -0.0092, "num_tokens": 29986625.0, "reward": 1.146690845489502, "reward_std": 0.03850919008255005, "rewards/rollout_reward_func/mean": 1.146690845489502, "rewards/rollout_reward_func/std": 0.07112595438957214, "sampling/importance_sampling_ratio/max": 0.5542755126953125, "sampling/importance_sampling_ratio/mean": 0.3517773747444153, "sampling/importance_sampling_ratio/min": 3.194566097307023e-19, "sampling/sampling_logp_difference/max": 4.171504497528076, "sampling/sampling_logp_difference/mean": 0.7706051468849182, "step": 1191, "step_time": 10.998189095989801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.645829886198044, "epoch": 0.01192, "grad_norm": 0.019082173705101013, "kl": 0.5977127589285374, "learning_rate": 9.999381561136765e-06, "loss": -0.0092, "step": 1192, "step_time": 6.416151371995511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 406.59375, "completions/mean_terminated_length": 419.19354248046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9727588891983032, "epoch": 0.01193, "frac_reward_zero_std": 0.25, "grad_norm": 0.11901342868804932, "kl": 0.6283828131854534, "learning_rate": 9.99938049074002e-06, "loss": -0.0011, "num_tokens": 30039959.0, "reward": 0.23687154054641724, "reward_std": 0.23568496108055115, "rewards/rollout_reward_func/mean": 0.23687154054641724, "rewards/rollout_reward_func/std": 1.0578008890151978, "sampling/importance_sampling_ratio/max": 0.5517662763595581, "sampling/importance_sampling_ratio/mean": 0.2778024673461914, "sampling/importance_sampling_ratio/min": 4.125279829650559e-14, "sampling/sampling_logp_difference/max": 4.2810516357421875, "sampling/sampling_logp_difference/mean": 0.7636876106262207, "step": 1193, "step_time": 10.86995731000934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 3.9341584146022797, "epoch": 0.01194, "grad_norm": 0.09223005175590515, "kl": 0.6287758313119411, "learning_rate": 9.99937941941783e-06, "loss": -0.0014, "step": 1194, "step_time": 5.958785358998284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1581.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 661.53125, "completions/mean_terminated_length": 704.5667114257812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.726432919502258, "epoch": 0.01195, "frac_reward_zero_std": 0.0, "grad_norm": 0.038067691028118134, "kl": 0.6480265259742737, "learning_rate": 9.999378347170195e-06, "loss": -0.0095, "num_tokens": 30100392.0, "reward": 0.5064792633056641, "reward_std": 0.6089417934417725, "rewards/rollout_reward_func/mean": 0.5064792633056641, "rewards/rollout_reward_func/std": 0.9461585283279419, "sampling/importance_sampling_ratio/max": 0.5538952946662903, "sampling/importance_sampling_ratio/mean": 0.2073923796415329, "sampling/importance_sampling_ratio/min": 8.886754443437894e-09, "sampling/sampling_logp_difference/max": 3.915250778198242, "sampling/sampling_logp_difference/mean": 0.8153954744338989, "step": 1195, "step_time": 11.484141081986309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.703816145658493, "epoch": 0.01196, "grad_norm": 0.03321937844157219, "kl": 0.6446032486855984, "learning_rate": 9.999377273997111e-06, "loss": -0.0096, "step": 1196, "step_time": 6.390888337002252 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 2225.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 739.75, "completions/mean_terminated_length": 739.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.336666435003281, "epoch": 0.01197, "frac_reward_zero_std": 0.25, "grad_norm": 0.0515507310628891, "kl": 0.6440066285431385, "learning_rate": 9.999376199898583e-06, "loss": -0.0028, "num_tokens": 30163958.0, "reward": 0.9868500232696533, "reward_std": 0.4521673321723938, "rewards/rollout_reward_func/mean": 0.9868500232696533, "rewards/rollout_reward_func/std": 0.6308428645133972, "sampling/importance_sampling_ratio/max": 0.5551838278770447, "sampling/importance_sampling_ratio/mean": 0.21022029221057892, "sampling/importance_sampling_ratio/min": 1.5896804272247378e-12, "sampling/sampling_logp_difference/max": 13.405414581298828, "sampling/sampling_logp_difference/mean": 0.7673800587654114, "step": 1197, "step_time": 14.54923889299971 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 4.295549422502518, "epoch": 0.01198, "grad_norm": 0.059126630425453186, "kl": 0.6473887078464031, "learning_rate": 9.99937512487461e-06, "loss": -0.0028, "step": 1198, "step_time": 7.7095323739995365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 410.875, "completions/mean_terminated_length": 410.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.702938288450241, "epoch": 0.01199, "frac_reward_zero_std": 0.25, "grad_norm": 0.016283251345157623, "kl": 0.6922013834118843, "learning_rate": 9.99937404892519e-06, "loss": -0.0052, "num_tokens": 30215972.0, "reward": 0.6788020133972168, "reward_std": 0.18753820657730103, "rewards/rollout_reward_func/mean": 0.6788020133972168, "rewards/rollout_reward_func/std": 0.9068642854690552, "sampling/importance_sampling_ratio/max": 0.5520418882369995, "sampling/importance_sampling_ratio/mean": 0.29258543252944946, "sampling/importance_sampling_ratio/min": 2.698536706591259e-12, "sampling/sampling_logp_difference/max": 4.760934352874756, "sampling/sampling_logp_difference/mean": 0.6035001873970032, "step": 1199, "step_time": 8.637438573008694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.676440268754959, "epoch": 0.012, "grad_norm": 0.01488407701253891, "kl": 0.6947975270450115, "learning_rate": 9.999372972050326e-06, "loss": -0.0052, "step": 1200, "step_time": 4.577128646997153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 1207.46875, "completions/mean_terminated_length": 1207.46875, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "entropy": 4.49423822760582, "epoch": 0.01201, "frac_reward_zero_std": 0.0, "grad_norm": 0.025468900799751282, "kl": 0.5083098597824574, "learning_rate": 9.999371894250018e-06, "loss": -0.0133, "num_tokens": 30296475.0, "reward": 0.7533478736877441, "reward_std": 0.9513682126998901, "rewards/rollout_reward_func/mean": 0.7533478736877441, "rewards/rollout_reward_func/std": 0.9120132923126221, "sampling/importance_sampling_ratio/max": 0.30374956130981445, "sampling/importance_sampling_ratio/mean": 0.11839031428098679, "sampling/importance_sampling_ratio/min": 1.7717077804846326e-10, "sampling/sampling_logp_difference/max": 13.97607135772705, "sampling/sampling_logp_difference/mean": 0.7900880575180054, "step": 1201, "step_time": 14.670873323986598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.471497505903244, "epoch": 0.01202, "grad_norm": 0.023343946784734726, "kl": 0.511919941753149, "learning_rate": 9.999370815524266e-06, "loss": -0.0133, "step": 1202, "step_time": 8.690286238990666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 1147.8125, "completions/mean_terminated_length": 1147.8125, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 4.378250241279602, "epoch": 0.01203, "frac_reward_zero_std": 0.0, "grad_norm": 0.01563016138970852, "kl": 0.5835802592337132, "learning_rate": 9.999369735873068e-06, "loss": -0.0046, "num_tokens": 30375535.0, "reward": 0.5597470998764038, "reward_std": 0.8640182018280029, "rewards/rollout_reward_func/mean": 0.5597470998764038, "rewards/rollout_reward_func/std": 1.0042974948883057, "sampling/importance_sampling_ratio/max": 0.2963108420372009, "sampling/importance_sampling_ratio/mean": 0.0869191512465477, "sampling/importance_sampling_ratio/min": 5.499048993939937e-10, "sampling/sampling_logp_difference/max": 13.94407844543457, "sampling/sampling_logp_difference/mean": 0.7006374597549438, "step": 1203, "step_time": 14.492602947990235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.35752409696579, "epoch": 0.01204, "grad_norm": 0.015314022079110146, "kl": 0.5831968188285828, "learning_rate": 9.999368655296428e-06, "loss": -0.0046, "step": 1204, "step_time": 7.787133258992981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 903.25, "completions/mean_terminated_length": 915.0, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "entropy": 4.637510180473328, "epoch": 0.01205, "frac_reward_zero_std": 0.0, "grad_norm": 0.031131142750382423, "kl": 0.5826076939702034, "learning_rate": 9.999367573794344e-06, "loss": -0.0047, "num_tokens": 30445281.0, "reward": 0.42256832122802734, "reward_std": 0.48754966259002686, "rewards/rollout_reward_func/mean": 0.42256832122802734, "rewards/rollout_reward_func/std": 0.9823398590087891, "sampling/importance_sampling_ratio/max": 0.3001525402069092, "sampling/importance_sampling_ratio/mean": 0.12432420998811722, "sampling/importance_sampling_ratio/min": 6.341764136399974e-14, "sampling/sampling_logp_difference/max": 4.333077907562256, "sampling/sampling_logp_difference/mean": 0.8627938628196716, "step": 1205, "step_time": 13.302236946001358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.603991806507111, "epoch": 0.01206, "grad_norm": 0.02977268025279045, "kl": 0.5876617208123207, "learning_rate": 9.999366491366816e-06, "loss": -0.0048, "step": 1206, "step_time": 7.1491689150061575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3819161355495453, "epoch": 0.01207, "frac_reward_zero_std": 0.5, "grad_norm": 0.00818544253706932, "kl": 0.8267941027879715, "learning_rate": 9.999365408013845e-06, "loss": -0.0086, "num_tokens": 30486862.0, "reward": 1.0346903800964355, "reward_std": 0.19615814089775085, "rewards/rollout_reward_func/mean": 1.0346903800964355, "rewards/rollout_reward_func/std": 0.36588937044143677, "sampling/importance_sampling_ratio/max": 0.5551024079322815, "sampling/importance_sampling_ratio/mean": 0.4399452209472656, "sampling/importance_sampling_ratio/min": 1.7513499983579095e-07, "sampling/sampling_logp_difference/max": 3.6537575721740723, "sampling/sampling_logp_difference/mean": 0.5651035308837891, "step": 1207, "step_time": 8.049823282010038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3821807503700256, "epoch": 0.01208, "grad_norm": 0.008217578753829002, "kl": 0.827524371445179, "learning_rate": 9.999364323735433e-06, "loss": -0.0086, "step": 1208, "step_time": 4.765591594994476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1574.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 447.21875, "completions/mean_terminated_length": 447.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7578055262565613, "epoch": 0.01209, "frac_reward_zero_std": 0.25, "grad_norm": 0.22409510612487793, "kl": 0.7828532978892326, "learning_rate": 9.999363238531578e-06, "loss": 0.0014, "num_tokens": 30541314.0, "reward": 1.1848576068878174, "reward_std": 0.012876907363533974, "rewards/rollout_reward_func/mean": 1.1848576068878174, "rewards/rollout_reward_func/std": 0.050628237426280975, "sampling/importance_sampling_ratio/max": 0.55485999584198, "sampling/importance_sampling_ratio/mean": 0.343207448720932, "sampling/importance_sampling_ratio/min": 0.15564602613449097, "sampling/sampling_logp_difference/max": 0.7755091786384583, "sampling/sampling_logp_difference/mean": 0.3131253719329834, "step": 1209, "step_time": 12.229252737990464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 2.745770573616028, "epoch": 0.0121, "grad_norm": 0.032414842396974564, "kl": 0.7781141996383667, "learning_rate": 9.99936215240228e-06, "loss": 0.0008, "step": 1210, "step_time": 6.574419006996322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 430.28125, "completions/mean_terminated_length": 457.9000244140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3000855147838593, "epoch": 0.01211, "frac_reward_zero_std": 0.25, "grad_norm": 0.03231727331876755, "kl": 0.6434760130941868, "learning_rate": 9.999361065347541e-06, "loss": -0.0031, "num_tokens": 30592925.0, "reward": 1.1526910066604614, "reward_std": 0.02666318789124489, "rewards/rollout_reward_func/mean": 1.1526910066604614, "rewards/rollout_reward_func/std": 0.09035911411046982, "sampling/importance_sampling_ratio/max": 0.5553015470504761, "sampling/importance_sampling_ratio/mean": 0.35828378796577454, "sampling/importance_sampling_ratio/min": 3.869442544157664e-09, "sampling/sampling_logp_difference/max": 4.336469650268555, "sampling/sampling_logp_difference/mean": 0.5353835821151733, "step": 1211, "step_time": 12.930880864994833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3005562722682953, "epoch": 0.01212, "grad_norm": 0.03038928285241127, "kl": 0.64108582213521, "learning_rate": 9.99935997736736e-06, "loss": -0.0033, "step": 1212, "step_time": 7.429966409988992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 625.90625, "completions/mean_terminated_length": 625.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.5110959708690643, "epoch": 0.01213, "frac_reward_zero_std": 0.5, "grad_norm": 0.01754441112279892, "kl": 0.6712074726819992, "learning_rate": 9.999358888461737e-06, "loss": -0.0055, "num_tokens": 30652586.0, "reward": 0.8842005729675293, "reward_std": 0.5080546736717224, "rewards/rollout_reward_func/mean": 0.8842005729675293, "rewards/rollout_reward_func/std": 0.7463345527648926, "sampling/importance_sampling_ratio/max": 0.5558585524559021, "sampling/importance_sampling_ratio/mean": 0.31954100728034973, "sampling/importance_sampling_ratio/min": 3.6160026084530728e-09, "sampling/sampling_logp_difference/max": 3.857103109359741, "sampling/sampling_logp_difference/mean": 0.5626006722450256, "step": 1213, "step_time": 11.294329030009976 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.50175878405571, "epoch": 0.01214, "grad_norm": 0.017251180484890938, "kl": 0.6736801341176033, "learning_rate": 9.999357798630673e-06, "loss": -0.0056, "step": 1214, "step_time": 6.7768876219997765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2072.0, "completions/max_terminated_length": 2072.0, "completions/mean_length": 665.96875, "completions/mean_terminated_length": 665.96875, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 3.164253294467926, "epoch": 0.01215, "frac_reward_zero_std": 0.0, "grad_norm": 0.03656580299139023, "kl": 0.679936770349741, "learning_rate": 9.99935670787417e-06, "loss": -0.0061, "num_tokens": 30715583.0, "reward": 1.1383252143859863, "reward_std": 0.20792216062545776, "rewards/rollout_reward_func/mean": 1.1383252143859863, "rewards/rollout_reward_func/std": 0.35952383279800415, "sampling/importance_sampling_ratio/max": 0.3198475241661072, "sampling/importance_sampling_ratio/mean": 0.24836167693138123, "sampling/importance_sampling_ratio/min": 0.0016719232080504298, "sampling/sampling_logp_difference/max": 2.1197571754455566, "sampling/sampling_logp_difference/mean": 0.3813033699989319, "step": 1215, "step_time": 13.540808684992953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.156970113515854, "epoch": 0.01216, "grad_norm": 0.02888025902211666, "kl": 0.6868201121687889, "learning_rate": 9.999355616192225e-06, "loss": -0.0062, "step": 1216, "step_time": 7.43322497200279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 752.96875, "completions/mean_terminated_length": 752.96875, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 3.6212147772312164, "epoch": 0.01217, "frac_reward_zero_std": 0.0, "grad_norm": 0.06346023082733154, "kl": 0.5016878396272659, "learning_rate": 9.99935452358484e-06, "loss": -0.0045, "num_tokens": 30781606.0, "reward": 1.2044453620910645, "reward_std": 0.0488470196723938, "rewards/rollout_reward_func/mean": 1.2044453620910645, "rewards/rollout_reward_func/std": 0.054275672882795334, "sampling/importance_sampling_ratio/max": 0.32221803069114685, "sampling/importance_sampling_ratio/mean": 0.20629873871803284, "sampling/importance_sampling_ratio/min": 2.7656639131623528e-11, "sampling/sampling_logp_difference/max": 4.610589027404785, "sampling/sampling_logp_difference/mean": 0.5817146301269531, "step": 1217, "step_time": 12.389779514989641 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.627200275659561, "epoch": 0.01218, "grad_norm": 0.061596520245075226, "kl": 0.5003233291208744, "learning_rate": 9.999353430052015e-06, "loss": -0.0048, "step": 1218, "step_time": 7.081240396000794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2177.0, "completions/max_terminated_length": 2177.0, "completions/mean_length": 409.8125, "completions/mean_terminated_length": 409.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.414701223373413, "epoch": 0.01219, "frac_reward_zero_std": 0.75, "grad_norm": 0.008170461282134056, "kl": 0.7943335771560669, "learning_rate": 9.999352335593749e-06, "loss": -0.0043, "num_tokens": 30830570.0, "reward": 0.8697896003723145, "reward_std": 0.24520204961299896, "rewards/rollout_reward_func/mean": 0.8697896003723145, "rewards/rollout_reward_func/std": 0.6055601835250854, "sampling/importance_sampling_ratio/max": 0.555368185043335, "sampling/importance_sampling_ratio/mean": 0.42669767141342163, "sampling/importance_sampling_ratio/min": 0.00023858222994022071, "sampling/sampling_logp_difference/max": 1.9322383403778076, "sampling/sampling_logp_difference/mean": 0.4723881185054779, "step": 1219, "step_time": 13.342609293009446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.423911541700363, "epoch": 0.0122, "grad_norm": 0.008038177154958248, "kl": 0.7947252914309502, "learning_rate": 9.999351240210043e-06, "loss": -0.0043, "step": 1220, "step_time": 7.758256399014499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1417.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 684.4375, "completions/mean_terminated_length": 660.8064575195312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.830541282892227, "epoch": 0.01221, "frac_reward_zero_std": 0.0, "grad_norm": 0.1743815392255783, "kl": 0.6816451773047447, "learning_rate": 9.9993501439009e-06, "loss": 0.0019, "num_tokens": 30893362.0, "reward": 0.04876452684402466, "reward_std": 0.7817049026489258, "rewards/rollout_reward_func/mean": 0.04876452684402466, "rewards/rollout_reward_func/std": 1.1229263544082642, "sampling/importance_sampling_ratio/max": 0.5581764578819275, "sampling/importance_sampling_ratio/mean": 0.2296110838651657, "sampling/importance_sampling_ratio/min": 7.836439830377628e-11, "sampling/sampling_logp_difference/max": 3.68752384185791, "sampling/sampling_logp_difference/mean": 0.6349436640739441, "step": 1221, "step_time": 12.44589276798797 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.03177083400078118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0421875009778887, "entropy": 3.826026886701584, "epoch": 0.01222, "grad_norm": 0.02724592201411724, "kl": 0.6646308936178684, "learning_rate": 9.999349046666318e-06, "loss": 0.0009, "step": 1222, "step_time": 6.334305999982462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.03125, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 978.625, "completions/mean_terminated_length": 1009.1612548828125, "completions/min_length": 32.0, "completions/min_terminated_length": 504.0, "entropy": 4.287299692630768, "epoch": 0.01223, "frac_reward_zero_std": 0.0, "grad_norm": 0.01582840457558632, "kl": 0.5804429613053799, "learning_rate": 9.999347948506298e-06, "loss": -0.0168, "num_tokens": 30966422.0, "reward": 0.6382193565368652, "reward_std": 0.2187040150165558, "rewards/rollout_reward_func/mean": 0.6382193565368652, "rewards/rollout_reward_func/std": 0.9787591695785522, "sampling/importance_sampling_ratio/max": 0.30203667283058167, "sampling/importance_sampling_ratio/mean": 0.1464533507823944, "sampling/importance_sampling_ratio/min": 1.5307259518416996e-32, "sampling/sampling_logp_difference/max": 12.725556373596191, "sampling/sampling_logp_difference/mean": 0.911911129951477, "step": 1223, "step_time": 13.97464749601204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.290394902229309, "epoch": 0.01224, "grad_norm": 0.016664188355207443, "kl": 0.5789789631962776, "learning_rate": 9.999346849420837e-06, "loss": -0.0168, "step": 1224, "step_time": 7.814566091001325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1433.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 730.40625, "completions/mean_terminated_length": 753.4515991210938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.110022634267807, "epoch": 0.01225, "frac_reward_zero_std": 0.0, "grad_norm": 0.06321551650762558, "kl": 0.567928247153759, "learning_rate": 9.99934574940994e-06, "loss": -0.0092, "num_tokens": 31030111.0, "reward": -0.327337384223938, "reward_std": 0.4512665569782257, "rewards/rollout_reward_func/mean": -0.327337384223938, "rewards/rollout_reward_func/std": 0.9852256178855896, "sampling/importance_sampling_ratio/max": 0.5513493418693542, "sampling/importance_sampling_ratio/mean": 0.2100791335105896, "sampling/importance_sampling_ratio/min": 1.4223767944241132e-15, "sampling/sampling_logp_difference/max": 12.455384254455566, "sampling/sampling_logp_difference/mean": 0.8308519124984741, "step": 1225, "step_time": 11.193486066978949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.111831903457642, "epoch": 0.01226, "grad_norm": 0.06346433609724045, "kl": 0.5681986883282661, "learning_rate": 9.999344648473603e-06, "loss": -0.0094, "step": 1226, "step_time": 7.250342962004652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2221.0, "completions/max_terminated_length": 2221.0, "completions/mean_length": 924.90625, "completions/mean_terminated_length": 887.6773681640625, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 3.97314590215683, "epoch": 0.01227, "frac_reward_zero_std": 0.25, "grad_norm": 0.03804680332541466, "kl": 0.6474037915468216, "learning_rate": 9.99934354661183e-06, "loss": -0.0125, "num_tokens": 31100342.0, "reward": 0.18337027728557587, "reward_std": 0.8256643414497375, "rewards/rollout_reward_func/mean": 0.18337027728557587, "rewards/rollout_reward_func/std": 1.1453940868377686, "sampling/importance_sampling_ratio/max": 0.3087674677371979, "sampling/importance_sampling_ratio/mean": 0.17439767718315125, "sampling/importance_sampling_ratio/min": 9.967584614779107e-17, "sampling/sampling_logp_difference/max": 3.4182145595550537, "sampling/sampling_logp_difference/mean": 0.6377203464508057, "step": 1227, "step_time": 14.469932655003504 }, { "clip_ratio/high_max": 0.041666666977107525, "clip_ratio/high_mean": 0.020833333488553762, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020833333488553762, "entropy": 3.984426975250244, "epoch": 0.01228, "grad_norm": 0.013067997992038727, "kl": 0.6470359899103642, "learning_rate": 9.99934244382462e-06, "loss": -0.0126, "step": 1228, "step_time": 7.806654173997231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 846.15625, "completions/mean_terminated_length": 846.15625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 4.217070072889328, "epoch": 0.01229, "frac_reward_zero_std": 0.0, "grad_norm": 0.11828513443470001, "kl": 0.5894293412566185, "learning_rate": 9.999341340111972e-06, "loss": -0.009, "num_tokens": 31169355.0, "reward": 0.4881356656551361, "reward_std": 0.482641339302063, "rewards/rollout_reward_func/mean": 0.4881356656551361, "rewards/rollout_reward_func/std": 1.061123013496399, "sampling/importance_sampling_ratio/max": 0.3236539959907532, "sampling/importance_sampling_ratio/mean": 0.1441228836774826, "sampling/importance_sampling_ratio/min": 4.5235348800076736e-09, "sampling/sampling_logp_difference/max": 12.097009658813477, "sampling/sampling_logp_difference/mean": 0.6902893781661987, "step": 1229, "step_time": 11.875824177994218 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.255027174949646, "epoch": 0.0123, "grad_norm": 0.10791677236557007, "kl": 0.5859774351119995, "learning_rate": 9.999340235473887e-06, "loss": -0.0092, "step": 1230, "step_time": 6.5088541060031275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 431.84375, "completions/mean_terminated_length": 445.258056640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.073386967182159, "epoch": 0.01231, "frac_reward_zero_std": 0.25, "grad_norm": 0.16894599795341492, "kl": 0.6242465078830719, "learning_rate": 9.999339129910366e-06, "loss": -0.0072, "num_tokens": 31223654.0, "reward": 0.07694172859191895, "reward_std": 0.04529460892081261, "rewards/rollout_reward_func/mean": 0.07694172859191895, "rewards/rollout_reward_func/std": 1.1483973264694214, "sampling/importance_sampling_ratio/max": 0.5515030026435852, "sampling/importance_sampling_ratio/mean": 0.2342691570520401, "sampling/importance_sampling_ratio/min": 9.628174807045165e-16, "sampling/sampling_logp_difference/max": 4.6477580070495605, "sampling/sampling_logp_difference/mean": 0.7860992550849915, "step": 1231, "step_time": 9.311403612009599 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017361111473292112, "entropy": 4.110050588846207, "epoch": 0.01232, "grad_norm": 0.10857035219669342, "kl": 0.6200901828706264, "learning_rate": 9.99933802342141e-06, "loss": -0.0077, "step": 1232, "step_time": 5.215604840996093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 1099.5, "completions/mean_terminated_length": 1079.838623046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.100472033023834, "epoch": 0.01233, "frac_reward_zero_std": 0.5, "grad_norm": 0.013139261864125729, "kl": 0.5643606744706631, "learning_rate": 9.999336916007016e-06, "loss": 0.0003, "num_tokens": 31299345.0, "reward": 0.7542826533317566, "reward_std": 0.31100496649742126, "rewards/rollout_reward_func/mean": 0.7542826533317566, "rewards/rollout_reward_func/std": 0.8403386473655701, "sampling/importance_sampling_ratio/max": 0.5547218322753906, "sampling/importance_sampling_ratio/mean": 0.18474668264389038, "sampling/importance_sampling_ratio/min": 2.1483508399447633e-14, "sampling/sampling_logp_difference/max": 3.352707862854004, "sampling/sampling_logp_difference/mean": 0.6245948076248169, "step": 1233, "step_time": 14.832796977003454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.138463139533997, "epoch": 0.01234, "grad_norm": 0.011962363496422768, "kl": 0.5591786503791809, "learning_rate": 9.999335807667186e-06, "loss": 0.0003, "step": 1234, "step_time": 7.971912208988215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1949.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 453.28125, "completions/mean_terminated_length": 417.6773986816406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.9734123945236206, "epoch": 0.01235, "frac_reward_zero_std": 0.75, "grad_norm": 0.006405400577932596, "kl": 0.6872255429625511, "learning_rate": 9.999334698401922e-06, "loss": -0.0001, "num_tokens": 31352881.0, "reward": 1.0085395574569702, "reward_std": 0.23961913585662842, "rewards/rollout_reward_func/mean": 1.0085395574569702, "rewards/rollout_reward_func/std": 0.48345622420310974, "sampling/importance_sampling_ratio/max": 0.5572970509529114, "sampling/importance_sampling_ratio/mean": 0.33975231647491455, "sampling/importance_sampling_ratio/min": 5.794218211199197e-19, "sampling/sampling_logp_difference/max": 3.9544143676757812, "sampling/sampling_logp_difference/mean": 0.695808470249176, "step": 1235, "step_time": 12.684723050013417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.0119712352752686, "epoch": 0.01236, "grad_norm": 0.006563367322087288, "kl": 0.6824967861175537, "learning_rate": 9.999333588211223e-06, "loss": -0.0001, "step": 1236, "step_time": 7.2355384129987215 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1669.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 479.59375, "completions/mean_terminated_length": 479.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.727198898792267, "epoch": 0.01237, "frac_reward_zero_std": 0.5, "grad_norm": 0.009275385178625584, "kl": 0.8191529735922813, "learning_rate": 9.999332477095089e-06, "loss": -0.0062, "num_tokens": 31408233.0, "reward": -0.058660537004470825, "reward_std": 0.3741148114204407, "rewards/rollout_reward_func/mean": -0.058660537004470825, "rewards/rollout_reward_func/std": 1.117026686668396, "sampling/importance_sampling_ratio/max": 0.5535739660263062, "sampling/importance_sampling_ratio/mean": 0.3251311779022217, "sampling/importance_sampling_ratio/min": 8.714073261631677e-16, "sampling/sampling_logp_difference/max": 11.992135047912598, "sampling/sampling_logp_difference/mean": 0.6132540702819824, "step": 1237, "step_time": 12.83367253200413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.732455849647522, "epoch": 0.01238, "grad_norm": 0.009264407679438591, "kl": 0.8180783316493034, "learning_rate": 9.99933136505352e-06, "loss": -0.0062, "step": 1238, "step_time": 6.5212363049940905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 298.03125, "completions/mean_terminated_length": 287.45159912109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3207612931728363, "epoch": 0.01239, "frac_reward_zero_std": 0.5, "grad_norm": 0.06746633350849152, "kl": 0.685741737484932, "learning_rate": 9.999330252086517e-06, "loss": -0.009, "num_tokens": 31457624.0, "reward": 0.7171703577041626, "reward_std": 0.2679339647293091, "rewards/rollout_reward_func/mean": 0.7171703577041626, "rewards/rollout_reward_func/std": 0.8485594987869263, "sampling/importance_sampling_ratio/max": 0.5580729246139526, "sampling/importance_sampling_ratio/mean": 0.36587268114089966, "sampling/importance_sampling_ratio/min": 7.80161699132087e-11, "sampling/sampling_logp_difference/max": 4.441754341125488, "sampling/sampling_logp_difference/mean": 0.49181443452835083, "step": 1239, "step_time": 8.776469708005607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3621975481510162, "epoch": 0.0124, "grad_norm": 0.06399620324373245, "kl": 0.6831664256751537, "learning_rate": 9.99932913819408e-06, "loss": -0.0092, "step": 1240, "step_time": 4.6262271820087335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 691.625, "completions/mean_terminated_length": 691.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.512029618024826, "epoch": 0.01241, "frac_reward_zero_std": 0.25, "grad_norm": 0.03193923085927963, "kl": 0.5508214831352234, "learning_rate": 9.99932802337621e-06, "loss": -0.0143, "num_tokens": 31520181.0, "reward": 0.2840714454650879, "reward_std": 0.7077757120132446, "rewards/rollout_reward_func/mean": 0.2840714454650879, "rewards/rollout_reward_func/std": 1.1123276948928833, "sampling/importance_sampling_ratio/max": 0.5570213198661804, "sampling/importance_sampling_ratio/mean": 0.2743782103061676, "sampling/importance_sampling_ratio/min": 1.6654446881148033e-05, "sampling/sampling_logp_difference/max": 2.609297037124634, "sampling/sampling_logp_difference/mean": 0.48059189319610596, "step": 1241, "step_time": 12.591763746000652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.5447511076927185, "epoch": 0.01242, "grad_norm": 0.019985534250736237, "kl": 0.5481614172458649, "learning_rate": 9.999326907632905e-06, "loss": -0.0144, "step": 1242, "step_time": 6.97313562099589 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0625, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 631.3125, "completions/mean_terminated_length": 631.6333618164062, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 4.488965481519699, "epoch": 0.01243, "frac_reward_zero_std": 0.0, "grad_norm": 0.03942490741610527, "kl": 0.4160641245543957, "learning_rate": 9.999325790964166e-06, "loss": -0.0191, "num_tokens": 31581885.0, "reward": 0.9912247657775879, "reward_std": 0.4675348401069641, "rewards/rollout_reward_func/mean": 0.9912247657775879, "rewards/rollout_reward_func/std": 0.6405479311943054, "sampling/importance_sampling_ratio/max": 0.3071295917034149, "sampling/importance_sampling_ratio/mean": 0.1841922104358673, "sampling/importance_sampling_ratio/min": 8.005273444391318e-16, "sampling/sampling_logp_difference/max": 4.355010986328125, "sampling/sampling_logp_difference/mean": 0.8959228992462158, "step": 1243, "step_time": 11.563698917998408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.519833505153656, "epoch": 0.01244, "grad_norm": 0.04387421905994415, "kl": 0.41502608358860016, "learning_rate": 9.999324673369997e-06, "loss": -0.019, "step": 1244, "step_time": 6.644062097984715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1958.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 388.28125, "completions/mean_terminated_length": 388.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6269041001796722, "epoch": 0.01245, "frac_reward_zero_std": 0.5, "grad_norm": 0.01819738931953907, "kl": 0.9011517912149429, "learning_rate": 9.999323554850393e-06, "loss": -0.0072, "num_tokens": 31633841.0, "reward": 0.5669185519218445, "reward_std": 0.21519644558429718, "rewards/rollout_reward_func/mean": 0.5669185519218445, "rewards/rollout_reward_func/std": 0.9609359502792358, "sampling/importance_sampling_ratio/max": 0.5570449233055115, "sampling/importance_sampling_ratio/mean": 0.32629165053367615, "sampling/importance_sampling_ratio/min": 2.6337984309066087e-05, "sampling/sampling_logp_difference/max": 2.8232059478759766, "sampling/sampling_logp_difference/mean": 0.5128524303436279, "step": 1245, "step_time": 13.06394443300087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.613847464323044, "epoch": 0.01246, "grad_norm": 0.01735157147049904, "kl": 0.8980639949440956, "learning_rate": 9.999322435405358e-06, "loss": -0.0072, "step": 1246, "step_time": 7.249987791990861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 529.5625, "completions/mean_terminated_length": 529.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.899559259414673, "epoch": 0.01247, "frac_reward_zero_std": 0.25, "grad_norm": 0.027795899659395218, "kl": 0.6479290574789047, "learning_rate": 9.99932131503489e-06, "loss": -0.0125, "num_tokens": 31689401.0, "reward": 0.8121272325515747, "reward_std": 0.7185967564582825, "rewards/rollout_reward_func/mean": 0.8121272325515747, "rewards/rollout_reward_func/std": 0.8398616909980774, "sampling/importance_sampling_ratio/max": 0.5572031140327454, "sampling/importance_sampling_ratio/mean": 0.3155647814273834, "sampling/importance_sampling_ratio/min": 0.0002566063776612282, "sampling/sampling_logp_difference/max": 4.029297828674316, "sampling/sampling_logp_difference/mean": 0.6735057830810547, "step": 1247, "step_time": 11.258277947999886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.882573753595352, "epoch": 0.01248, "grad_norm": 0.027684245258569717, "kl": 0.6479145213961601, "learning_rate": 9.99932019373899e-06, "loss": -0.0125, "step": 1248, "step_time": 6.39592753398756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 736.1875, "completions/mean_terminated_length": 739.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.732678681612015, "epoch": 0.01249, "frac_reward_zero_std": 0.25, "grad_norm": 0.06375191360712051, "kl": 0.5718090608716011, "learning_rate": 9.99931907151766e-06, "loss": 0.0011, "num_tokens": 31753003.0, "reward": 0.3322821259498596, "reward_std": 0.5048651695251465, "rewards/rollout_reward_func/mean": 0.3322821259498596, "rewards/rollout_reward_func/std": 1.0840765237808228, "sampling/importance_sampling_ratio/max": 0.5602362751960754, "sampling/importance_sampling_ratio/mean": 0.22835814952850342, "sampling/importance_sampling_ratio/min": 2.7645310560621243e-14, "sampling/sampling_logp_difference/max": 13.609217643737793, "sampling/sampling_logp_difference/mean": 1.0459851026535034, "step": 1249, "step_time": 14.576066597001045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.731489449739456, "epoch": 0.0125, "grad_norm": 0.06198830157518387, "kl": 0.5789658799767494, "learning_rate": 9.999317948370898e-06, "loss": 0.0011, "step": 1250, "step_time": 7.7245010290062055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2160.0, "completions/max_terminated_length": 2160.0, "completions/mean_length": 754.625, "completions/mean_terminated_length": 754.625, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "entropy": 3.7323490977287292, "epoch": 0.01251, "frac_reward_zero_std": 0.5, "grad_norm": 0.010161362588405609, "kl": 0.6000053659081459, "learning_rate": 9.999316824298703e-06, "loss": -0.004, "num_tokens": 31818675.0, "reward": 1.1158851385116577, "reward_std": 0.27006176114082336, "rewards/rollout_reward_func/mean": 1.1158851385116577, "rewards/rollout_reward_func/std": 0.403390109539032, "sampling/importance_sampling_ratio/max": 0.3084176480770111, "sampling/importance_sampling_ratio/mean": 0.20223353803157806, "sampling/importance_sampling_ratio/min": 4.484264718485065e-05, "sampling/sampling_logp_difference/max": 3.3974430561065674, "sampling/sampling_logp_difference/mean": 0.5189021229743958, "step": 1251, "step_time": 13.887878738998552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7233015596866608, "epoch": 0.01252, "grad_norm": 0.009712607599794865, "kl": 0.5993819981813431, "learning_rate": 9.999315699301079e-06, "loss": -0.004, "step": 1252, "step_time": 7.7751688589996775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 301.53125, "completions/mean_terminated_length": 301.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.009134948253632, "epoch": 0.01253, "frac_reward_zero_std": 0.25, "grad_norm": 0.19206245243549347, "kl": 0.6230307333171368, "learning_rate": 9.999314573378024e-06, "loss": -0.0122, "num_tokens": 31866930.0, "reward": 0.8143880367279053, "reward_std": 0.5314890742301941, "rewards/rollout_reward_func/mean": 0.8143880367279053, "rewards/rollout_reward_func/std": 0.7769364714622498, "sampling/importance_sampling_ratio/max": 0.5567402243614197, "sampling/importance_sampling_ratio/mean": 0.2986701726913452, "sampling/importance_sampling_ratio/min": 2.9572143345202306e-11, "sampling/sampling_logp_difference/max": 13.463057518005371, "sampling/sampling_logp_difference/mean": 0.8260948657989502, "step": 1253, "step_time": 8.435031176995835 }, { "clip_ratio/high_max": 0.05208333395421505, "clip_ratio/high_mean": 0.026041666977107525, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875000931322575, "entropy": 3.9749020636081696, "epoch": 0.01254, "grad_norm": 0.032276496291160583, "kl": 0.624556265771389, "learning_rate": 9.999313446529542e-06, "loss": -0.0125, "step": 1254, "step_time": 5.175024744006805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 452.65625, "completions/mean_terminated_length": 452.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9670375883579254, "epoch": 0.01255, "frac_reward_zero_std": 0.5, "grad_norm": 0.09613366425037384, "kl": 0.806202806532383, "learning_rate": 9.999312318755627e-06, "loss": -0.0, "num_tokens": 31921485.0, "reward": 0.9597926139831543, "reward_std": 0.28345921635627747, "rewards/rollout_reward_func/mean": 0.9597926139831543, "rewards/rollout_reward_func/std": 0.6456955075263977, "sampling/importance_sampling_ratio/max": 0.5508411526679993, "sampling/importance_sampling_ratio/mean": 0.3328312039375305, "sampling/importance_sampling_ratio/min": 0.015155784785747528, "sampling/sampling_logp_difference/max": 2.654010057449341, "sampling/sampling_logp_difference/mean": 0.3549700975418091, "step": 1255, "step_time": 9.293799269995361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 2.9000627398490906, "epoch": 0.01256, "grad_norm": 0.0346805565059185, "kl": 0.8456034064292908, "learning_rate": 9.999311190056283e-06, "loss": -0.0002, "step": 1256, "step_time": 4.676962834004371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 534.78125, "completions/mean_terminated_length": 534.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.092135727405548, "epoch": 0.01257, "frac_reward_zero_std": 0.5, "grad_norm": 0.03203801438212395, "kl": 0.6317851692438126, "learning_rate": 9.999310060431508e-06, "loss": -0.0042, "num_tokens": 31978880.0, "reward": 1.1904221773147583, "reward_std": 0.017530139535665512, "rewards/rollout_reward_func/mean": 1.1904221773147583, "rewards/rollout_reward_func/std": 0.06924603879451752, "sampling/importance_sampling_ratio/max": 0.5550224184989929, "sampling/importance_sampling_ratio/mean": 0.3051033914089203, "sampling/importance_sampling_ratio/min": 4.480753333473142e-13, "sampling/sampling_logp_difference/max": 3.917442560195923, "sampling/sampling_logp_difference/mean": 0.5212812423706055, "step": 1257, "step_time": 11.426184917007049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.079973489046097, "epoch": 0.01258, "grad_norm": 0.03926754742860794, "kl": 0.6329767592251301, "learning_rate": 9.999308929881305e-06, "loss": -0.0043, "step": 1258, "step_time": 6.322114848990168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1419.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 836.40625, "completions/mean_terminated_length": 817.6128540039062, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 3.654350131750107, "epoch": 0.01259, "frac_reward_zero_std": 0.25, "grad_norm": 0.0498635433614254, "kl": 0.6708957850933075, "learning_rate": 9.999307798405675e-06, "loss": 0.0007, "num_tokens": 32045819.0, "reward": 0.6214665174484253, "reward_std": 0.40967410802841187, "rewards/rollout_reward_func/mean": 0.6214665174484253, "rewards/rollout_reward_func/std": 0.9714562892913818, "sampling/importance_sampling_ratio/max": 0.30184125900268555, "sampling/importance_sampling_ratio/mean": 0.17642425000667572, "sampling/importance_sampling_ratio/min": 9.233243510042843e-12, "sampling/sampling_logp_difference/max": 13.55783748626709, "sampling/sampling_logp_difference/mean": 0.6307730674743652, "step": 1259, "step_time": 11.485134109003411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.625071406364441, "epoch": 0.0126, "grad_norm": 0.04663068428635597, "kl": 0.675637736916542, "learning_rate": 9.999306666004616e-06, "loss": 0.0006, "step": 1260, "step_time": 6.66451648700604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 545.5625, "completions/mean_terminated_length": 545.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1471433341503143, "epoch": 0.01261, "frac_reward_zero_std": 0.75, "grad_norm": 0.010119190439581871, "kl": 0.5946541205048561, "learning_rate": 9.999305532678127e-06, "loss": -0.002, "num_tokens": 32103041.0, "reward": 1.043267846107483, "reward_std": 0.2376500964164734, "rewards/rollout_reward_func/mean": 1.043267846107483, "rewards/rollout_reward_func/std": 0.5103288292884827, "sampling/importance_sampling_ratio/max": 0.5541367530822754, "sampling/importance_sampling_ratio/mean": 0.3045516014099121, "sampling/importance_sampling_ratio/min": 1.7552246548563355e-12, "sampling/sampling_logp_difference/max": 4.798336029052734, "sampling/sampling_logp_difference/mean": 0.47839879989624023, "step": 1261, "step_time": 11.060217314006877 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 3.139585554599762, "epoch": 0.01262, "grad_norm": 0.00945686362683773, "kl": 0.5947363413870335, "learning_rate": 9.999304398426211e-06, "loss": -0.002, "step": 1262, "step_time": 5.882454541999323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 581.28125, "completions/mean_terminated_length": 581.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.035849779844284, "epoch": 0.01263, "frac_reward_zero_std": 0.25, "grad_norm": 0.08769596368074417, "kl": 0.6344532072544098, "learning_rate": 9.999303263248869e-06, "loss": 0.0007, "num_tokens": 32161946.0, "reward": 0.26572662591934204, "reward_std": 0.828576922416687, "rewards/rollout_reward_func/mean": 0.26572662591934204, "rewards/rollout_reward_func/std": 1.039527416229248, "sampling/importance_sampling_ratio/max": 0.5550938248634338, "sampling/importance_sampling_ratio/mean": 0.24759569764137268, "sampling/importance_sampling_ratio/min": 0.00025692081544548273, "sampling/sampling_logp_difference/max": 4.028594493865967, "sampling/sampling_logp_difference/mean": 0.5944662094116211, "step": 1263, "step_time": 10.342345187003957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.984247475862503, "epoch": 0.01264, "grad_norm": 0.08305146545171738, "kl": 0.6398759633302689, "learning_rate": 9.999302127146098e-06, "loss": 0.0003, "step": 1264, "step_time": 5.764191720983945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 822.84375, "completions/mean_terminated_length": 822.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7361888885498047, "epoch": 0.01265, "frac_reward_zero_std": 0.25, "grad_norm": 0.10166318714618683, "kl": 0.6150583550333977, "learning_rate": 9.999300990117899e-06, "loss": 0.0031, "num_tokens": 32229380.0, "reward": 0.47678691148757935, "reward_std": 0.4700819253921509, "rewards/rollout_reward_func/mean": 0.47678691148757935, "rewards/rollout_reward_func/std": 1.0276628732681274, "sampling/importance_sampling_ratio/max": 0.5518792867660522, "sampling/importance_sampling_ratio/mean": 0.23185867071151733, "sampling/importance_sampling_ratio/min": 0.0014634558465331793, "sampling/sampling_logp_difference/max": 2.4179129600524902, "sampling/sampling_logp_difference/mean": 0.4925956726074219, "step": 1265, "step_time": 13.68291949002014 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028125000186264515, "entropy": 3.706046849489212, "epoch": 0.01266, "grad_norm": 0.027102656662464142, "kl": 0.6153971552848816, "learning_rate": 9.999299852164274e-06, "loss": 0.0029, "step": 1266, "step_time": 8.160120696004014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 711.90625, "completions/mean_terminated_length": 711.90625, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "entropy": 3.429929345846176, "epoch": 0.01267, "frac_reward_zero_std": 0.25, "grad_norm": 0.022847114130854607, "kl": 0.7224347367882729, "learning_rate": 9.999298713285224e-06, "loss": -0.0063, "num_tokens": 32294475.0, "reward": 1.1662678718566895, "reward_std": 0.014320500195026398, "rewards/rollout_reward_func/mean": 1.1662678718566895, "rewards/rollout_reward_func/std": 0.0526493676006794, "sampling/importance_sampling_ratio/max": 0.31250208616256714, "sampling/importance_sampling_ratio/mean": 0.22580586373806, "sampling/importance_sampling_ratio/min": 0.00030973367393016815, "sampling/sampling_logp_difference/max": 3.757277011871338, "sampling/sampling_logp_difference/mean": 0.45914268493652344, "step": 1267, "step_time": 11.479104525009461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.422880172729492, "epoch": 0.01268, "grad_norm": 0.02079204097390175, "kl": 0.7255577892065048, "learning_rate": 9.999297573480746e-06, "loss": -0.0063, "step": 1268, "step_time": 6.154203321006207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 722.4375, "completions/mean_terminated_length": 722.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3700459599494934, "epoch": 0.01269, "frac_reward_zero_std": 0.25, "grad_norm": 0.20338039100170135, "kl": 0.6440058089792728, "learning_rate": 9.999296432750842e-06, "loss": -0.0084, "num_tokens": 32358111.0, "reward": 1.0009937286376953, "reward_std": 0.5343030691146851, "rewards/rollout_reward_func/mean": 1.0009937286376953, "rewards/rollout_reward_func/std": 0.5924590229988098, "sampling/importance_sampling_ratio/max": 0.550593376159668, "sampling/importance_sampling_ratio/mean": 0.27959129214286804, "sampling/importance_sampling_ratio/min": 6.927198398232193e-11, "sampling/sampling_logp_difference/max": 12.844231605529785, "sampling/sampling_logp_difference/mean": 0.5010073184967041, "step": 1269, "step_time": 14.433727682997414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.3481884598731995, "epoch": 0.0127, "grad_norm": 0.022212179377675056, "kl": 0.6413394212722778, "learning_rate": 9.999295291095512e-06, "loss": -0.0086, "step": 1270, "step_time": 8.080732390000776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1494.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 535.375, "completions/mean_terminated_length": 535.375, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 3.000445544719696, "epoch": 0.01271, "frac_reward_zero_std": 0.25, "grad_norm": 0.011228218674659729, "kl": 0.666195709258318, "learning_rate": 9.999294148514757e-06, "loss": -0.015, "num_tokens": 32416483.0, "reward": 1.1335790157318115, "reward_std": 0.19973266124725342, "rewards/rollout_reward_func/mean": 1.1335790157318115, "rewards/rollout_reward_func/std": 0.3659936487674713, "sampling/importance_sampling_ratio/max": 0.3080841898918152, "sampling/importance_sampling_ratio/mean": 0.2654181122779846, "sampling/importance_sampling_ratio/min": 4.5128108467906713e-05, "sampling/sampling_logp_difference/max": 4.624093532562256, "sampling/sampling_logp_difference/mean": 0.4322129487991333, "step": 1271, "step_time": 11.5417455420029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.995591700077057, "epoch": 0.01272, "grad_norm": 0.010776566341519356, "kl": 0.6674363128840923, "learning_rate": 9.999293005008579e-06, "loss": -0.015, "step": 1272, "step_time": 6.2726675829981104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 144.0625, "completions/mean_terminated_length": 148.19354248046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.874002367258072, "epoch": 0.01273, "frac_reward_zero_std": 0.75, "grad_norm": 0.003806288819760084, "kl": 0.8884757831692696, "learning_rate": 9.999291860576973e-06, "loss": -0.0038, "num_tokens": 32457722.0, "reward": 1.1147948503494263, "reward_std": 0.009188584983348846, "rewards/rollout_reward_func/mean": 1.1147948503494263, "rewards/rollout_reward_func/std": 0.058024343103170395, "sampling/importance_sampling_ratio/max": 0.5549840331077576, "sampling/importance_sampling_ratio/mean": 0.4685605466365814, "sampling/importance_sampling_ratio/min": 6.046168721240974e-08, "sampling/sampling_logp_difference/max": 3.1440601348876953, "sampling/sampling_logp_difference/mean": 0.4329115152359009, "step": 1273, "step_time": 8.045707792014582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.875264286994934, "epoch": 0.01274, "grad_norm": 0.003948383033275604, "kl": 0.8899611458182335, "learning_rate": 9.999290715219945e-06, "loss": -0.0038, "step": 1274, "step_time": 4.248376328010636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 147.78125, "completions/mean_terminated_length": 151.51612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6073333621025085, "epoch": 0.01275, "frac_reward_zero_std": 0.75, "grad_norm": 0.014183158986270428, "kl": 0.7374828159809113, "learning_rate": 9.999289568937491e-06, "loss": -0.0004, "num_tokens": 32498688.0, "reward": 0.3679554760456085, "reward_std": 0.26084989309310913, "rewards/rollout_reward_func/mean": 0.3679554760456085, "rewards/rollout_reward_func/std": 1.0161632299423218, "sampling/importance_sampling_ratio/max": 0.5565465092658997, "sampling/importance_sampling_ratio/mean": 0.4267891049385071, "sampling/importance_sampling_ratio/min": 2.325191449558223e-32, "sampling/sampling_logp_difference/max": 8.493599891662598, "sampling/sampling_logp_difference/mean": 0.8076112270355225, "step": 1275, "step_time": 8.408796106006776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6091860234737396, "epoch": 0.01276, "grad_norm": 0.013834170065820217, "kl": 0.7377726286649704, "learning_rate": 9.999288421729613e-06, "loss": -0.0004, "step": 1276, "step_time": 4.546984796994366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 501.96875, "completions/mean_terminated_length": 501.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.484719693660736, "epoch": 0.01277, "frac_reward_zero_std": 0.5, "grad_norm": 0.07147087156772614, "kl": 0.7450011819601059, "learning_rate": 9.999287273596313e-06, "loss": -0.0027, "num_tokens": 32554973.0, "reward": 0.7850396633148193, "reward_std": 0.49877843260765076, "rewards/rollout_reward_func/mean": 0.7850396633148193, "rewards/rollout_reward_func/std": 0.8258661031723022, "sampling/importance_sampling_ratio/max": 0.5574430227279663, "sampling/importance_sampling_ratio/mean": 0.3260045647621155, "sampling/importance_sampling_ratio/min": 0.0007897971081547439, "sampling/sampling_logp_difference/max": 2.2469708919525146, "sampling/sampling_logp_difference/mean": 0.4859321117401123, "step": 1277, "step_time": 12.233413506015495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.452402889728546, "epoch": 0.01278, "grad_norm": 0.07227550446987152, "kl": 0.746454693377018, "learning_rate": 9.999286124537588e-06, "loss": -0.0029, "step": 1278, "step_time": 7.025804330005485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1233.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.731949418783188, "epoch": 0.01279, "frac_reward_zero_std": 0.75, "grad_norm": 0.038483645766973495, "kl": 0.7953384593129158, "learning_rate": 9.999284974553441e-06, "loss": -0.0028, "num_tokens": 32595684.0, "reward": 1.063894271850586, "reward_std": 0.18101729452610016, "rewards/rollout_reward_func/mean": 1.063894271850586, "rewards/rollout_reward_func/std": 0.351077675819397, "sampling/importance_sampling_ratio/max": 0.5545694828033447, "sampling/importance_sampling_ratio/mean": 0.47967469692230225, "sampling/importance_sampling_ratio/min": 0.06482156366109848, "sampling/sampling_logp_difference/max": 0.6458892226219177, "sampling/sampling_logp_difference/mean": 0.30970633029937744, "step": 1279, "step_time": 9.87201490400912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7255031168460846, "epoch": 0.0128, "grad_norm": 0.036438506096601486, "kl": 0.7956036254763603, "learning_rate": 9.99928382364387e-06, "loss": -0.0028, "step": 1280, "step_time": 5.651517872007389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 408.5625, "completions/mean_terminated_length": 408.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.415103316307068, "epoch": 0.01281, "frac_reward_zero_std": 0.25, "grad_norm": 0.0218973346054554, "kl": 0.6177344918251038, "learning_rate": 9.999282671808878e-06, "loss": -0.0136, "num_tokens": 32648203.0, "reward": 0.5665731430053711, "reward_std": 0.20614641904830933, "rewards/rollout_reward_func/mean": 0.5665731430053711, "rewards/rollout_reward_func/std": 0.9678769111633301, "sampling/importance_sampling_ratio/max": 0.5560244917869568, "sampling/importance_sampling_ratio/mean": 0.2974797785282135, "sampling/importance_sampling_ratio/min": 6.778521566275231e-08, "sampling/sampling_logp_difference/max": 4.127180576324463, "sampling/sampling_logp_difference/mean": 0.5126261711120605, "step": 1281, "step_time": 9.013231684999482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 3.4193469285964966, "epoch": 0.01282, "grad_norm": 0.023147275671362877, "kl": 0.6297350786626339, "learning_rate": 9.999281519048462e-06, "loss": -0.0136, "step": 1282, "step_time": 4.916984390991274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 413.9375, "completions/mean_terminated_length": 413.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.768548369407654, "epoch": 0.01283, "frac_reward_zero_std": 0.75, "grad_norm": 0.002935668919235468, "kl": 0.625603623688221, "learning_rate": 9.999280365362623e-06, "loss": -0.0043, "num_tokens": 32700282.0, "reward": 1.1771156787872314, "reward_std": 0.008539523929357529, "rewards/rollout_reward_func/mean": 1.1771156787872314, "rewards/rollout_reward_func/std": 0.04659620299935341, "sampling/importance_sampling_ratio/max": 0.5510970950126648, "sampling/importance_sampling_ratio/mean": 0.34991347789764404, "sampling/importance_sampling_ratio/min": 0.0032363850623369217, "sampling/sampling_logp_difference/max": 4.2391157150268555, "sampling/sampling_logp_difference/mean": 0.33475738763809204, "step": 1283, "step_time": 8.88250177998998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.775565266609192, "epoch": 0.01284, "grad_norm": 0.0021415380761027336, "kl": 0.6222000122070312, "learning_rate": 9.999279210751366e-06, "loss": -0.0043, "step": 1284, "step_time": 4.855995789999724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1263.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 414.125, "completions/mean_terminated_length": 414.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0488266050815582, "epoch": 0.01285, "frac_reward_zero_std": 0.5, "grad_norm": 0.010724101215600967, "kl": 0.7648951485753059, "learning_rate": 9.999278055214684e-06, "loss": -0.0078, "num_tokens": 32752213.0, "reward": 0.6192950010299683, "reward_std": 0.016198281198740005, "rewards/rollout_reward_func/mean": 0.6192950010299683, "rewards/rollout_reward_func/std": 0.9833602905273438, "sampling/importance_sampling_ratio/max": 0.5544473528862, "sampling/importance_sampling_ratio/mean": 0.3609994947910309, "sampling/importance_sampling_ratio/min": 9.796237154802157e-10, "sampling/sampling_logp_difference/max": 13.217673301696777, "sampling/sampling_logp_difference/mean": 0.49425119161605835, "step": 1285, "step_time": 10.384570341004292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0323448479175568, "epoch": 0.01286, "grad_norm": 0.010533376596868038, "kl": 0.7679695785045624, "learning_rate": 9.999276898752583e-06, "loss": -0.0077, "step": 1286, "step_time": 5.773875753984612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 652.71875, "completions/mean_terminated_length": 652.3225708007812, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "entropy": 3.7467799186706543, "epoch": 0.01287, "frac_reward_zero_std": 0.5, "grad_norm": 0.023471424356102943, "kl": 0.7796811237931252, "learning_rate": 9.99927574136506e-06, "loss": -0.0025, "num_tokens": 32814242.0, "reward": 0.2505175769329071, "reward_std": 0.2670941650867462, "rewards/rollout_reward_func/mean": 0.2505175769329071, "rewards/rollout_reward_func/std": 1.0845060348510742, "sampling/importance_sampling_ratio/max": 0.3048003911972046, "sampling/importance_sampling_ratio/mean": 0.22628457844257355, "sampling/importance_sampling_ratio/min": 2.0344628132634495e-15, "sampling/sampling_logp_difference/max": 4.451864242553711, "sampling/sampling_logp_difference/mean": 0.7035397291183472, "step": 1287, "step_time": 10.898227973993926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7389815151691437, "epoch": 0.01288, "grad_norm": 0.022208232432603836, "kl": 0.7797360047698021, "learning_rate": 9.999274583052117e-06, "loss": -0.0026, "step": 1288, "step_time": 5.964237455998955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 578.90625, "completions/mean_terminated_length": 578.90625, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 2.824742615222931, "epoch": 0.01289, "frac_reward_zero_std": 0.25, "grad_norm": 0.11189692467451096, "kl": 0.6073242127895355, "learning_rate": 9.999273423813754e-06, "loss": -0.0106, "num_tokens": 32874497.0, "reward": 1.1976957321166992, "reward_std": 0.018998805433511734, "rewards/rollout_reward_func/mean": 1.1976957321166992, "rewards/rollout_reward_func/std": 0.024948103353381157, "sampling/importance_sampling_ratio/max": 0.30975380539894104, "sampling/importance_sampling_ratio/mean": 0.27961695194244385, "sampling/importance_sampling_ratio/min": 0.002446367172524333, "sampling/sampling_logp_difference/max": 4.563121795654297, "sampling/sampling_logp_difference/mean": 0.36104801297187805, "step": 1289, "step_time": 9.386472272009996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.8157701194286346, "epoch": 0.0129, "grad_norm": 0.011180497705936432, "kl": 0.6080802530050278, "learning_rate": 9.99927226364997e-06, "loss": -0.0109, "step": 1290, "step_time": 4.606258624015027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 1129.1875, "completions/mean_terminated_length": 1129.1875, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "entropy": 3.8123765885829926, "epoch": 0.01291, "frac_reward_zero_std": 0.0, "grad_norm": 0.047646068036556244, "kl": 0.5483686327934265, "learning_rate": 9.999271102560767e-06, "loss": -0.0037, "num_tokens": 32952601.0, "reward": 0.6961705684661865, "reward_std": 0.40789079666137695, "rewards/rollout_reward_func/mean": 0.6961705684661865, "rewards/rollout_reward_func/std": 0.9784097671508789, "sampling/importance_sampling_ratio/max": 0.3081539571285248, "sampling/importance_sampling_ratio/mean": 0.14616823196411133, "sampling/importance_sampling_ratio/min": 6.418150516260268e-13, "sampling/sampling_logp_difference/max": 13.394968032836914, "sampling/sampling_logp_difference/mean": 0.6221097707748413, "step": 1291, "step_time": 14.421631359000457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8336017429828644, "epoch": 0.01292, "grad_norm": 0.047230396419763565, "kl": 0.5477925203740597, "learning_rate": 9.999269940546145e-06, "loss": -0.0038, "step": 1292, "step_time": 7.833934528003738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 561.8125, "completions/mean_terminated_length": 579.4193115234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.086567312479019, "epoch": 0.01293, "frac_reward_zero_std": 0.25, "grad_norm": 0.03889197111129761, "kl": 0.6250029914081097, "learning_rate": 9.999268777606102e-06, "loss": -0.0069, "num_tokens": 33009276.0, "reward": 0.22136643528938293, "reward_std": 0.273702472448349, "rewards/rollout_reward_func/mean": 0.22136643528938293, "rewards/rollout_reward_func/std": 1.0247668027877808, "sampling/importance_sampling_ratio/max": 0.5565537214279175, "sampling/importance_sampling_ratio/mean": 0.29484492540359497, "sampling/importance_sampling_ratio/min": 7.285135561756369e-15, "sampling/sampling_logp_difference/max": 4.293691635131836, "sampling/sampling_logp_difference/mean": 0.7527837753295898, "step": 1293, "step_time": 14.171809341998596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.135270029306412, "epoch": 0.01294, "grad_norm": 0.036776043474674225, "kl": 0.6211668401956558, "learning_rate": 9.999267613740642e-06, "loss": -0.0069, "step": 1294, "step_time": 7.913762712996686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1394.0, "completions/max_terminated_length": 1394.0, "completions/mean_length": 585.96875, "completions/mean_terminated_length": 585.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.720615476369858, "epoch": 0.01295, "frac_reward_zero_std": 0.5, "grad_norm": 0.020440390333533287, "kl": 0.6496164202690125, "learning_rate": 9.999266448949762e-06, "loss": -0.003, "num_tokens": 33066874.0, "reward": 1.1238954067230225, "reward_std": 0.21046702563762665, "rewards/rollout_reward_func/mean": 1.1238954067230225, "rewards/rollout_reward_func/std": 0.37931108474731445, "sampling/importance_sampling_ratio/max": 0.5551775097846985, "sampling/importance_sampling_ratio/mean": 0.30894407629966736, "sampling/importance_sampling_ratio/min": 5.934079310691502e-23, "sampling/sampling_logp_difference/max": 12.637322425842285, "sampling/sampling_logp_difference/mean": 0.8753979206085205, "step": 1295, "step_time": 11.00353747299232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.742523819208145, "epoch": 0.01296, "grad_norm": 0.021360883489251137, "kl": 0.6471819877624512, "learning_rate": 9.999265283233466e-06, "loss": -0.003, "step": 1296, "step_time": 6.013843752989487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 434.875, "completions/mean_terminated_length": 448.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8740906417369843, "epoch": 0.01297, "frac_reward_zero_std": 0.75, "grad_norm": 0.0022132261656224728, "kl": 0.6772407405078411, "learning_rate": 9.99926411659175e-06, "loss": -0.0037, "num_tokens": 33121150.0, "reward": 0.6174895763397217, "reward_std": 0.009188584983348846, "rewards/rollout_reward_func/mean": 0.6174895763397217, "rewards/rollout_reward_func/std": 0.9439793229103088, "sampling/importance_sampling_ratio/max": 0.5538241267204285, "sampling/importance_sampling_ratio/mean": 0.3449348211288452, "sampling/importance_sampling_ratio/min": 5.3192517057709665e-09, "sampling/sampling_logp_difference/max": 3.296802282333374, "sampling/sampling_logp_difference/mean": 0.4156562089920044, "step": 1297, "step_time": 8.647067817997595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.881764978170395, "epoch": 0.01298, "grad_norm": 0.00227533420547843, "kl": 0.6771100014448166, "learning_rate": 9.999262949024617e-06, "loss": -0.0037, "step": 1298, "step_time": 4.489414374002081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 352.0625, "completions/mean_terminated_length": 352.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.1695201098918915, "epoch": 0.01299, "frac_reward_zero_std": 0.75, "grad_norm": 0.053897675126791, "kl": 0.7365002557635307, "learning_rate": 9.999261780532066e-06, "loss": -0.0002, "num_tokens": 33169683.0, "reward": 1.159867763519287, "reward_std": 0.010234735906124115, "rewards/rollout_reward_func/mean": 1.159867763519287, "rewards/rollout_reward_func/std": 0.060543447732925415, "sampling/importance_sampling_ratio/max": 0.5534309148788452, "sampling/importance_sampling_ratio/mean": 0.37134623527526855, "sampling/importance_sampling_ratio/min": 0.028191139921545982, "sampling/sampling_logp_difference/max": 2.154263734817505, "sampling/sampling_logp_difference/mean": 0.39493316411972046, "step": 1299, "step_time": 9.953258442008519 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000465661287, "entropy": 3.2035609781742096, "epoch": 0.013, "grad_norm": 0.03855770826339722, "kl": 0.7318961918354034, "learning_rate": 9.999260611114098e-06, "loss": -0.0002, "step": 1300, "step_time": 6.118536407993815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 899.40625, "completions/mean_terminated_length": 899.40625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 4.325098961591721, "epoch": 0.01301, "frac_reward_zero_std": 0.0, "grad_norm": 0.01054164208471775, "kl": 0.5947145819664001, "learning_rate": 9.999259440770715e-06, "loss": -0.0073, "num_tokens": 33240074.0, "reward": -0.07068991661071777, "reward_std": 0.3649655878543854, "rewards/rollout_reward_func/mean": -0.07068991661071777, "rewards/rollout_reward_func/std": 1.070626974105835, "sampling/importance_sampling_ratio/max": 0.3091825842857361, "sampling/importance_sampling_ratio/mean": 0.13881906867027283, "sampling/importance_sampling_ratio/min": 8.461479841581617e-24, "sampling/sampling_logp_difference/max": 12.916906356811523, "sampling/sampling_logp_difference/mean": 0.9266819357872009, "step": 1301, "step_time": 12.068225409981096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.3513572216033936, "epoch": 0.01302, "grad_norm": 0.012022413313388824, "kl": 0.6015691943466663, "learning_rate": 9.999258269501912e-06, "loss": -0.0072, "step": 1302, "step_time": 6.288579683015996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 626.28125, "completions/mean_terminated_length": 626.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.507447361946106, "epoch": 0.01303, "frac_reward_zero_std": 0.75, "grad_norm": 0.01981724239885807, "kl": 0.7181848958134651, "learning_rate": 9.999257097307696e-06, "loss": -0.0017, "num_tokens": 33300673.0, "reward": 1.067481279373169, "reward_std": 0.21625612676143646, "rewards/rollout_reward_func/mean": 1.067481279373169, "rewards/rollout_reward_func/std": 0.4332655668258667, "sampling/importance_sampling_ratio/max": 0.5551619529724121, "sampling/importance_sampling_ratio/mean": 0.272465318441391, "sampling/importance_sampling_ratio/min": 1.4237914001569152e-05, "sampling/sampling_logp_difference/max": 2.4228386878967285, "sampling/sampling_logp_difference/mean": 0.5046770572662354, "step": 1303, "step_time": 11.723448165001173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.571467876434326, "epoch": 0.01304, "grad_norm": 0.021531058475375175, "kl": 0.718054473400116, "learning_rate": 9.999255924188063e-06, "loss": -0.0017, "step": 1304, "step_time": 6.662261311001203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7609971165657043, "epoch": 0.01305, "frac_reward_zero_std": 0.75, "grad_norm": 0.07329759746789932, "kl": 0.7430065423250198, "learning_rate": 9.999254750143012e-06, "loss": -0.0003, "num_tokens": 33346941.0, "reward": 0.6359502673149109, "reward_std": 0.001237937482073903, "rewards/rollout_reward_func/mean": 0.6359502673149109, "rewards/rollout_reward_func/std": 0.9562646746635437, "sampling/importance_sampling_ratio/max": 0.5553195476531982, "sampling/importance_sampling_ratio/mean": 0.4129287004470825, "sampling/importance_sampling_ratio/min": 0.2963893413543701, "sampling/sampling_logp_difference/max": 0.9729440808296204, "sampling/sampling_logp_difference/mean": 0.3129761815071106, "step": 1305, "step_time": 8.258919694002543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7821497321128845, "epoch": 0.01306, "grad_norm": 0.08473219722509384, "kl": 0.7402011677622795, "learning_rate": 9.99925357517255e-06, "loss": -0.0005, "step": 1306, "step_time": 4.054478150996147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 876.28125, "completions/mean_terminated_length": 876.28125, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 3.624102383852005, "epoch": 0.01307, "frac_reward_zero_std": 0.25, "grad_norm": 0.02829371951520443, "kl": 0.7061797082424164, "learning_rate": 9.999252399276669e-06, "loss": -0.0116, "num_tokens": 33417634.0, "reward": 0.6543333530426025, "reward_std": 0.04399186745285988, "rewards/rollout_reward_func/mean": 0.6543333530426025, "rewards/rollout_reward_func/std": 0.9326061606407166, "sampling/importance_sampling_ratio/max": 0.30668479204177856, "sampling/importance_sampling_ratio/mean": 0.1999737024307251, "sampling/importance_sampling_ratio/min": 7.23517036213539e-12, "sampling/sampling_logp_difference/max": 4.726815700531006, "sampling/sampling_logp_difference/mean": 0.6067665815353394, "step": 1307, "step_time": 15.132952911008033 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 3.665765792131424, "epoch": 0.01308, "grad_norm": 0.023080920800566673, "kl": 0.7018391378223896, "learning_rate": 9.999251222455376e-06, "loss": -0.0116, "step": 1308, "step_time": 8.270434401994862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 562.96875, "completions/mean_terminated_length": 562.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2108359336853027, "epoch": 0.01309, "frac_reward_zero_std": 0.25, "grad_norm": 0.07373908907175064, "kl": 0.7793353237211704, "learning_rate": 9.999250044708666e-06, "loss": -0.0002, "num_tokens": 33475052.0, "reward": 0.7888104319572449, "reward_std": 0.555090069770813, "rewards/rollout_reward_func/mean": 0.7888104319572449, "rewards/rollout_reward_func/std": 0.8513611555099487, "sampling/importance_sampling_ratio/max": 0.553203284740448, "sampling/importance_sampling_ratio/mean": 0.2948582172393799, "sampling/importance_sampling_ratio/min": 7.03161695536636e-11, "sampling/sampling_logp_difference/max": 4.617745876312256, "sampling/sampling_logp_difference/mean": 0.477647989988327, "step": 1309, "step_time": 10.356547631003195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.234024852514267, "epoch": 0.0131, "grad_norm": 0.0704222172498703, "kl": 0.7802473530173302, "learning_rate": 9.999248866036543e-06, "loss": -0.0004, "step": 1310, "step_time": 5.727134946988372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 1253.1875, "completions/mean_terminated_length": 1253.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.568177729845047, "epoch": 0.01311, "frac_reward_zero_std": 0.25, "grad_norm": 0.010993271134793758, "kl": 0.4987572878599167, "learning_rate": 9.999247686439005e-06, "loss": 0.0027, "num_tokens": 33556484.0, "reward": 0.6006526947021484, "reward_std": 0.6706272959709167, "rewards/rollout_reward_func/mean": 0.6006526947021484, "rewards/rollout_reward_func/std": 0.9555756449699402, "sampling/importance_sampling_ratio/max": 0.547295093536377, "sampling/importance_sampling_ratio/mean": 0.15172535181045532, "sampling/importance_sampling_ratio/min": 1.662730228262283e-15, "sampling/sampling_logp_difference/max": 12.401267051696777, "sampling/sampling_logp_difference/mean": 0.8955048322677612, "step": 1311, "step_time": 18.000086710009782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.601912975311279, "epoch": 0.01312, "grad_norm": 0.011685139499604702, "kl": 0.4975172318518162, "learning_rate": 9.999246505916055e-06, "loss": 0.0027, "step": 1312, "step_time": 9.584123494998494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 278.5625, "completions/mean_terminated_length": 264.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.640408158302307, "epoch": 0.01313, "frac_reward_zero_std": 0.75, "grad_norm": 0.010506970807909966, "kl": 0.7595948502421379, "learning_rate": 9.99924532446769e-06, "loss": 0.0002, "num_tokens": 33602473.0, "reward": 0.9974582195281982, "reward_std": 0.26018935441970825, "rewards/rollout_reward_func/mean": 0.9974582195281982, "rewards/rollout_reward_func/std": 0.5275915265083313, "sampling/importance_sampling_ratio/max": 0.5533031225204468, "sampling/importance_sampling_ratio/mean": 0.41325873136520386, "sampling/importance_sampling_ratio/min": 3.7920530156774603e-17, "sampling/sampling_logp_difference/max": 4.262429714202881, "sampling/sampling_logp_difference/mean": 0.6624701023101807, "step": 1313, "step_time": 10.507193046985776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.672273725271225, "epoch": 0.01314, "grad_norm": 0.009487641043961048, "kl": 0.7524648234248161, "learning_rate": 9.999244142093913e-06, "loss": 0.0002, "step": 1314, "step_time": 5.912788705994899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 736.5625, "completions/mean_terminated_length": 736.5625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 4.569658279418945, "epoch": 0.01315, "frac_reward_zero_std": 0.0, "grad_norm": 0.02054576389491558, "kl": 0.5652014538645744, "learning_rate": 9.999242958794724e-06, "loss": -0.0107, "num_tokens": 33667465.0, "reward": 0.866519570350647, "reward_std": 0.5453118681907654, "rewards/rollout_reward_func/mean": 0.866519570350647, "rewards/rollout_reward_func/std": 0.7901740670204163, "sampling/importance_sampling_ratio/max": 0.3051181435585022, "sampling/importance_sampling_ratio/mean": 0.13136157393455505, "sampling/importance_sampling_ratio/min": 5.746703460318692e-12, "sampling/sampling_logp_difference/max": 13.633569717407227, "sampling/sampling_logp_difference/mean": 0.8673987984657288, "step": 1315, "step_time": 12.916146708004817 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.587899476289749, "epoch": 0.01316, "grad_norm": 0.020242638885974884, "kl": 0.5654572546482086, "learning_rate": 9.999241774570122e-06, "loss": -0.0107, "step": 1316, "step_time": 7.3784073579954566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 304.4375, "completions/mean_terminated_length": 292.2257995605469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3068936467170715, "epoch": 0.01317, "frac_reward_zero_std": 0.5, "grad_norm": 0.024294579401612282, "kl": 0.774366021156311, "learning_rate": 9.999240589420108e-06, "loss": -0.0053, "num_tokens": 33716185.0, "reward": 1.0794823169708252, "reward_std": 0.20155511796474457, "rewards/rollout_reward_func/mean": 1.0794823169708252, "rewards/rollout_reward_func/std": 0.3910870850086212, "sampling/importance_sampling_ratio/max": 0.5527845621109009, "sampling/importance_sampling_ratio/mean": 0.36572086811065674, "sampling/importance_sampling_ratio/min": 1.6461770835363154e-10, "sampling/sampling_logp_difference/max": 4.254868507385254, "sampling/sampling_logp_difference/mean": 0.4966196119785309, "step": 1317, "step_time": 10.347567135991994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.301914095878601, "epoch": 0.01318, "grad_norm": 0.023527219891548157, "kl": 0.7741450816392899, "learning_rate": 9.999239403344681e-06, "loss": -0.0053, "step": 1318, "step_time": 5.703967839996039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 402.3125, "completions/mean_terminated_length": 402.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9142305850982666, "epoch": 0.01319, "frac_reward_zero_std": 0.75, "grad_norm": 0.0025113483425229788, "kl": 0.735607348382473, "learning_rate": 9.999238216343842e-06, "loss": -0.0036, "num_tokens": 33767437.0, "reward": 1.1473662853240967, "reward_std": 0.009188584983348846, "rewards/rollout_reward_func/mean": 1.1473662853240967, "rewards/rollout_reward_func/std": 0.053435202687978745, "sampling/importance_sampling_ratio/max": 0.5509679913520813, "sampling/importance_sampling_ratio/mean": 0.33109647035598755, "sampling/importance_sampling_ratio/min": 0.0003143785579595715, "sampling/sampling_logp_difference/max": 4.7425642013549805, "sampling/sampling_logp_difference/mean": 0.37878310680389404, "step": 1319, "step_time": 9.017974572983803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9123444855213165, "epoch": 0.0132, "grad_norm": 0.00255004083737731, "kl": 0.7353968769311905, "learning_rate": 9.999237028417591e-06, "loss": -0.0036, "step": 1320, "step_time": 4.479912874994625 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.03125, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 407.375, "completions/mean_terminated_length": 420.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.383341759443283, "epoch": 0.01321, "frac_reward_zero_std": 0.25, "grad_norm": 0.02619532123208046, "kl": 0.5595087446272373, "learning_rate": 9.999235839565933e-06, "loss": -0.008, "num_tokens": 33819804.0, "reward": 0.8855637311935425, "reward_std": 0.31715822219848633, "rewards/rollout_reward_func/mean": 0.8855637311935425, "rewards/rollout_reward_func/std": 0.7315164804458618, "sampling/importance_sampling_ratio/max": 0.5540717840194702, "sampling/importance_sampling_ratio/mean": 0.3566696047782898, "sampling/importance_sampling_ratio/min": 3.4854385003169064e-12, "sampling/sampling_logp_difference/max": 11.27425765991211, "sampling/sampling_logp_difference/mean": 0.623454213142395, "step": 1321, "step_time": 10.747514744005457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.3698358833789825, "epoch": 0.01322, "grad_norm": 0.1675124317407608, "kl": 0.5568798743188381, "learning_rate": 9.999234649788861e-06, "loss": -0.008, "step": 1322, "step_time": 6.37011461599468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 940.84375, "completions/mean_terminated_length": 940.84375, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "entropy": 4.723186194896698, "epoch": 0.01323, "frac_reward_zero_std": 0.0, "grad_norm": 0.024697553366422653, "kl": 0.6137414649128914, "learning_rate": 9.99923345908638e-06, "loss": -0.0037, "num_tokens": 33891737.0, "reward": 0.7102590799331665, "reward_std": 0.7365374565124512, "rewards/rollout_reward_func/mean": 0.7102590799331665, "rewards/rollout_reward_func/std": 0.9223761558532715, "sampling/importance_sampling_ratio/max": 0.3001182973384857, "sampling/importance_sampling_ratio/mean": 0.1073511391878128, "sampling/importance_sampling_ratio/min": 2.2110884234649575e-09, "sampling/sampling_logp_difference/max": 4.170507907867432, "sampling/sampling_logp_difference/mean": 0.8277572989463806, "step": 1323, "step_time": 13.187789388990495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.7239105105400085, "epoch": 0.01324, "grad_norm": 0.0254000686109066, "kl": 0.6096002534031868, "learning_rate": 9.999232267458488e-06, "loss": -0.0037, "step": 1324, "step_time": 7.891151438001543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 637.3125, "completions/mean_terminated_length": 637.3125, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "entropy": 4.36134260892868, "epoch": 0.01325, "frac_reward_zero_std": 0.5, "grad_norm": 0.06993799656629562, "kl": 0.6764585897326469, "learning_rate": 9.999231074905187e-06, "loss": -0.0001, "num_tokens": 33955007.0, "reward": 0.684753954410553, "reward_std": 0.19595158100128174, "rewards/rollout_reward_func/mean": 0.684753954410553, "rewards/rollout_reward_func/std": 0.9364526867866516, "sampling/importance_sampling_ratio/max": 0.3102816939353943, "sampling/importance_sampling_ratio/mean": 0.17308464646339417, "sampling/importance_sampling_ratio/min": 2.1057630821480012e-16, "sampling/sampling_logp_difference/max": 4.830142021179199, "sampling/sampling_logp_difference/mean": 0.7544593214988708, "step": 1325, "step_time": 9.91377285100316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.387347608804703, "epoch": 0.01326, "grad_norm": 0.06290081888437271, "kl": 0.661796074360609, "learning_rate": 9.999229881426476e-06, "loss": -0.0004, "step": 1326, "step_time": 5.314172121012234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1247.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 526.1875, "completions/mean_terminated_length": 526.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.851995885372162, "epoch": 0.01327, "frac_reward_zero_std": 0.5, "grad_norm": 0.016546905040740967, "kl": 0.5787493214011192, "learning_rate": 9.999228687022356e-06, "loss": -0.009, "num_tokens": 34010256.0, "reward": 0.5594481229782104, "reward_std": 0.205445796251297, "rewards/rollout_reward_func/mean": 0.5594481229782104, "rewards/rollout_reward_func/std": 0.9839579463005066, "sampling/importance_sampling_ratio/max": 0.5509777069091797, "sampling/importance_sampling_ratio/mean": 0.26345694065093994, "sampling/importance_sampling_ratio/min": 2.1063187357614e-15, "sampling/sampling_logp_difference/max": 4.741222858428955, "sampling/sampling_logp_difference/mean": 0.6728525161743164, "step": 1327, "step_time": 10.667620230015018 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 3.8706952929496765, "epoch": 0.01328, "grad_norm": 0.011042443104088306, "kl": 0.5757864117622375, "learning_rate": 9.999227491692825e-06, "loss": -0.0091, "step": 1328, "step_time": 6.196361851012625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 532.65625, "completions/mean_terminated_length": 529.9354858398438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.513455420732498, "epoch": 0.01329, "frac_reward_zero_std": 0.25, "grad_norm": 0.052860114723443985, "kl": 0.7063988298177719, "learning_rate": 9.999226295437887e-06, "loss": -0.0045, "num_tokens": 34068133.0, "reward": 0.9710697531700134, "reward_std": 0.2949986755847931, "rewards/rollout_reward_func/mean": 0.9710697531700134, "rewards/rollout_reward_func/std": 0.6087093353271484, "sampling/importance_sampling_ratio/max": 0.5540809035301208, "sampling/importance_sampling_ratio/mean": 0.27170395851135254, "sampling/importance_sampling_ratio/min": 2.298887302742969e-09, "sampling/sampling_logp_difference/max": 4.657676696777344, "sampling/sampling_logp_difference/mean": 0.5751094222068787, "step": 1329, "step_time": 11.112313759003882 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 3.5246107280254364, "epoch": 0.0133, "grad_norm": 0.041942890733480453, "kl": 0.6834303326904774, "learning_rate": 9.99922509825754e-06, "loss": -0.0046, "step": 1330, "step_time": 6.685711550002452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 279.1875, "completions/mean_terminated_length": 287.6773986816406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.068877935409546, "epoch": 0.01331, "frac_reward_zero_std": 0.5, "grad_norm": 0.012314396910369396, "kl": 0.7184971049427986, "learning_rate": 9.999223900151786e-06, "loss": -0.0064, "num_tokens": 34114139.0, "reward": 0.4559982120990753, "reward_std": 0.2636905014514923, "rewards/rollout_reward_func/mean": 0.4559982120990753, "rewards/rollout_reward_func/std": 1.0248945951461792, "sampling/importance_sampling_ratio/max": 0.5550054907798767, "sampling/importance_sampling_ratio/mean": 0.4036515951156616, "sampling/importance_sampling_ratio/min": 3.535686334635942e-18, "sampling/sampling_logp_difference/max": 3.7410521507263184, "sampling/sampling_logp_difference/mean": 0.7506899237632751, "step": 1331, "step_time": 14.431483637003112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.0665008425712585, "epoch": 0.01332, "grad_norm": 0.011311294510960579, "kl": 0.7178368121385574, "learning_rate": 9.999222701120623e-06, "loss": -0.0064, "step": 1332, "step_time": 8.43455576201086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 433.84375, "completions/mean_terminated_length": 433.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.743567645549774, "epoch": 0.01333, "frac_reward_zero_std": 0.75, "grad_norm": 0.00436358992010355, "kl": 0.8179990872740746, "learning_rate": 9.999221501164055e-06, "loss": 0.0004, "num_tokens": 34166319.0, "reward": 0.39849525690078735, "reward_std": 0.28806230425834656, "rewards/rollout_reward_func/mean": 0.39849525690078735, "rewards/rollout_reward_func/std": 1.004591941833496, "sampling/importance_sampling_ratio/max": 0.5576072335243225, "sampling/importance_sampling_ratio/mean": 0.3423807621002197, "sampling/importance_sampling_ratio/min": 9.03880427358672e-05, "sampling/sampling_logp_difference/max": 2.8233165740966797, "sampling/sampling_logp_difference/mean": 0.541135311126709, "step": 1333, "step_time": 10.94399191600678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7487922608852386, "epoch": 0.01334, "grad_norm": 0.004299425520002842, "kl": 0.8174476027488708, "learning_rate": 9.999220300282077e-06, "loss": 0.0004, "step": 1334, "step_time": 6.499574024004687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1900.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 781.625, "completions/mean_terminated_length": 781.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.392789453268051, "epoch": 0.01335, "frac_reward_zero_std": 0.5, "grad_norm": 0.03445128723978996, "kl": 0.6442221626639366, "learning_rate": 9.999219098474695e-06, "loss": -0.0036, "num_tokens": 34230119.0, "reward": 0.7920569777488708, "reward_std": 0.5754367113113403, "rewards/rollout_reward_func/mean": 0.7920569777488708, "rewards/rollout_reward_func/std": 0.8814345002174377, "sampling/importance_sampling_ratio/max": 0.5545850992202759, "sampling/importance_sampling_ratio/mean": 0.18700265884399414, "sampling/importance_sampling_ratio/min": 7.703383744228631e-05, "sampling/sampling_logp_difference/max": 2.8319146633148193, "sampling/sampling_logp_difference/mean": 0.6465581655502319, "step": 1335, "step_time": 12.406241107993992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.391127586364746, "epoch": 0.01336, "grad_norm": 0.03702083230018616, "kl": 0.6394756659865379, "learning_rate": 9.999217895741903e-06, "loss": -0.0036, "step": 1336, "step_time": 7.2882044649959425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1401.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 447.40625, "completions/mean_terminated_length": 461.32257080078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.632353127002716, "epoch": 0.01337, "frac_reward_zero_std": 0.25, "grad_norm": 0.11306525766849518, "kl": 0.519349243491888, "learning_rate": 9.999216692083707e-06, "loss": 0.002, "num_tokens": 34282227.0, "reward": 0.4119694232940674, "reward_std": 0.45183664560317993, "rewards/rollout_reward_func/mean": 0.4119694232940674, "rewards/rollout_reward_func/std": 1.0649967193603516, "sampling/importance_sampling_ratio/max": 0.5565446615219116, "sampling/importance_sampling_ratio/mean": 0.24718128144741058, "sampling/importance_sampling_ratio/min": 1.0052101837541849e-13, "sampling/sampling_logp_difference/max": 4.465713024139404, "sampling/sampling_logp_difference/mean": 0.8703786134719849, "step": 1337, "step_time": 10.422626515988668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.031250000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 4.706116825342178, "epoch": 0.01338, "grad_norm": 0.059449102729558945, "kl": 0.5113303549587727, "learning_rate": 9.999215487500103e-06, "loss": 0.0016, "step": 1338, "step_time": 5.898182912998891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 311.4375, "completions/mean_terminated_length": 303.19354248046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4877823889255524, "epoch": 0.01339, "frac_reward_zero_std": 0.5, "grad_norm": 0.10085315257310867, "kl": 0.613537922501564, "learning_rate": 9.999214281991096e-06, "loss": -0.0072, "num_tokens": 34331993.0, "reward": 1.0789563655853271, "reward_std": 0.2073177993297577, "rewards/rollout_reward_func/mean": 1.0789563655853271, "rewards/rollout_reward_func/std": 0.37952113151550293, "sampling/importance_sampling_ratio/max": 0.5556120872497559, "sampling/importance_sampling_ratio/mean": 0.36617711186408997, "sampling/importance_sampling_ratio/min": 6.589202382507875e-13, "sampling/sampling_logp_difference/max": 4.94700288772583, "sampling/sampling_logp_difference/mean": 0.6056835651397705, "step": 1339, "step_time": 10.167633398014004 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.5327104926109314, "epoch": 0.0134, "grad_norm": 0.08542221039533615, "kl": 0.6081870719790459, "learning_rate": 9.999213075556682e-06, "loss": -0.0075, "step": 1340, "step_time": 5.744565900989983 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "completions/clipped_ratio": 0.0, "completions/max_length": 2866.0, "completions/max_terminated_length": 2866.0, "completions/mean_length": 762.90625, "completions/mean_terminated_length": 762.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.419985711574554, "epoch": 0.01341, "frac_reward_zero_std": 0.25, "grad_norm": 0.024816138669848442, "kl": 0.6344050988554955, "learning_rate": 9.999211868196863e-06, "loss": -0.006, "num_tokens": 34396631.0, "reward": 1.132725715637207, "reward_std": 0.223396897315979, "rewards/rollout_reward_func/mean": 1.132725715637207, "rewards/rollout_reward_func/std": 0.41196832060813904, "sampling/importance_sampling_ratio/max": 0.5557463765144348, "sampling/importance_sampling_ratio/mean": 0.22627432644367218, "sampling/importance_sampling_ratio/min": 3.6991213262990226e-20, "sampling/sampling_logp_difference/max": 9.431769371032715, "sampling/sampling_logp_difference/mean": 0.7675765752792358, "step": 1341, "step_time": 15.436559060006402 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "entropy": 4.435267001390457, "epoch": 0.01342, "grad_norm": 0.02642168290913105, "kl": 0.6348734945058823, "learning_rate": 9.999210659911638e-06, "loss": -0.0059, "step": 1342, "step_time": 9.070061156999145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2406.0, "completions/max_terminated_length": 2406.0, "completions/mean_length": 930.125, "completions/mean_terminated_length": 930.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.655524671077728, "epoch": 0.01343, "frac_reward_zero_std": 0.5, "grad_norm": 0.010783880017697811, "kl": 0.5461276024580002, "learning_rate": 9.999209450701009e-06, "loss": -0.0015, "num_tokens": 34466360.0, "reward": 0.5968979597091675, "reward_std": 0.24821163713932037, "rewards/rollout_reward_func/mean": 0.5968979597091675, "rewards/rollout_reward_func/std": 0.9783959984779358, "sampling/importance_sampling_ratio/max": 0.5473648905754089, "sampling/importance_sampling_ratio/mean": 0.1948995292186737, "sampling/importance_sampling_ratio/min": 7.706859150857781e-07, "sampling/sampling_logp_difference/max": 3.227776050567627, "sampling/sampling_logp_difference/mean": 0.750717282295227, "step": 1343, "step_time": 14.351072229001147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.680434048175812, "epoch": 0.01344, "grad_norm": 0.010438278317451477, "kl": 0.5435958318412304, "learning_rate": 9.999208240564978e-06, "loss": -0.0016, "step": 1344, "step_time": 8.075080264992721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 281.65625, "completions/mean_terminated_length": 281.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7286073565483093, "epoch": 0.01345, "frac_reward_zero_std": 0.5, "grad_norm": 0.02554146572947502, "kl": 0.66005989164114, "learning_rate": 9.999207029503541e-06, "loss": -0.0102, "num_tokens": 34512776.0, "reward": 0.44985827803611755, "reward_std": 0.2741343080997467, "rewards/rollout_reward_func/mean": 0.44985827803611755, "rewards/rollout_reward_func/std": 1.0164358615875244, "sampling/importance_sampling_ratio/max": 0.5473783612251282, "sampling/importance_sampling_ratio/mean": 0.3383188247680664, "sampling/importance_sampling_ratio/min": 0.00018632209685165435, "sampling/sampling_logp_difference/max": 2.98854660987854, "sampling/sampling_logp_difference/mean": 0.5775971412658691, "step": 1345, "step_time": 9.704789993993472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.7552323937416077, "epoch": 0.01346, "grad_norm": 0.026466259732842445, "kl": 0.650529220700264, "learning_rate": 9.999205817516701e-06, "loss": -0.0102, "step": 1346, "step_time": 5.133603779002442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 679.0625, "completions/mean_terminated_length": 679.0625, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "entropy": 4.055148661136627, "epoch": 0.01347, "frac_reward_zero_std": 0.0, "grad_norm": 0.08827104419469833, "kl": 0.6135726571083069, "learning_rate": 9.999204604604457e-06, "loss": -0.0077, "num_tokens": 34575628.0, "reward": 0.24756066501140594, "reward_std": 0.5939170718193054, "rewards/rollout_reward_func/mean": 0.24756066501140594, "rewards/rollout_reward_func/std": 1.1169660091400146, "sampling/importance_sampling_ratio/max": 0.3133224546909332, "sampling/importance_sampling_ratio/mean": 0.1879035085439682, "sampling/importance_sampling_ratio/min": 7.023328362265602e-05, "sampling/sampling_logp_difference/max": 4.423449516296387, "sampling/sampling_logp_difference/mean": 0.6128610968589783, "step": 1347, "step_time": 12.925793529000657 }, { "clip_ratio/high_max": 0.026041666977107525, "clip_ratio/high_mean": 0.013020833488553762, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 4.016685515642166, "epoch": 0.01348, "grad_norm": 0.05923354998230934, "kl": 0.6203405074775219, "learning_rate": 9.999203390766811e-06, "loss": -0.008, "step": 1348, "step_time": 7.825963808994857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 470.25, "completions/mean_terminated_length": 470.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6940470337867737, "epoch": 0.01349, "frac_reward_zero_std": 0.75, "grad_norm": 0.010753403417766094, "kl": 0.6008350737392902, "learning_rate": 9.999202176003763e-06, "loss": 0.0002, "num_tokens": 34630394.0, "reward": 0.5748882293701172, "reward_std": 0.17780563235282898, "rewards/rollout_reward_func/mean": 0.5748882293701172, "rewards/rollout_reward_func/std": 0.961178719997406, "sampling/importance_sampling_ratio/max": 0.5492008924484253, "sampling/importance_sampling_ratio/mean": 0.26188924908638, "sampling/importance_sampling_ratio/min": 0.0002254299761261791, "sampling/sampling_logp_difference/max": 2.6118335723876953, "sampling/sampling_logp_difference/mean": 0.4925340712070465, "step": 1349, "step_time": 12.95579807699687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.6603944897651672, "epoch": 0.0135, "grad_norm": 0.01101943850517273, "kl": 0.6038941107690334, "learning_rate": 9.999200960315312e-06, "loss": 0.0003, "step": 1350, "step_time": 7.727819802988961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1300.0, "completions/max_terminated_length": 1300.0, "completions/mean_length": 360.96875, "completions/mean_terminated_length": 360.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.727475702762604, "epoch": 0.01351, "frac_reward_zero_std": 0.25, "grad_norm": 0.04870832711458206, "kl": 0.7421403378248215, "learning_rate": 9.999199743701456e-06, "loss": -0.0026, "num_tokens": 34680373.0, "reward": 0.16435304284095764, "reward_std": 0.4745926260948181, "rewards/rollout_reward_func/mean": 0.16435304284095764, "rewards/rollout_reward_func/std": 1.0911701917648315, "sampling/importance_sampling_ratio/max": 0.5523207187652588, "sampling/importance_sampling_ratio/mean": 0.2111058533191681, "sampling/importance_sampling_ratio/min": 1.5636538432131316e-15, "sampling/sampling_logp_difference/max": 4.254447937011719, "sampling/sampling_logp_difference/mean": 0.9035031795501709, "step": 1351, "step_time": 10.10087124799611 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.686383694410324, "epoch": 0.01352, "grad_norm": 0.04236498102545738, "kl": 0.7437523230910301, "learning_rate": 9.999198526162202e-06, "loss": -0.0027, "step": 1352, "step_time": 5.726145424989227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 389.90625, "completions/mean_terminated_length": 386.45159912109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.429178923368454, "epoch": 0.01353, "frac_reward_zero_std": 0.25, "grad_norm": 0.0490947961807251, "kl": 0.5230493098497391, "learning_rate": 9.999197307697545e-06, "loss": -0.0066, "num_tokens": 34732460.0, "reward": 0.6950621604919434, "reward_std": 0.2055300772190094, "rewards/rollout_reward_func/mean": 0.6950621604919434, "rewards/rollout_reward_func/std": 0.9143316745758057, "sampling/importance_sampling_ratio/max": 0.5517001152038574, "sampling/importance_sampling_ratio/mean": 0.3039281964302063, "sampling/importance_sampling_ratio/min": 1.1250172173786054e-14, "sampling/sampling_logp_difference/max": 4.092395782470703, "sampling/sampling_logp_difference/mean": 0.5873209238052368, "step": 1353, "step_time": 9.135147047993087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.4182170927524567, "epoch": 0.01354, "grad_norm": 0.049281004816293716, "kl": 0.522133108228445, "learning_rate": 9.999196088307487e-06, "loss": -0.0066, "step": 1354, "step_time": 4.482340482987638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2778.0, "completions/max_terminated_length": 2778.0, "completions/mean_length": 502.8125, "completions/mean_terminated_length": 502.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.860985904932022, "epoch": 0.01355, "frac_reward_zero_std": 0.5, "grad_norm": 0.0978013351559639, "kl": 0.6023385375738144, "learning_rate": 9.999194867992026e-06, "loss": -0.009, "num_tokens": 34789004.0, "reward": 0.6882279515266418, "reward_std": 0.5479263067245483, "rewards/rollout_reward_func/mean": 0.6882279515266418, "rewards/rollout_reward_func/std": 0.9191192388534546, "sampling/importance_sampling_ratio/max": 0.5549465417861938, "sampling/importance_sampling_ratio/mean": 0.2478799968957901, "sampling/importance_sampling_ratio/min": 0.002094848779961467, "sampling/sampling_logp_difference/max": 2.3481006622314453, "sampling/sampling_logp_difference/mean": 0.5221967697143555, "step": 1355, "step_time": 14.937230741998064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8211289942264557, "epoch": 0.01356, "grad_norm": 0.10162021219730377, "kl": 0.6074471436440945, "learning_rate": 9.999193646751167e-06, "loss": -0.0093, "step": 1356, "step_time": 8.766143846987688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0625, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 420.03125, "completions/mean_terminated_length": 427.7333679199219, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.8575536608695984, "epoch": 0.01357, "frac_reward_zero_std": 0.25, "grad_norm": 0.046467699110507965, "kl": 0.5654552578926086, "learning_rate": 9.999192424584906e-06, "loss": -0.005, "num_tokens": 34843161.0, "reward": 0.8921812772750854, "reward_std": 0.46691930294036865, "rewards/rollout_reward_func/mean": 0.8921812772750854, "rewards/rollout_reward_func/std": 0.716513454914093, "sampling/importance_sampling_ratio/max": 0.5505114197731018, "sampling/importance_sampling_ratio/mean": 0.27237147092819214, "sampling/importance_sampling_ratio/min": 2.0025708793048463e-14, "sampling/sampling_logp_difference/max": 4.420953750610352, "sampling/sampling_logp_difference/mean": 0.7410966157913208, "step": 1357, "step_time": 10.794083791981393 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.810346871614456, "epoch": 0.01358, "grad_norm": 0.041828788816928864, "kl": 0.571657732129097, "learning_rate": 9.999191201493247e-06, "loss": -0.0051, "step": 1358, "step_time": 6.001112355006626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 504.4375, "completions/mean_terminated_length": 504.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.818579852581024, "epoch": 0.01359, "frac_reward_zero_std": 0.25, "grad_norm": 0.051831893622875214, "kl": 0.5563832893967628, "learning_rate": 9.999189977476188e-06, "loss": -0.0015, "num_tokens": 34899026.0, "reward": 0.7810735702514648, "reward_std": 0.2571792006492615, "rewards/rollout_reward_func/mean": 0.7810735702514648, "rewards/rollout_reward_func/std": 0.8285926580429077, "sampling/importance_sampling_ratio/max": 0.5521712303161621, "sampling/importance_sampling_ratio/mean": 0.2778205871582031, "sampling/importance_sampling_ratio/min": 0.001155057456344366, "sampling/sampling_logp_difference/max": 2.388254404067993, "sampling/sampling_logp_difference/mean": 0.5416321158409119, "step": 1359, "step_time": 11.339871855998354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.804338216781616, "epoch": 0.0136, "grad_norm": 0.049399130046367645, "kl": 0.5480324178934097, "learning_rate": 9.999188752533728e-06, "loss": -0.0016, "step": 1360, "step_time": 6.01311653200537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2336.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 703.0625, "completions/mean_terminated_length": 732.8333740234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.751077026128769, "epoch": 0.01361, "frac_reward_zero_std": 0.0, "grad_norm": 0.04569920524954796, "kl": 0.6796743646264076, "learning_rate": 9.99918752666587e-06, "loss": -0.0137, "num_tokens": 34959526.0, "reward": 0.5661485195159912, "reward_std": 0.2355436533689499, "rewards/rollout_reward_func/mean": 0.5661485195159912, "rewards/rollout_reward_func/std": 0.9962620139122009, "sampling/importance_sampling_ratio/max": 0.5533748269081116, "sampling/importance_sampling_ratio/mean": 0.2443760633468628, "sampling/importance_sampling_ratio/min": 1.6873218100510456e-10, "sampling/sampling_logp_difference/max": 3.698824405670166, "sampling/sampling_logp_difference/mean": 0.6170451641082764, "step": 1361, "step_time": 14.521188811006141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.715751826763153, "epoch": 0.01362, "grad_norm": 0.04316730797290802, "kl": 0.683807972818613, "learning_rate": 9.999186299872614e-06, "loss": -0.0138, "step": 1362, "step_time": 8.096556344979035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2665.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 607.71875, "completions/mean_terminated_length": 607.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.673883855342865, "epoch": 0.01363, "frac_reward_zero_std": 0.25, "grad_norm": 0.09737633913755417, "kl": 0.6276547349989414, "learning_rate": 9.99918507215396e-06, "loss": -0.0035, "num_tokens": 35019780.0, "reward": 0.5140457153320312, "reward_std": 0.38412630558013916, "rewards/rollout_reward_func/mean": 0.5140457153320312, "rewards/rollout_reward_func/std": 1.0568751096725464, "sampling/importance_sampling_ratio/max": 0.5442786812782288, "sampling/importance_sampling_ratio/mean": 0.2797035574913025, "sampling/importance_sampling_ratio/min": 2.677225195008015e-12, "sampling/sampling_logp_difference/max": 3.389540195465088, "sampling/sampling_logp_difference/mean": 0.587800920009613, "step": 1363, "step_time": 14.499295635992894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.630347430706024, "epoch": 0.01364, "grad_norm": 0.0636337623000145, "kl": 0.630491953343153, "learning_rate": 9.999183843509903e-06, "loss": -0.0038, "step": 1364, "step_time": 8.500331013987307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1089.03125, "completions/mean_terminated_length": 1089.03125, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "entropy": 3.9886207580566406, "epoch": 0.01365, "frac_reward_zero_std": 0.0, "grad_norm": 0.024201693013310432, "kl": 0.5490569397807121, "learning_rate": 9.999182613940452e-06, "loss": -0.0122, "num_tokens": 35096545.0, "reward": 0.7586131691932678, "reward_std": 0.9620631337165833, "rewards/rollout_reward_func/mean": 0.7586131691932678, "rewards/rollout_reward_func/std": 0.9516419768333435, "sampling/importance_sampling_ratio/max": 0.3072890639305115, "sampling/importance_sampling_ratio/mean": 0.13223446905612946, "sampling/importance_sampling_ratio/min": 0.0004665997694246471, "sampling/sampling_logp_difference/max": 2.2698512077331543, "sampling/sampling_logp_difference/mean": 0.5080416202545166, "step": 1365, "step_time": 16.119114216002345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9797748923301697, "epoch": 0.01366, "grad_norm": 0.02530456706881523, "kl": 0.5490811541676521, "learning_rate": 9.999181383445602e-06, "loss": -0.0122, "step": 1366, "step_time": 8.990731469995808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2778.0, "completions/max_terminated_length": 2778.0, "completions/mean_length": 492.40625, "completions/mean_terminated_length": 492.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.507239729166031, "epoch": 0.01367, "frac_reward_zero_std": 0.75, "grad_norm": 0.004714044276624918, "kl": 0.7321679666638374, "learning_rate": 9.999180152025356e-06, "loss": 0.0003, "num_tokens": 35148749.0, "reward": 0.9225537776947021, "reward_std": 0.28596651554107666, "rewards/rollout_reward_func/mean": 0.9225537776947021, "rewards/rollout_reward_func/std": 0.6060687303543091, "sampling/importance_sampling_ratio/max": 0.5553436279296875, "sampling/importance_sampling_ratio/mean": 0.4047950506210327, "sampling/importance_sampling_ratio/min": 3.9457874663639814e-05, "sampling/sampling_logp_difference/max": 2.2072887420654297, "sampling/sampling_logp_difference/mean": 0.47434109449386597, "step": 1367, "step_time": 15.448888468003133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 3.497566431760788, "epoch": 0.01368, "grad_norm": 0.0044546304270625114, "kl": 0.7329477146267891, "learning_rate": 9.999178919679713e-06, "loss": 0.0003, "step": 1368, "step_time": 8.914819057994464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2029.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 826.4375, "completions/mean_terminated_length": 826.4375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 3.9531817734241486, "epoch": 0.01369, "frac_reward_zero_std": 0.0, "grad_norm": 0.06021375581622124, "kl": 0.5437473468482494, "learning_rate": 9.999177686408673e-06, "loss": 0.0107, "num_tokens": 35216347.0, "reward": 0.598594605922699, "reward_std": 0.5697320103645325, "rewards/rollout_reward_func/mean": 0.598594605922699, "rewards/rollout_reward_func/std": 0.9369995594024658, "sampling/importance_sampling_ratio/max": 0.31456947326660156, "sampling/importance_sampling_ratio/mean": 0.18684855103492737, "sampling/importance_sampling_ratio/min": 0.0003151797573082149, "sampling/sampling_logp_difference/max": 4.110978126525879, "sampling/sampling_logp_difference/mean": 0.5383567214012146, "step": 1369, "step_time": 12.846242602987331 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 3.9475951194763184, "epoch": 0.0137, "grad_norm": 0.0544804185628891, "kl": 0.5485015958547592, "learning_rate": 9.999176452212236e-06, "loss": 0.0106, "step": 1370, "step_time": 7.308622733005905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2219.0, "completions/max_terminated_length": 2219.0, "completions/mean_length": 538.125, "completions/mean_terminated_length": 538.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.275426208972931, "epoch": 0.01371, "frac_reward_zero_std": 0.5, "grad_norm": 0.08084015548229218, "kl": 0.7041791900992393, "learning_rate": 9.999175217090401e-06, "loss": 0.0052, "num_tokens": 35273924.0, "reward": 1.1603307723999023, "reward_std": 0.018937094137072563, "rewards/rollout_reward_func/mean": 1.1603307723999023, "rewards/rollout_reward_func/std": 0.08388464897871017, "sampling/importance_sampling_ratio/max": 0.5569384098052979, "sampling/importance_sampling_ratio/mean": 0.30499178171157837, "sampling/importance_sampling_ratio/min": 8.48987085078079e-08, "sampling/sampling_logp_difference/max": 2.606167793273926, "sampling/sampling_logp_difference/mean": 0.46109539270401, "step": 1371, "step_time": 13.330368875002023 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 3.2845041751861572, "epoch": 0.01372, "grad_norm": 0.05933119356632233, "kl": 0.6930981203913689, "learning_rate": 9.999173981043175e-06, "loss": 0.0049, "step": 1372, "step_time": 7.582654119993094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2671.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 915.59375, "completions/mean_terminated_length": 915.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.393952876329422, "epoch": 0.01373, "frac_reward_zero_std": 0.25, "grad_norm": 0.05949556455016136, "kl": 0.55182671174407, "learning_rate": 9.99917274407055e-06, "loss": -0.0071, "num_tokens": 35343598.0, "reward": 0.24469564855098724, "reward_std": 0.5837504267692566, "rewards/rollout_reward_func/mean": 0.24469564855098724, "rewards/rollout_reward_func/std": 1.0572441816329956, "sampling/importance_sampling_ratio/max": 0.55302494764328, "sampling/importance_sampling_ratio/mean": 0.18781086802482605, "sampling/importance_sampling_ratio/min": 2.1519297659051495e-15, "sampling/sampling_logp_difference/max": 11.729747772216797, "sampling/sampling_logp_difference/mean": 0.7484393119812012, "step": 1373, "step_time": 16.050344688017503 }, { "clip_ratio/high_max": 0.024553571827709675, "clip_ratio/high_mean": 0.012276785913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012276785913854837, "entropy": 4.389184236526489, "epoch": 0.01374, "grad_norm": 0.06649405509233475, "kl": 0.5603745877742767, "learning_rate": 9.999171506172532e-06, "loss": -0.0073, "step": 1374, "step_time": 8.882946503006679 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.03125, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 884.28125, "completions/mean_terminated_length": 912.290283203125, "completions/min_length": 16.0, "completions/min_terminated_length": 546.0, "entropy": 3.7483673095703125, "epoch": 0.01375, "frac_reward_zero_std": 0.25, "grad_norm": 0.01881346106529236, "kl": 0.6516193337738514, "learning_rate": 9.999170267349118e-06, "loss": -0.0008, "num_tokens": 35413553.0, "reward": 0.618714451789856, "reward_std": 0.5624960064888, "rewards/rollout_reward_func/mean": 0.618714451789856, "rewards/rollout_reward_func/std": 1.0155210494995117, "sampling/importance_sampling_ratio/max": 0.3016461431980133, "sampling/importance_sampling_ratio/mean": 0.15767255425453186, "sampling/importance_sampling_ratio/min": 4.071694987513057e-17, "sampling/sampling_logp_difference/max": 3.7949650287628174, "sampling/sampling_logp_difference/mean": 0.6559301614761353, "step": 1375, "step_time": 11.711011744009738 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 3.7524214386940002, "epoch": 0.01376, "grad_norm": 0.018180929124355316, "kl": 0.6527723744511604, "learning_rate": 9.999169027600309e-06, "loss": -0.0008, "step": 1376, "step_time": 6.326743794998038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2068.0, "completions/max_terminated_length": 2068.0, "completions/mean_length": 1048.46875, "completions/mean_terminated_length": 1048.46875, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "entropy": 4.338038682937622, "epoch": 0.01377, "frac_reward_zero_std": 0.0, "grad_norm": 0.06830808520317078, "kl": 0.6503557525575161, "learning_rate": 9.999167786926106e-06, "loss": -0.0007, "num_tokens": 35489082.0, "reward": 0.7544490098953247, "reward_std": 0.2185506522655487, "rewards/rollout_reward_func/mean": 0.7544490098953247, "rewards/rollout_reward_func/std": 0.9399747252464294, "sampling/importance_sampling_ratio/max": 0.29947996139526367, "sampling/importance_sampling_ratio/mean": 0.0975475162267685, "sampling/importance_sampling_ratio/min": 1.7244407077553767e-15, "sampling/sampling_logp_difference/max": 9.817237854003906, "sampling/sampling_logp_difference/mean": 0.7002236247062683, "step": 1377, "step_time": 13.74044009699719 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 4.3084282875061035, "epoch": 0.01378, "grad_norm": 0.06817519664764404, "kl": 0.6559486947953701, "learning_rate": 9.99916654532651e-06, "loss": -0.0008, "step": 1378, "step_time": 7.434748417013907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 890.375, "completions/mean_terminated_length": 918.5806274414062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.6849439442157745, "epoch": 0.01379, "frac_reward_zero_std": 0.0, "grad_norm": 0.010374259203672409, "kl": 0.5957898534834385, "learning_rate": 9.999165302801519e-06, "loss": -0.0127, "num_tokens": 35557806.0, "reward": -0.09923505783081055, "reward_std": 0.5086116790771484, "rewards/rollout_reward_func/mean": -0.09923505783081055, "rewards/rollout_reward_func/std": 1.0475735664367676, "sampling/importance_sampling_ratio/max": 0.5538412928581238, "sampling/importance_sampling_ratio/mean": 0.17558713257312775, "sampling/importance_sampling_ratio/min": 5.791076951311425e-09, "sampling/sampling_logp_difference/max": 3.85978364944458, "sampling/sampling_logp_difference/mean": 0.7690011262893677, "step": 1379, "step_time": 13.896387175009295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.668285995721817, "epoch": 0.0138, "grad_norm": 0.010505616664886475, "kl": 0.5973918810486794, "learning_rate": 9.999164059351137e-06, "loss": -0.0127, "step": 1380, "step_time": 7.501281111006392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 918.4375, "completions/mean_terminated_length": 918.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.277374505996704, "epoch": 0.01381, "frac_reward_zero_std": 0.25, "grad_norm": 0.034736160188913345, "kl": 0.5998858287930489, "learning_rate": 9.99916281497536e-06, "loss": -0.0073, "num_tokens": 35627003.0, "reward": 0.8765496015548706, "reward_std": 0.6982904672622681, "rewards/rollout_reward_func/mean": 0.8765496015548706, "rewards/rollout_reward_func/std": 0.7926091551780701, "sampling/importance_sampling_ratio/max": 0.5546271800994873, "sampling/importance_sampling_ratio/mean": 0.20485010743141174, "sampling/importance_sampling_ratio/min": 3.5353817096073215e-11, "sampling/sampling_logp_difference/max": 12.778401374816895, "sampling/sampling_logp_difference/mean": 0.7531230449676514, "step": 1381, "step_time": 12.801678717005416 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 4.263773024082184, "epoch": 0.01382, "grad_norm": 0.024744244292378426, "kl": 0.6027652770280838, "learning_rate": 9.999161569674191e-06, "loss": -0.0073, "step": 1382, "step_time": 7.286835516009887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1950.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 655.6875, "completions/mean_terminated_length": 655.6875, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "entropy": 3.6148516833782196, "epoch": 0.01383, "frac_reward_zero_std": 0.5, "grad_norm": 0.021280687302350998, "kl": 0.7711542211472988, "learning_rate": 9.999160323447628e-06, "loss": 0.0017, "num_tokens": 35687635.0, "reward": -0.36962100863456726, "reward_std": 0.5279450416564941, "rewards/rollout_reward_func/mean": -0.36962100863456726, "rewards/rollout_reward_func/std": 1.0154173374176025, "sampling/importance_sampling_ratio/max": 0.3001876473426819, "sampling/importance_sampling_ratio/mean": 0.20872540771961212, "sampling/importance_sampling_ratio/min": 2.4298372203190866e-09, "sampling/sampling_logp_difference/max": 3.81428861618042, "sampling/sampling_logp_difference/mean": 0.5322339534759521, "step": 1383, "step_time": 12.943181289003405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.607692152261734, "epoch": 0.01384, "grad_norm": 0.021804913878440857, "kl": 0.7712594270706177, "learning_rate": 9.999159076295675e-06, "loss": 0.0016, "step": 1384, "step_time": 7.240305150990025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1926.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 898.8125, "completions/mean_terminated_length": 898.8125, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 4.764696717262268, "epoch": 0.01385, "frac_reward_zero_std": 0.0, "grad_norm": 0.05339152738451958, "kl": 0.5186789892613888, "learning_rate": 9.99915782821833e-06, "loss": -0.0121, "num_tokens": 35758289.0, "reward": 0.3852425515651703, "reward_std": 0.9805931448936462, "rewards/rollout_reward_func/mean": 0.3852425515651703, "rewards/rollout_reward_func/std": 1.0465835332870483, "sampling/importance_sampling_ratio/max": 0.32729801535606384, "sampling/importance_sampling_ratio/mean": 0.11896758526563644, "sampling/importance_sampling_ratio/min": 5.282128164022155e-16, "sampling/sampling_logp_difference/max": 12.97942066192627, "sampling/sampling_logp_difference/mean": 0.9136576652526855, "step": 1385, "step_time": 13.850923756996053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 4.744153529405594, "epoch": 0.01386, "grad_norm": 0.05130501091480255, "kl": 0.52315653860569, "learning_rate": 9.999156579215592e-06, "loss": -0.0122, "step": 1386, "step_time": 7.360633459989913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 480.65625, "completions/mean_terminated_length": 476.7419128417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.466715633869171, "epoch": 0.01387, "frac_reward_zero_std": 0.25, "grad_norm": 0.05097166448831558, "kl": 0.7097971849143505, "learning_rate": 9.999155329287464e-06, "loss": 0.0032, "num_tokens": 35814095.0, "reward": -0.19199848175048828, "reward_std": 0.5064871311187744, "rewards/rollout_reward_func/mean": -0.19199848175048828, "rewards/rollout_reward_func/std": 1.0997484922409058, "sampling/importance_sampling_ratio/max": 0.5521037578582764, "sampling/importance_sampling_ratio/mean": 0.28029781579971313, "sampling/importance_sampling_ratio/min": 2.6620324433679343e-07, "sampling/sampling_logp_difference/max": 3.8540961742401123, "sampling/sampling_logp_difference/mean": 0.5421860218048096, "step": 1387, "step_time": 9.595239209993451 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 3.452830493450165, "epoch": 0.01388, "grad_norm": 0.041355669498443604, "kl": 0.7124789655208588, "learning_rate": 9.999154078433945e-06, "loss": 0.0031, "step": 1388, "step_time": 5.636624598002527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 627.8125, "completions/mean_terminated_length": 627.8125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 3.2197281420230865, "epoch": 0.01389, "frac_reward_zero_std": 0.0, "grad_norm": 0.055295180529356, "kl": 0.6949229836463928, "learning_rate": 9.999152826655035e-06, "loss": -0.0114, "num_tokens": 35876213.0, "reward": 1.0716019868850708, "reward_std": 0.2806251645088196, "rewards/rollout_reward_func/mean": 1.0716019868850708, "rewards/rollout_reward_func/std": 0.5352727174758911, "sampling/importance_sampling_ratio/max": 0.3161250054836273, "sampling/importance_sampling_ratio/mean": 0.2362414002418518, "sampling/importance_sampling_ratio/min": 0.004391468595713377, "sampling/sampling_logp_difference/max": 4.011539936065674, "sampling/sampling_logp_difference/mean": 0.4451013207435608, "step": 1389, "step_time": 10.608312780997949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.205023854970932, "epoch": 0.0139, "grad_norm": 0.04888478294014931, "kl": 0.6962872371077538, "learning_rate": 9.999151573950735e-06, "loss": -0.0116, "step": 1390, "step_time": 6.415590853008325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1484.0, "completions/max_terminated_length": 1484.0, "completions/mean_length": 380.46875, "completions/mean_terminated_length": 380.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.076260656118393, "epoch": 0.01391, "frac_reward_zero_std": 0.5, "grad_norm": 0.016272440552711487, "kl": 0.7504723146557808, "learning_rate": 9.999150320321046e-06, "loss": -0.0062, "num_tokens": 35926742.0, "reward": 0.5997243523597717, "reward_std": 0.02740788459777832, "rewards/rollout_reward_func/mean": 0.5997243523597717, "rewards/rollout_reward_func/std": 0.9275181293487549, "sampling/importance_sampling_ratio/max": 0.5555180311203003, "sampling/importance_sampling_ratio/mean": 0.36645522713661194, "sampling/importance_sampling_ratio/min": 0.0001668519980739802, "sampling/sampling_logp_difference/max": 4.572616100311279, "sampling/sampling_logp_difference/mean": 0.42968326807022095, "step": 1391, "step_time": 10.750836583996715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.06967431306839, "epoch": 0.01392, "grad_norm": 0.015768123790621758, "kl": 0.7530668079853058, "learning_rate": 9.999149065765967e-06, "loss": -0.0062, "step": 1392, "step_time": 6.276998761000868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 713.46875, "completions/mean_terminated_length": 692.2257690429688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.412090092897415, "epoch": 0.01393, "frac_reward_zero_std": 0.25, "grad_norm": 0.11551742255687714, "kl": 0.6184337362647057, "learning_rate": 9.999147810285496e-06, "loss": 0.0012, "num_tokens": 35991466.0, "reward": 1.124450445175171, "reward_std": 0.24270106852054596, "rewards/rollout_reward_func/mean": 1.124450445175171, "rewards/rollout_reward_func/std": 0.4334859848022461, "sampling/importance_sampling_ratio/max": 0.5579085946083069, "sampling/importance_sampling_ratio/mean": 0.26587069034576416, "sampling/importance_sampling_ratio/min": 1.7727740933212721e-12, "sampling/sampling_logp_difference/max": 8.42218017578125, "sampling/sampling_logp_difference/mean": 0.6175515055656433, "step": 1393, "step_time": 11.717878560019017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.021875000093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000093132257, "entropy": 3.403884381055832, "epoch": 0.01394, "grad_norm": 0.0610276460647583, "kl": 0.6082474775612354, "learning_rate": 9.99914655387964e-06, "loss": 0.001, "step": 1394, "step_time": 6.771648106994689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 522.8125, "completions/mean_terminated_length": 522.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.7917624711990356, "epoch": 0.01395, "frac_reward_zero_std": 0.25, "grad_norm": 0.039607930928468704, "kl": 0.804119810461998, "learning_rate": 9.999145296548393e-06, "loss": -0.0124, "num_tokens": 36049180.0, "reward": 0.213912695646286, "reward_std": 0.43567633628845215, "rewards/rollout_reward_func/mean": 0.213912695646286, "rewards/rollout_reward_func/std": 1.0537642240524292, "sampling/importance_sampling_ratio/max": 0.5484218597412109, "sampling/importance_sampling_ratio/mean": 0.26684656739234924, "sampling/importance_sampling_ratio/min": 2.294405984648584e-13, "sampling/sampling_logp_difference/max": 4.626593589782715, "sampling/sampling_logp_difference/mean": 0.632536768913269, "step": 1395, "step_time": 9.759274044008635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.78847274184227, "epoch": 0.01396, "grad_norm": 0.039424531161785126, "kl": 0.8041781559586525, "learning_rate": 9.999144038291758e-06, "loss": -0.0124, "step": 1396, "step_time": 5.896074102987768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 636.46875, "completions/mean_terminated_length": 611.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.525822728872299, "epoch": 0.01397, "frac_reward_zero_std": 0.25, "grad_norm": 0.03835984691977501, "kl": 0.6334419846534729, "learning_rate": 9.999142779109736e-06, "loss": -0.0092, "num_tokens": 36108505.0, "reward": 1.0983049869537354, "reward_std": 0.2035752534866333, "rewards/rollout_reward_func/mean": 1.0983049869537354, "rewards/rollout_reward_func/std": 0.36710336804389954, "sampling/importance_sampling_ratio/max": 0.5570312738418579, "sampling/importance_sampling_ratio/mean": 0.2783779501914978, "sampling/importance_sampling_ratio/min": 1.2910186337963407e-16, "sampling/sampling_logp_difference/max": 4.40770149230957, "sampling/sampling_logp_difference/mean": 0.7392042875289917, "step": 1397, "step_time": 11.74775629198848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.526144862174988, "epoch": 0.01398, "grad_norm": 0.03614466264843941, "kl": 0.6381373926997185, "learning_rate": 9.999141519002326e-06, "loss": -0.0092, "step": 1398, "step_time": 6.575091800994414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.03125, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 424.78125, "completions/mean_terminated_length": 437.9677429199219, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4563763439655304, "epoch": 0.01399, "frac_reward_zero_std": 0.0, "grad_norm": 0.03330623358488083, "kl": 0.7418863363564014, "learning_rate": 9.999140257969527e-06, "loss": -0.0155, "num_tokens": 36160829.0, "reward": 0.6192235946655273, "reward_std": 0.0304913017898798, "rewards/rollout_reward_func/mean": 0.6192235946655273, "rewards/rollout_reward_func/std": 0.9431443214416504, "sampling/importance_sampling_ratio/max": 0.5560882091522217, "sampling/importance_sampling_ratio/mean": 0.32211536169052124, "sampling/importance_sampling_ratio/min": 5.4047389919323824e-20, "sampling/sampling_logp_difference/max": 12.766613006591797, "sampling/sampling_logp_difference/mean": 0.8474231958389282, "step": 1399, "step_time": 8.761963999997533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 3.4552295207977295, "epoch": 0.014, "grad_norm": 0.03434722125530243, "kl": 0.7421982884407043, "learning_rate": 9.999138996011341e-06, "loss": -0.0155, "step": 1400, "step_time": 4.920293852999748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 643.90625, "completions/mean_terminated_length": 643.90625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 3.4934695065021515, "epoch": 0.01401, "frac_reward_zero_std": 0.0, "grad_norm": 0.09345200657844543, "kl": 0.7288391590118408, "learning_rate": 9.99913773312777e-06, "loss": -0.0083, "num_tokens": 36223902.0, "reward": 0.5409808158874512, "reward_std": 0.585111677646637, "rewards/rollout_reward_func/mean": 0.5409808158874512, "rewards/rollout_reward_func/std": 0.9648924469947815, "sampling/importance_sampling_ratio/max": 0.3202582895755768, "sampling/importance_sampling_ratio/mean": 0.23422026634216309, "sampling/importance_sampling_ratio/min": 5.153510951316775e-09, "sampling/sampling_logp_difference/max": 5.016234397888184, "sampling/sampling_logp_difference/mean": 0.5870400667190552, "step": 1401, "step_time": 10.739761059987359 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 3.5080886781215668, "epoch": 0.01402, "grad_norm": 0.05227776616811752, "kl": 0.7308908104896545, "learning_rate": 9.99913646931881e-06, "loss": -0.0088, "step": 1402, "step_time": 6.401293806004105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 766.0625, "completions/mean_terminated_length": 781.6333618164062, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 3.7485344409942627, "epoch": 0.01403, "frac_reward_zero_std": 0.0, "grad_norm": 0.0919574648141861, "kl": 0.6458827182650566, "learning_rate": 9.999135204584467e-06, "loss": -0.0015, "num_tokens": 36290002.0, "reward": 0.977172315120697, "reward_std": 0.46691977977752686, "rewards/rollout_reward_func/mean": 0.977172315120697, "rewards/rollout_reward_func/std": 0.6580736041069031, "sampling/importance_sampling_ratio/max": 0.31694820523262024, "sampling/importance_sampling_ratio/mean": 0.18685190379619598, "sampling/importance_sampling_ratio/min": 2.9758826085995748e-25, "sampling/sampling_logp_difference/max": 4.1606059074401855, "sampling/sampling_logp_difference/mean": 0.7090598344802856, "step": 1403, "step_time": 11.04721692000021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.7749939262866974, "epoch": 0.01404, "grad_norm": 0.0912642776966095, "kl": 0.64363843947649, "learning_rate": 9.999133938924735e-06, "loss": -0.0021, "step": 1404, "step_time": 6.108745389996329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 976.78125, "completions/mean_terminated_length": 976.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.409100353717804, "epoch": 0.01405, "frac_reward_zero_std": 0.25, "grad_norm": 0.017119428142905235, "kl": 0.5854775384068489, "learning_rate": 9.99913267233962e-06, "loss": -0.0043, "num_tokens": 36360560.0, "reward": 0.5309551358222961, "reward_std": 0.6731530427932739, "rewards/rollout_reward_func/mean": 0.5309551358222961, "rewards/rollout_reward_func/std": 1.0434997081756592, "sampling/importance_sampling_ratio/max": 0.5517112612724304, "sampling/importance_sampling_ratio/mean": 0.17657054960727692, "sampling/importance_sampling_ratio/min": 9.846853332717842e-10, "sampling/sampling_logp_difference/max": 13.387571334838867, "sampling/sampling_logp_difference/mean": 0.8184056282043457, "step": 1405, "step_time": 13.907202299997152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 4.427435994148254, "epoch": 0.01406, "grad_norm": 0.01612972281873226, "kl": 0.5899878516793251, "learning_rate": 9.99913140482912e-06, "loss": -0.0043, "step": 1406, "step_time": 8.067831410000508 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.03125, "completions/max_length": 2061.0, "completions/max_terminated_length": 2061.0, "completions/mean_length": 665.5625, "completions/mean_terminated_length": 668.290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.787778854370117, "epoch": 0.01407, "frac_reward_zero_std": 0.25, "grad_norm": 0.032422132790088654, "kl": 0.6217549182474613, "learning_rate": 9.999130136393232e-06, "loss": -0.0031, "num_tokens": 36421296.0, "reward": 0.580316424369812, "reward_std": 0.5529048442840576, "rewards/rollout_reward_func/mean": 0.580316424369812, "rewards/rollout_reward_func/std": 0.9314979910850525, "sampling/importance_sampling_ratio/max": 0.5540052652359009, "sampling/importance_sampling_ratio/mean": 0.25676429271698, "sampling/importance_sampling_ratio/min": 4.281993799793403e-14, "sampling/sampling_logp_difference/max": 4.304865837097168, "sampling/sampling_logp_difference/mean": 0.7667378187179565, "step": 1407, "step_time": 13.297030648012878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.8166574239730835, "epoch": 0.01408, "grad_norm": 0.025773657485842705, "kl": 0.6174580901861191, "learning_rate": 9.999128867031961e-06, "loss": -0.0031, "step": 1408, "step_time": 7.7267942979960935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2171.0, "completions/max_terminated_length": 2171.0, "completions/mean_length": 413.3125, "completions/mean_terminated_length": 413.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.223388195037842, "epoch": 0.01409, "frac_reward_zero_std": 0.5, "grad_norm": 0.013440006412565708, "kl": 0.6389300934970379, "learning_rate": 9.999127596745305e-06, "loss": -0.0029, "num_tokens": 36472674.0, "reward": 0.958491325378418, "reward_std": 0.2752949297428131, "rewards/rollout_reward_func/mean": 0.958491325378418, "rewards/rollout_reward_func/std": 0.6101993918418884, "sampling/importance_sampling_ratio/max": 0.5572336912155151, "sampling/importance_sampling_ratio/mean": 0.3813951015472412, "sampling/importance_sampling_ratio/min": 1.939217981751203e-15, "sampling/sampling_logp_difference/max": 4.031008720397949, "sampling/sampling_logp_difference/mean": 0.5457100868225098, "step": 1409, "step_time": 12.478571703992202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2317563891410828, "epoch": 0.0141, "grad_norm": 0.012925309129059315, "kl": 0.6380256563425064, "learning_rate": 9.999126325533266e-06, "loss": -0.0029, "step": 1410, "step_time": 7.272247196000535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 825.0, "completions/mean_terminated_length": 825.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 3.7400355339050293, "epoch": 0.01411, "frac_reward_zero_std": 0.0, "grad_norm": 0.37827107310295105, "kl": 0.594345036894083, "learning_rate": 9.999125053395843e-06, "loss": -0.0065, "num_tokens": 36540866.0, "reward": 1.221824049949646, "reward_std": 0.023753222078084946, "rewards/rollout_reward_func/mean": 1.221824049949646, "rewards/rollout_reward_func/std": 0.03894800692796707, "sampling/importance_sampling_ratio/max": 0.3055083751678467, "sampling/importance_sampling_ratio/mean": 0.1640574336051941, "sampling/importance_sampling_ratio/min": 3.7514610503113447e-19, "sampling/sampling_logp_difference/max": 13.726458549499512, "sampling/sampling_logp_difference/mean": 0.7284671068191528, "step": 1411, "step_time": 12.02519677100645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01666666753590107, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.01666666753590107, "entropy": 3.749998092651367, "epoch": 0.01412, "grad_norm": 0.13428516685962677, "kl": 0.7803897671401501, "learning_rate": 9.999123780333036e-06, "loss": -0.0066, "step": 1412, "step_time": 6.43856403400423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1489.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 656.4375, "completions/mean_terminated_length": 656.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.698884278535843, "epoch": 0.01413, "frac_reward_zero_std": 0.25, "grad_norm": 0.012349395081400871, "kl": 0.7194571867585182, "learning_rate": 9.999122506344846e-06, "loss": -0.0099, "num_tokens": 36601606.0, "reward": 1.0604863166809082, "reward_std": 0.39793363213539124, "rewards/rollout_reward_func/mean": 1.0604863166809082, "rewards/rollout_reward_func/std": 0.5373680591583252, "sampling/importance_sampling_ratio/max": 0.5551943778991699, "sampling/importance_sampling_ratio/mean": 0.26055318117141724, "sampling/importance_sampling_ratio/min": 5.860380406375043e-05, "sampling/sampling_logp_difference/max": 3.9031009674072266, "sampling/sampling_logp_difference/mean": 0.5599287152290344, "step": 1413, "step_time": 11.415039138002612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.719662845134735, "epoch": 0.01414, "grad_norm": 0.012481773272156715, "kl": 0.7181898877024651, "learning_rate": 9.999121231431275e-06, "loss": -0.0099, "step": 1414, "step_time": 6.807134481990943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 282.4375, "completions/mean_terminated_length": 282.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.927135795354843, "epoch": 0.01415, "frac_reward_zero_std": 0.5, "grad_norm": 0.004723621532320976, "kl": 0.8069760352373123, "learning_rate": 9.999119955592321e-06, "loss": -0.0093, "num_tokens": 36651000.0, "reward": -0.0949237197637558, "reward_std": 0.3098698556423187, "rewards/rollout_reward_func/mean": -0.0949237197637558, "rewards/rollout_reward_func/std": 1.0398598909378052, "sampling/importance_sampling_ratio/max": 0.5570939779281616, "sampling/importance_sampling_ratio/mean": 0.3380119204521179, "sampling/importance_sampling_ratio/min": 2.048629148265757e-11, "sampling/sampling_logp_difference/max": 3.569638729095459, "sampling/sampling_logp_difference/mean": 0.6902531981468201, "step": 1415, "step_time": 8.038107956999738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9305419623851776, "epoch": 0.01416, "grad_norm": 0.004792851395905018, "kl": 0.8076612949371338, "learning_rate": 9.999118678827984e-06, "loss": -0.0093, "step": 1416, "step_time": 4.3677039929971215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 602.75, "completions/mean_terminated_length": 602.75, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "entropy": 3.3510658740997314, "epoch": 0.01417, "frac_reward_zero_std": 0.25, "grad_norm": 0.07345744967460632, "kl": 0.7502463310956955, "learning_rate": 9.999117401138263e-06, "loss": -0.0067, "num_tokens": 36712418.0, "reward": 0.43610551953315735, "reward_std": 0.46750408411026, "rewards/rollout_reward_func/mean": 0.43610551953315735, "rewards/rollout_reward_func/std": 1.0533137321472168, "sampling/importance_sampling_ratio/max": 0.31089457869529724, "sampling/importance_sampling_ratio/mean": 0.228261798620224, "sampling/importance_sampling_ratio/min": 5.852264506334137e-11, "sampling/sampling_logp_difference/max": 3.7483012676239014, "sampling/sampling_logp_difference/mean": 0.49489104747772217, "step": 1417, "step_time": 8.992655915018986 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 3.322850286960602, "epoch": 0.01418, "grad_norm": 0.06380610167980194, "kl": 0.754772312939167, "learning_rate": 9.999116122523164e-06, "loss": -0.0068, "step": 1418, "step_time": 4.634281503022066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2160.0, "completions/max_terminated_length": 2160.0, "completions/mean_length": 692.875, "completions/mean_terminated_length": 714.7096557617188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.087395578622818, "epoch": 0.01419, "frac_reward_zero_std": 0.0, "grad_norm": 0.2017545849084854, "kl": 0.7048449888825417, "learning_rate": 9.999114842982682e-06, "loss": -0.0071, "num_tokens": 36774371.0, "reward": 0.7236706018447876, "reward_std": 0.451951801776886, "rewards/rollout_reward_func/mean": 0.7236706018447876, "rewards/rollout_reward_func/std": 0.879297137260437, "sampling/importance_sampling_ratio/max": 0.5558745265007019, "sampling/importance_sampling_ratio/mean": 0.21151934564113617, "sampling/importance_sampling_ratio/min": 5.863663998795232e-10, "sampling/sampling_logp_difference/max": 3.628308057785034, "sampling/sampling_logp_difference/mean": 0.6858550310134888, "step": 1419, "step_time": 13.35852344000159 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875000931322575, "entropy": 4.079776465892792, "epoch": 0.0142, "grad_norm": 0.015921575948596, "kl": 0.7044565752148628, "learning_rate": 9.999113562516821e-06, "loss": -0.0075, "step": 1420, "step_time": 7.657078577991342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 1029.6875, "completions/mean_terminated_length": 1029.6875, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "entropy": 3.096116214990616, "epoch": 0.01421, "frac_reward_zero_std": 0.0, "grad_norm": 0.018619412556290627, "kl": 0.6828663758933544, "learning_rate": 9.999112281125578e-06, "loss": -0.0045, "num_tokens": 36849019.0, "reward": 0.6360116004943848, "reward_std": 0.2475680410861969, "rewards/rollout_reward_func/mean": 0.6360116004943848, "rewards/rollout_reward_func/std": 0.9495397806167603, "sampling/importance_sampling_ratio/max": 0.3089964687824249, "sampling/importance_sampling_ratio/mean": 0.1949702799320221, "sampling/importance_sampling_ratio/min": 0.0019477122696116567, "sampling/sampling_logp_difference/max": 2.6097702980041504, "sampling/sampling_logp_difference/mean": 0.3828999996185303, "step": 1421, "step_time": 13.968558906002727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.087693154811859, "epoch": 0.01422, "grad_norm": 0.01760866492986679, "kl": 0.677794549614191, "learning_rate": 9.999110998808955e-06, "loss": -0.0044, "step": 1422, "step_time": 7.673660329004633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 473.375, "completions/mean_terminated_length": 488.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.598227024078369, "epoch": 0.01423, "frac_reward_zero_std": 0.5, "grad_norm": 0.02926289476454258, "kl": 0.6302483193576336, "learning_rate": 9.999109715566952e-06, "loss": -0.0045, "num_tokens": 36901320.0, "reward": 0.753711462020874, "reward_std": 0.27132368087768555, "rewards/rollout_reward_func/mean": 0.753711462020874, "rewards/rollout_reward_func/std": 0.8382169008255005, "sampling/importance_sampling_ratio/max": 0.5589063167572021, "sampling/importance_sampling_ratio/mean": 0.3452377915382385, "sampling/importance_sampling_ratio/min": 1.7354053957752136e-11, "sampling/sampling_logp_difference/max": 3.165388584136963, "sampling/sampling_logp_difference/mean": 0.6839808225631714, "step": 1423, "step_time": 13.86739301700436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5905067026615143, "epoch": 0.01424, "grad_norm": 0.028490759432315826, "kl": 0.6312305554747581, "learning_rate": 9.99910843139957e-06, "loss": -0.0044, "step": 1424, "step_time": 7.566364291989885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 715.6875, "completions/mean_terminated_length": 715.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.6365130841732025, "epoch": 0.01425, "frac_reward_zero_std": 0.5, "grad_norm": 0.0725921019911766, "kl": 0.7491151615977287, "learning_rate": 9.999107146306808e-06, "loss": 0.0004, "num_tokens": 36961478.0, "reward": 1.0953598022460938, "reward_std": 0.19583012163639069, "rewards/rollout_reward_func/mean": 1.0953598022460938, "rewards/rollout_reward_func/std": 0.3902657926082611, "sampling/importance_sampling_ratio/max": 0.5602676868438721, "sampling/importance_sampling_ratio/mean": 0.316847026348114, "sampling/importance_sampling_ratio/min": 1.759049041538674e-06, "sampling/sampling_logp_difference/max": 2.501211166381836, "sampling/sampling_logp_difference/mean": 0.5169849395751953, "step": 1425, "step_time": 14.80893768399983 } ], "logging_steps": 1.0, "max_steps": 200000, "num_input_tokens_seen": 36961478, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }