{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00525, "eval_steps": 500, "global_step": 525, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7784842923283577, "epoch": 1e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 45317.0, "reward": 2.1999998092651367, "reward_std": 1.7413380146026611, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 1.4142135381698608, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.6395175457000732, "sampling/importance_sampling_ratio/mean": 0.8939619064331055, "sampling/importance_sampling_ratio/min": 0.38482141494750977, "sampling/sampling_logp_difference/max": 0.7602746486663818, "sampling/sampling_logp_difference/mean": 0.08076699078083038, "step": 1, "step_time": 11.14861279899992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7784842923283577, "epoch": 2e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.2857142857142855e-07, "loss": 0.0, "step": 2, "step_time": 6.046958927999867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.3125, "completions/mean_terminated_length": 2.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0997111424803734, "epoch": 3e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.571428571428571e-07, "loss": 0.0, "num_tokens": 95616.0, "reward": 1.2625000476837158, "reward_std": 0.4709290862083435, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.3125, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.11922025680542, "sampling/importance_sampling_ratio/mean": 0.8232259750366211, "sampling/importance_sampling_ratio/min": 0.467467337846756, "sampling/sampling_logp_difference/max": 0.4922431409358978, "sampling/sampling_logp_difference/mean": 0.10929661989212036, "step": 3, "step_time": 11.024081129000137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0997111424803734, "epoch": 4e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 6.857142857142857e-07, "loss": 0.0, "step": 4, "step_time": 6.094397290000188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8431090712547302, "epoch": 5e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.142857142857142e-07, "loss": 0.0, "num_tokens": 143386.0, "reward": 1.4187500476837158, "reward_std": 0.507007360458374, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5770914554595947, "sampling/importance_sampling_ratio/mean": 0.8894044160842896, "sampling/importance_sampling_ratio/min": 0.43558987975120544, "sampling/sampling_logp_difference/max": 0.7718255519866943, "sampling/sampling_logp_difference/mean": 0.07799207419157028, "step": 5, "step_time": 10.487163771999917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8431090712547302, "epoch": 6e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.1428571428571428e-06, "loss": 0.0, "step": 6, "step_time": 6.185672804999967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6589353680610657, "epoch": 7e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.3714285714285715e-06, "loss": 0.0, "num_tokens": 188768.0, "reward": 2.043750047683716, "reward_std": 1.201058030128479, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.1102030277252197, "sampling/importance_sampling_ratio/mean": 0.9080963134765625, "sampling/importance_sampling_ratio/min": 0.7015503644943237, "sampling/sampling_logp_difference/max": 0.22858071327209473, "sampling/sampling_logp_difference/mean": 0.05401453375816345, "step": 7, "step_time": 11.051496965999718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6589353680610657, "epoch": 8e-05, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.6e-06, "loss": 0.0, "step": 8, "step_time": 5.989734975000033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8500039726495743, "epoch": 9e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.8285714285714284e-06, "loss": 0.0, "num_tokens": 243357.0, "reward": 1.5125000476837158, "reward_std": 0.504016101360321, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.34220027923584, "sampling/importance_sampling_ratio/mean": 0.8937994241714478, "sampling/importance_sampling_ratio/min": 0.5326315760612488, "sampling/sampling_logp_difference/max": 0.925993800163269, "sampling/sampling_logp_difference/mean": 0.0961286872625351, "step": 9, "step_time": 10.82308242299996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8500039726495743, "epoch": 0.0001, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.057142857142857e-06, "loss": 0.0, "step": 10, "step_time": 6.345786435000036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5993861705064774, "epoch": 0.00011, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.2857142857142856e-06, "loss": 0.0, "num_tokens": 292572.0, "reward": 2.231250047683716, "reward_std": 1.0846248865127563, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.206714153289795, "sampling/importance_sampling_ratio/mean": 0.9141116738319397, "sampling/importance_sampling_ratio/min": 0.6659113764762878, "sampling/sampling_logp_difference/max": 0.2911492586135864, "sampling/sampling_logp_difference/mean": 0.05162158980965614, "step": 11, "step_time": 10.827366451999865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5993861705064774, "epoch": 0.00012, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.5142857142857142e-06, "loss": 0.0, "step": 12, "step_time": 5.958845550000092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7044863402843475, "epoch": 0.00013, "frac_reward_zero_std": 0.75, "grad_norm": 0.000793688406702131, "kl": 0.0, "learning_rate": 2.742857142857143e-06, "loss": 0.0, "num_tokens": 340626.0, "reward": 1.6062500476837158, "reward_std": 0.6530017852783203, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.96875, "rewards/probe_shaping_dominance/std": 0.1767766922712326, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.1767766922712326, "rewards/rollout_reward_func/mean": -0.9375, "rewards/rollout_reward_func/std": 0.3535533845424652, "sampling/importance_sampling_ratio/max": 1.0612809658050537, "sampling/importance_sampling_ratio/mean": 0.8982734680175781, "sampling/importance_sampling_ratio/min": 0.45619186758995056, "sampling/sampling_logp_difference/max": 0.5952367782592773, "sampling/sampling_logp_difference/mean": 0.05711527168750763, "step": 13, "step_time": 10.020639281000172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7065823003649712, "epoch": 0.00014, "grad_norm": 0.00079762825043872, "kl": 0.000474392608339258, "learning_rate": 2.9714285714285716e-06, "loss": -0.0, "step": 14, "step_time": 6.576750468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3.15625, "completions/mean_terminated_length": 3.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6735320538282394, "epoch": 0.00015, "frac_reward_zero_std": 0.75, "grad_norm": 0.006771796382963657, "kl": 0.004827667989957263, "learning_rate": 3.2e-06, "loss": 0.0, "num_tokens": 381093.0, "reward": 2.6062498092651367, "reward_std": 1.4280457496643066, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.21875, "rewards/probe_completion_length/std": 1.2374368906021118, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 1.2053685188293457, "sampling/importance_sampling_ratio/mean": 0.890531063079834, "sampling/importance_sampling_ratio/min": 0.5232845544815063, "sampling/sampling_logp_difference/max": 0.4864621162414551, "sampling/sampling_logp_difference/mean": 0.06389547139406204, "step": 15, "step_time": 10.024728831000175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.671803779900074, "epoch": 0.00016, "grad_norm": 0.005499588791280985, "kl": 0.008253770578448894, "learning_rate": 3.428571428571428e-06, "loss": -0.0, "step": 16, "step_time": 5.566490591999809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8892305418848991, "epoch": 0.00017, "frac_reward_zero_std": 0.75, "grad_norm": 0.003721765475347638, "kl": 0.0053854092129768105, "learning_rate": 3.657142857142857e-06, "loss": -0.0, "num_tokens": 424900.0, "reward": 1.7000000476837158, "reward_std": 0.7620007395744324, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9375, "rewards/probe_shaping_dominance/std": 0.24593468010425568, "rewards/probe_terminal_raw/mean": 0.0625, "rewards/probe_terminal_raw/std": 0.24593468010425568, "rewards/rollout_reward_func/mean": -0.875, "rewards/rollout_reward_func/std": 0.49186936020851135, "sampling/importance_sampling_ratio/max": 0.9796642661094666, "sampling/importance_sampling_ratio/mean": 0.8644938468933105, "sampling/importance_sampling_ratio/min": 0.5730080008506775, "sampling/sampling_logp_difference/max": 0.48313581943511963, "sampling/sampling_logp_difference/mean": 0.06963828206062317, "step": 17, "step_time": 9.886864271999912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8878831416368484, "epoch": 0.00018, "grad_norm": 0.0030290314462035894, "kl": 0.013682284701644676, "learning_rate": 3.885714285714286e-06, "loss": -0.0, "step": 18, "step_time": 6.536783615000104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8332870826125145, "epoch": 0.00019, "frac_reward_zero_std": 1.0, "grad_norm": 4.497891859500669e-05, "kl": 0.0054511257992544415, "learning_rate": 4.114285714285714e-06, "loss": 0.0, "num_tokens": 474441.0, "reward": 1.9500000476837158, "reward_std": 1.2443419694900513, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0432957410812378, "sampling/importance_sampling_ratio/mean": 0.8474187850952148, "sampling/importance_sampling_ratio/min": 0.3696611821651459, "sampling/sampling_logp_difference/max": 1.1028695106506348, "sampling/sampling_logp_difference/mean": 0.0895814448595047, "step": 19, "step_time": 10.478061494000144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8406587913632393, "epoch": 0.0002, "grad_norm": 3.429582648095675e-05, "kl": 0.004676993812608998, "learning_rate": 4.342857142857142e-06, "loss": 0.0, "step": 20, "step_time": 6.040967968000018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8806165009737015, "epoch": 0.00021, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014957028906792402, "kl": 0.0227611528680427, "learning_rate": 4.571428571428571e-06, "loss": 0.0, "num_tokens": 522511.0, "reward": 1.4500000476837158, "reward_std": 0.5080004930496216, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8144991397857666, "sampling/importance_sampling_ratio/mean": 0.9702324867248535, "sampling/importance_sampling_ratio/min": 0.36904460191726685, "sampling/sampling_logp_difference/max": 0.860051155090332, "sampling/sampling_logp_difference/mean": 0.10725787281990051, "step": 21, "step_time": 10.022902672999862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8847335204482079, "epoch": 0.00022, "grad_norm": 4.064434324391186e-05, "kl": 0.00797901525220368, "learning_rate": 4.8e-06, "loss": 0.0, "step": 22, "step_time": 6.566884732000062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9157858863472939, "epoch": 0.00023, "frac_reward_zero_std": 0.75, "grad_norm": 0.0012762682745233178, "kl": 0.006473740606452338, "learning_rate": 5.0285714285714285e-06, "loss": -0.0, "num_tokens": 573663.0, "reward": 1.8250000476837158, "reward_std": 1.0395408868789673, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.84375, "rewards/probe_shaping_dominance/std": 0.3689020276069641, "rewards/probe_terminal_raw/mean": 0.15625, "rewards/probe_terminal_raw/std": 0.3689020276069641, "rewards/rollout_reward_func/mean": -0.6875, "rewards/rollout_reward_func/std": 0.7378040552139282, "sampling/importance_sampling_ratio/max": 1.4042459726333618, "sampling/importance_sampling_ratio/mean": 0.8917810320854187, "sampling/importance_sampling_ratio/min": 0.38385266065597534, "sampling/sampling_logp_difference/max": 0.6875017881393433, "sampling/sampling_logp_difference/mean": 0.09778842329978943, "step": 23, "step_time": 10.025745644999915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9124525971710682, "epoch": 0.00024, "grad_norm": 0.0014905633870512247, "kl": 0.008495110145304352, "learning_rate": 5.257142857142857e-06, "loss": -0.0, "step": 24, "step_time": 6.059164398000007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8142954409122467, "epoch": 0.00025, "frac_reward_zero_std": 1.0, "grad_norm": 2.238359047623817e-05, "kl": 0.008038407871936215, "learning_rate": 5.485714285714286e-06, "loss": 0.0, "num_tokens": 619642.0, "reward": 1.9187500476837158, "reward_std": 1.256836175918579, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.6296800374984741, "sampling/importance_sampling_ratio/mean": 0.9253418445587158, "sampling/importance_sampling_ratio/min": 0.5489835143089294, "sampling/sampling_logp_difference/max": 0.5560274124145508, "sampling/sampling_logp_difference/mean": 0.07420844584703445, "step": 25, "step_time": 9.907065097999862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8018932789564133, "epoch": 0.00026, "grad_norm": 2.5122495571849868e-05, "kl": 0.0032655789345881203, "learning_rate": 5.7142857142857145e-06, "loss": 0.0, "step": 26, "step_time": 6.951123436999865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8414878621697426, "epoch": 0.00027, "frac_reward_zero_std": 0.75, "grad_norm": 0.008148695342242718, "kl": 0.01356741931522265, "learning_rate": 5.942857142857143e-06, "loss": 0.0001, "num_tokens": 666818.0, "reward": 1.8875000476837158, "reward_std": 1.1053389310836792, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.8125, "rewards/probe_shaping_dominance/std": 0.3965577781200409, "rewards/probe_terminal_raw/mean": 0.1875, "rewards/probe_terminal_raw/std": 0.3965577781200409, "rewards/rollout_reward_func/mean": -0.625, "rewards/rollout_reward_func/std": 0.7931155562400818, "sampling/importance_sampling_ratio/max": 1.8690545558929443, "sampling/importance_sampling_ratio/mean": 0.9360212087631226, "sampling/importance_sampling_ratio/min": 0.3559110760688782, "sampling/sampling_logp_difference/max": 0.7921642065048218, "sampling/sampling_logp_difference/mean": 0.0954742580652237, "step": 27, "step_time": 10.1197560600001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.8348528742790222, "epoch": 0.00028, "grad_norm": 0.009077177383005619, "kl": 0.03206715080887079, "learning_rate": 6.171428571428571e-06, "loss": 0.0001, "step": 28, "step_time": 6.045674705999772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8990254625678062, "epoch": 0.00029, "frac_reward_zero_std": 1.0, "grad_norm": 9.727512224344537e-05, "kl": 0.011075327638536692, "learning_rate": 6.4e-06, "loss": 0.0, "num_tokens": 715066.0, "reward": 1.5125000476837158, "reward_std": 0.504016101360321, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8236454725265503, "sampling/importance_sampling_ratio/mean": 0.8915936946868896, "sampling/importance_sampling_ratio/min": 0.4869925379753113, "sampling/sampling_logp_difference/max": 0.7040233612060547, "sampling/sampling_logp_difference/mean": 0.0989106148481369, "step": 29, "step_time": 10.703582490000144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.90165825933218, "epoch": 0.0003, "grad_norm": 2.8979713533772156e-05, "kl": 0.0033032803366950247, "learning_rate": 6.628571428571428e-06, "loss": 0.0, "step": 30, "step_time": 6.642205023999736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.21875, "completions/mean_terminated_length": 2.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9571298733353615, "epoch": 0.00031, "frac_reward_zero_std": 1.0, "grad_norm": 5.247951048659161e-05, "kl": 0.012058097088811337, "learning_rate": 6.857142857142856e-06, "loss": 0.0, "num_tokens": 765477.0, "reward": 1.1687500476837158, "reward_std": 0.420013427734375, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.21875, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1994438171386719, "sampling/importance_sampling_ratio/mean": 0.8842767477035522, "sampling/importance_sampling_ratio/min": 0.625706136226654, "sampling/sampling_logp_difference/max": 0.3384011387825012, "sampling/sampling_logp_difference/mean": 0.08385618031024933, "step": 31, "step_time": 9.973356800999909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9696694537997246, "epoch": 0.00032, "grad_norm": 9.517256694380194e-05, "kl": 0.013399210826719354, "learning_rate": 7.085714285714285e-06, "loss": 0.0, "step": 32, "step_time": 6.042420091000054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8252013362944126, "epoch": 0.00033, "frac_reward_zero_std": 0.25, "grad_norm": 0.007044240366667509, "kl": 0.1163103471044451, "learning_rate": 7.314285714285714e-06, "loss": -0.0, "num_tokens": 806728.0, "reward": 2.262500047683716, "reward_std": 1.1482806205749512, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 0.4399413466453552, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.71875, "rewards/probe_shaping_dominance/std": 0.45680341124534607, "rewards/probe_terminal_raw/mean": 0.28125, "rewards/probe_terminal_raw/std": 0.45680341124534607, "rewards/rollout_reward_func/mean": -0.4375, "rewards/rollout_reward_func/std": 0.9136068224906921, "sampling/importance_sampling_ratio/max": 1.6308449506759644, "sampling/importance_sampling_ratio/mean": 0.8596781492233276, "sampling/importance_sampling_ratio/min": 0.3087958097457886, "sampling/sampling_logp_difference/max": 1.070175290107727, "sampling/sampling_logp_difference/mean": 0.10737159848213196, "step": 33, "step_time": 9.889731536 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.031250000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0416666679084301, "entropy": 0.8173648975789547, "epoch": 0.00034, "grad_norm": 0.004044355824589729, "kl": 0.1296040709130466, "learning_rate": 7.542857142857142e-06, "loss": -0.0, "step": 34, "step_time": 6.904155912999954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9066675156354904, "epoch": 0.00035, "frac_reward_zero_std": 1.0, "grad_norm": 6.820289127063006e-05, "kl": 0.021789020313008223, "learning_rate": 7.771428571428572e-06, "loss": 0.0, "num_tokens": 860935.0, "reward": 1.4812500476837158, "reward_std": 0.507007360458374, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1967573165893555, "sampling/importance_sampling_ratio/mean": 0.8414925336837769, "sampling/importance_sampling_ratio/min": 0.4453066885471344, "sampling/sampling_logp_difference/max": 0.7035496234893799, "sampling/sampling_logp_difference/mean": 0.0921107828617096, "step": 35, "step_time": 10.496407848999866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8933523520827293, "epoch": 0.00036, "grad_norm": 0.00010574555926723406, "kl": 0.02637881641567219, "learning_rate": 8e-06, "loss": 0.0, "step": 36, "step_time": 6.320377744999973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5705685466527939, "epoch": 0.00037, "frac_reward_zero_std": 1.0, "grad_norm": 1.521575904916972e-05, "kl": 0.0023586903529349, "learning_rate": 7.99999999962976e-06, "loss": 0.0, "num_tokens": 904680.0, "reward": 2.793750047683716, "reward_std": 1.221035361289978, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.0734606981277466, "sampling/importance_sampling_ratio/mean": 0.8809072971343994, "sampling/importance_sampling_ratio/min": 0.6448503136634827, "sampling/sampling_logp_difference/max": 0.47716373205184937, "sampling/sampling_logp_difference/mean": 0.058208562433719635, "step": 37, "step_time": 9.859377416000143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5728087574243546, "epoch": 0.00038, "grad_norm": 4.920143328490667e-05, "kl": 0.005856037396995362, "learning_rate": 7.99999999851904e-06, "loss": 0.0, "step": 38, "step_time": 6.990694649000034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8889300264418125, "epoch": 0.00039, "frac_reward_zero_std": 1.0, "grad_norm": 6.715894414810464e-05, "kl": 0.19043646939098835, "learning_rate": 7.999999996667841e-06, "loss": 0.0, "num_tokens": 952527.0, "reward": 2.106250047683716, "reward_std": 1.6482517719268799, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 1.260040283203125, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.661733865737915, "sampling/importance_sampling_ratio/mean": 0.9227524995803833, "sampling/importance_sampling_ratio/min": 0.49729371070861816, "sampling/sampling_logp_difference/max": 0.796552300453186, "sampling/sampling_logp_difference/mean": 0.08508022874593735, "step": 39, "step_time": 10.035715292000077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8587775826454163, "epoch": 0.0004, "grad_norm": 7.648151949979365e-05, "kl": 0.19953790280851535, "learning_rate": 7.999999994076165e-06, "loss": 0.0, "step": 40, "step_time": 6.051480846000004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.840924471616745, "epoch": 0.00041, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012767076259478927, "kl": 0.061314951490203384, "learning_rate": 7.999999990744006e-06, "loss": 0.0, "num_tokens": 1007061.0, "reward": 1.5750000476837158, "reward_std": 0.49186936020851135, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7941770553588867, "sampling/importance_sampling_ratio/mean": 0.9026079177856445, "sampling/importance_sampling_ratio/min": 0.39223504066467285, "sampling/sampling_logp_difference/max": 0.6463124752044678, "sampling/sampling_logp_difference/mean": 0.10292024910449982, "step": 41, "step_time": 10.33222729900001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8315484449267387, "epoch": 0.00042, "grad_norm": 0.00022670082398690283, "kl": 0.085723073239933, "learning_rate": 7.999999986671369e-06, "loss": 0.0, "step": 42, "step_time": 7.211746527999935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6875193417072296, "epoch": 0.00043, "frac_reward_zero_std": 0.5, "grad_norm": 0.010541373863816261, "kl": 0.5206158077344298, "learning_rate": 7.999999981858253e-06, "loss": -0.0002, "num_tokens": 1052251.0, "reward": 2.106250047683716, "reward_std": 1.1670026779174805, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 2.845515251159668, "sampling/importance_sampling_ratio/mean": 0.9331997632980347, "sampling/importance_sampling_ratio/min": 0.27450695633888245, "sampling/sampling_logp_difference/max": 1.1816264390945435, "sampling/sampling_logp_difference/mean": 0.09470215439796448, "step": 43, "step_time": 9.963842145999934 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0416666679084301, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06250000186264515, "entropy": 0.6695949211716652, "epoch": 0.00044, "grad_norm": 0.004431854467839003, "kl": 1.1412691371515393, "learning_rate": 7.999999976304658e-06, "loss": -0.0002, "step": 44, "step_time": 5.966688828000315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8274488374590874, "epoch": 0.00045, "frac_reward_zero_std": 0.75, "grad_norm": 0.006249376107007265, "kl": 0.19522789436450694, "learning_rate": 7.999999970010581e-06, "loss": -0.0, "num_tokens": 1099900.0, "reward": 2.012500047683716, "reward_std": 1.6251550912857056, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 1.263635277748108, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 1.6532174348831177, "sampling/importance_sampling_ratio/mean": 0.85263991355896, "sampling/importance_sampling_ratio/min": 0.3612442910671234, "sampling/sampling_logp_difference/max": 0.6850485801696777, "sampling/sampling_logp_difference/mean": 0.09981696307659149, "step": 45, "step_time": 10.048606317000122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.8223946020007133, "epoch": 0.00046, "grad_norm": 0.005041504744440317, "kl": 0.9564005452157289, "learning_rate": 7.999999962976027e-06, "loss": -0.0, "step": 46, "step_time": 6.050002238000047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.1875, "completions/mean_terminated_length": 2.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9545164927840233, "epoch": 0.00047, "frac_reward_zero_std": 1.0, "grad_norm": 4.465667007025331e-05, "kl": 0.010277454368406325, "learning_rate": 7.999999955200991e-06, "loss": 0.0, "num_tokens": 1149711.0, "reward": 1.1375000476837158, "reward_std": 0.3965577781200409, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.1875, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.72605562210083, "sampling/importance_sampling_ratio/mean": 0.9260122776031494, "sampling/importance_sampling_ratio/min": 0.6488375663757324, "sampling/sampling_logp_difference/max": 1.0003349781036377, "sampling/sampling_logp_difference/mean": 0.10305842757225037, "step": 47, "step_time": 11.073963104999962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9436408951878548, "epoch": 0.00048, "grad_norm": 3.3426829759264365e-05, "kl": 0.006428241502362653, "learning_rate": 7.999999946685478e-06, "loss": 0.0, "step": 48, "step_time": 6.095011092000163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8800734430551529, "epoch": 0.00049, "frac_reward_zero_std": 0.75, "grad_norm": 0.023053472861647606, "kl": 2.7146486702840775, "learning_rate": 7.999999937429484e-06, "loss": -0.0001, "num_tokens": 1196495.0, "reward": 2.137500047683716, "reward_std": 1.6546610593795776, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 1.4013242721557617, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.8125, "rewards/probe_shaping_dominance/std": 0.3965577781200409, "rewards/probe_terminal_raw/mean": 0.1875, "rewards/probe_terminal_raw/std": 0.3965577781200409, "rewards/rollout_reward_func/mean": -0.625, "rewards/rollout_reward_func/std": 0.7931155562400818, "sampling/importance_sampling_ratio/max": 2.174635648727417, "sampling/importance_sampling_ratio/mean": 0.8316177129745483, "sampling/importance_sampling_ratio/min": 0.007070684805512428, "sampling/sampling_logp_difference/max": 3.8774948120117188, "sampling/sampling_logp_difference/mean": 0.16757240891456604, "step": 49, "step_time": 9.95241602700014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.924486055970192, "epoch": 0.0005, "grad_norm": 0.00967924203723669, "kl": 0.35286546498537064, "learning_rate": 7.999999927433012e-06, "loss": -0.0001, "step": 50, "step_time": 6.020178775999966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8942786604166031, "epoch": 0.00051, "frac_reward_zero_std": 1.0, "grad_norm": 0.00024296288029290736, "kl": 0.08609151990822284, "learning_rate": 7.99999991669606e-06, "loss": 0.0, "num_tokens": 1248112.0, "reward": 1.4812500476837158, "reward_std": 0.9498514533042908, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.949851393699646, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.209699273109436, "sampling/importance_sampling_ratio/mean": 0.8616048097610474, "sampling/importance_sampling_ratio/min": 0.36346468329429626, "sampling/sampling_logp_difference/max": 0.5839085578918457, "sampling/sampling_logp_difference/mean": 0.09252315759658813, "step": 51, "step_time": 10.509565642999974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8962793052196503, "epoch": 0.00052, "grad_norm": 0.0008928372408263385, "kl": 0.07233380628167652, "learning_rate": 7.999999905218627e-06, "loss": 0.0, "step": 52, "step_time": 6.072045420999871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6282005533576012, "epoch": 0.00053, "frac_reward_zero_std": 0.75, "grad_norm": 0.010705840773880482, "kl": 1.7787348770652898, "learning_rate": 7.999999893000716e-06, "loss": -0.0001, "num_tokens": 1295553.0, "reward": 2.075000047683716, "reward_std": 1.008032202720642, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 0.4399413466453552, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.8125, "rewards/probe_shaping_dominance/std": 0.3965577781200409, "rewards/probe_terminal_raw/mean": 0.1875, "rewards/probe_terminal_raw/std": 0.3965577781200409, "rewards/rollout_reward_func/mean": -0.625, "rewards/rollout_reward_func/std": 0.7931155562400818, "sampling/importance_sampling_ratio/max": 1.718209981918335, "sampling/importance_sampling_ratio/mean": 0.8762518763542175, "sampling/importance_sampling_ratio/min": 0.0712529644370079, "sampling/sampling_logp_difference/max": 2.461639881134033, "sampling/sampling_logp_difference/mean": 0.13911215960979462, "step": 53, "step_time": 10.180075020999652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6223511770367622, "epoch": 0.00054, "grad_norm": 0.003766631940379739, "kl": 0.724787222861778, "learning_rate": 7.999999880042326e-06, "loss": -0.0001, "step": 54, "step_time": 6.618187610000177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.48880940675735474, "epoch": 0.00055, "frac_reward_zero_std": 0.75, "grad_norm": 0.05148077383637428, "kl": 10.946514587849379, "learning_rate": 7.999999866343456e-06, "loss": 0.0, "num_tokens": 1333718.0, "reward": 3.3562498092651367, "reward_std": 1.6036123037338257, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.03125, "rewards/probe_completion_length/std": 1.331610083580017, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.3125, "rewards/probe_shaping_dominance/std": 0.4709290862083435, "rewards/probe_terminal_raw/mean": 0.6875, "rewards/probe_terminal_raw/std": 0.4709290862083435, "rewards/rollout_reward_func/mean": 0.375, "rewards/rollout_reward_func/std": 0.941858172416687, "sampling/importance_sampling_ratio/max": 1.252049207687378, "sampling/importance_sampling_ratio/mean": 0.8787084221839905, "sampling/importance_sampling_ratio/min": 0.020720435306429863, "sampling/sampling_logp_difference/max": 3.2375454902648926, "sampling/sampling_logp_difference/mean": 0.12630514800548553, "step": 55, "step_time": 9.787316816999919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5050413832068443, "epoch": 0.00056, "grad_norm": 0.0006665181717835367, "kl": 1.611095119267702, "learning_rate": 7.999999851904105e-06, "loss": -0.0001, "step": 56, "step_time": 5.5423178839998855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6285109668970108, "epoch": 0.00057, "frac_reward_zero_std": 0.75, "grad_norm": 0.005352190230041742, "kl": 0.8933676667511463, "learning_rate": 7.999999836724277e-06, "loss": -0.0, "num_tokens": 1382055.0, "reward": 1.9812500476837158, "reward_std": 0.8607714176177979, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.875, "rewards/probe_shaping_dominance/std": 0.33601075410842896, "rewards/probe_terminal_raw/mean": 0.125, "rewards/probe_terminal_raw/std": 0.33601075410842896, "rewards/rollout_reward_func/mean": -0.75, "rewards/rollout_reward_func/std": 0.6720215082168579, "sampling/importance_sampling_ratio/max": 1.6246511936187744, "sampling/importance_sampling_ratio/mean": 0.9118392467498779, "sampling/importance_sampling_ratio/min": 0.1631554365158081, "sampling/sampling_logp_difference/max": 1.7249137163162231, "sampling/sampling_logp_difference/mean": 0.108734130859375, "step": 57, "step_time": 9.972802231999822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6326157301664352, "epoch": 0.00058, "grad_norm": 0.005271768197417259, "kl": 0.7564798947423697, "learning_rate": 7.999999820803968e-06, "loss": -0.0, "step": 58, "step_time": 6.481488288000264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8018255531787872, "epoch": 0.00059, "frac_reward_zero_std": 1.0, "grad_norm": 9.812507050810382e-05, "kl": 0.08816051934809366, "learning_rate": 7.99999980414318e-06, "loss": 0.0, "num_tokens": 1425529.0, "reward": 1.8875000476837158, "reward_std": 1.2684128284454346, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.4375, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 0.9522015452384949, "sampling/importance_sampling_ratio/mean": 0.8646754026412964, "sampling/importance_sampling_ratio/min": 0.16998648643493652, "sampling/sampling_logp_difference/max": 1.2199361324310303, "sampling/sampling_logp_difference/mean": 0.0755390077829361, "step": 59, "step_time": 9.85027468400017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8057889193296432, "epoch": 0.0006, "grad_norm": 0.00017048718291334808, "kl": 0.08889914313340341, "learning_rate": 7.999999786741913e-06, "loss": 0.0, "step": 60, "step_time": 6.49511566000001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.34375, "completions/mean_terminated_length": 2.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9359987080097198, "epoch": 0.00061, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006201486103236675, "kl": 0.1183123029768467, "learning_rate": 7.999999768600167e-06, "loss": 0.0, "num_tokens": 1475587.0, "reward": 1.2937500476837158, "reward_std": 0.4825586974620819, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.34375, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0106334686279297, "sampling/importance_sampling_ratio/mean": 0.8268498182296753, "sampling/importance_sampling_ratio/min": 0.16725747287273407, "sampling/sampling_logp_difference/max": 1.6995172500610352, "sampling/sampling_logp_difference/mean": 0.1090221256017685, "step": 61, "step_time": 10.056730226000127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.920487180352211, "epoch": 0.00062, "grad_norm": 0.000263864261796698, "kl": 0.04295372625347227, "learning_rate": 7.99999974971794e-06, "loss": 0.0, "step": 62, "step_time": 6.643375879000132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6303055956959724, "epoch": 0.00063, "frac_reward_zero_std": 1.0, "grad_norm": 7.813687261659652e-05, "kl": 0.1321493198513508, "learning_rate": 7.999999730095235e-06, "loss": 0.0, "num_tokens": 1523106.0, "reward": 2.137500047683716, "reward_std": 1.1482806205749512, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.489367961883545, "sampling/importance_sampling_ratio/mean": 0.9285314083099365, "sampling/importance_sampling_ratio/min": 0.5652889609336853, "sampling/sampling_logp_difference/max": 0.46361231803894043, "sampling/sampling_logp_difference/mean": 0.05925370007753372, "step": 63, "step_time": 10.579496304000031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6239962689578533, "epoch": 0.00064, "grad_norm": 8.079901454038918e-05, "kl": 0.13366722106775342, "learning_rate": 7.99999970973205e-06, "loss": 0.0, "step": 64, "step_time": 6.11811525600001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7411899194121361, "epoch": 0.00065, "frac_reward_zero_std": 1.0, "grad_norm": 4.089561844011769e-05, "kl": 0.022076929230934184, "learning_rate": 7.999999688628386e-06, "loss": 0.0, "num_tokens": 1573719.0, "reward": 1.3875000476837158, "reward_std": 0.504016101360321, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.4375, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1824043989181519, "sampling/importance_sampling_ratio/mean": 0.8717299103736877, "sampling/importance_sampling_ratio/min": 0.5599439740180969, "sampling/sampling_logp_difference/max": 0.4129828214645386, "sampling/sampling_logp_difference/mean": 0.06982976198196411, "step": 65, "step_time": 10.05901396799959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7376994863152504, "epoch": 0.00066, "grad_norm": 7.469410047633573e-05, "kl": 0.022887282819283428, "learning_rate": 7.999999666784243e-06, "loss": 0.0, "step": 66, "step_time": 7.14928144000055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7326344400644302, "epoch": 0.00067, "frac_reward_zero_std": 0.5, "grad_norm": 0.00794161669909954, "kl": 0.08390913903713226, "learning_rate": 7.999999644199619e-06, "loss": 0.0, "num_tokens": 1621426.0, "reward": 1.8875000476837158, "reward_std": 0.9136068224906921, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.6875, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.875, "rewards/probe_shaping_dominance/std": 0.33601075410842896, "rewards/probe_terminal_raw/mean": 0.125, "rewards/probe_terminal_raw/std": 0.33601075410842896, "rewards/rollout_reward_func/mean": -0.75, "rewards/rollout_reward_func/std": 0.6720215082168579, "sampling/importance_sampling_ratio/max": 1.5268666744232178, "sampling/importance_sampling_ratio/mean": 0.9305216073989868, "sampling/importance_sampling_ratio/min": 0.3797222077846527, "sampling/sampling_logp_difference/max": 0.650033712387085, "sampling/sampling_logp_difference/mean": 0.08735646307468414, "step": 67, "step_time": 10.012451381999881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0416666679084301, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0416666679084301, "entropy": 0.7200787663459778, "epoch": 0.00068, "grad_norm": 0.0036266117822378874, "kl": 0.0821327978046611, "learning_rate": 7.999999620874517e-06, "loss": -0.0, "step": 68, "step_time": 5.994549507999864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7011678218841553, "epoch": 0.00069, "frac_reward_zero_std": 0.75, "grad_norm": 0.004761384334415197, "kl": 0.07737505971454084, "learning_rate": 7.999999596808934e-06, "loss": -0.0, "num_tokens": 1670605.0, "reward": 1.8562500476837158, "reward_std": 0.9283830523490906, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.875, "rewards/probe_shaping_dominance/std": 0.33601075410842896, "rewards/probe_terminal_raw/mean": 0.125, "rewards/probe_terminal_raw/std": 0.33601075410842896, "rewards/rollout_reward_func/mean": -0.75, "rewards/rollout_reward_func/std": 0.6720215082168579, "sampling/importance_sampling_ratio/max": 1.6977269649505615, "sampling/importance_sampling_ratio/mean": 0.9019441604614258, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9512550830841064, "sampling/sampling_logp_difference/mean": 0.09673301875591278, "step": 69, "step_time": 10.269808110999975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7132209166884422, "epoch": 0.0007, "grad_norm": 0.0037254930939525366, "kl": 0.04896980174817145, "learning_rate": 7.999999572002872e-06, "loss": -0.0, "step": 70, "step_time": 7.155508138000187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8009077645838261, "epoch": 0.00071, "frac_reward_zero_std": 1.0, "grad_norm": 9.20472084544599e-05, "kl": 0.08694314991589636, "learning_rate": 7.999999546456332e-06, "loss": 0.0, "num_tokens": 1718484.0, "reward": 1.9812500476837158, "reward_std": 1.2309025526046753, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.4197654724121094, "sampling/importance_sampling_ratio/mean": 0.8803520202636719, "sampling/importance_sampling_ratio/min": 0.5503826141357422, "sampling/sampling_logp_difference/max": 0.5132150053977966, "sampling/sampling_logp_difference/mean": 0.07343609631061554, "step": 71, "step_time": 10.061626925999917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7975645214319229, "epoch": 0.00072, "grad_norm": 8.292407437693328e-05, "kl": 0.08916806877823547, "learning_rate": 7.999999520169313e-06, "loss": 0.0, "step": 72, "step_time": 6.039824374000318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6600424498319626, "epoch": 0.00073, "frac_reward_zero_std": 0.75, "grad_norm": 0.002133256522938609, "kl": 0.0518730686235358, "learning_rate": 7.999999493141815e-06, "loss": -0.0, "num_tokens": 1769135.0, "reward": 1.7625000476837158, "reward_std": 0.5922891497612, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 0.4399413466453552, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.96875, "rewards/probe_shaping_dominance/std": 0.1767766922712326, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.1767766922712326, "rewards/rollout_reward_func/mean": -0.9375, "rewards/rollout_reward_func/std": 0.3535533845424652, "sampling/importance_sampling_ratio/max": 1.2418115139007568, "sampling/importance_sampling_ratio/mean": 0.8889593482017517, "sampling/importance_sampling_ratio/min": 0.7007108926773071, "sampling/sampling_logp_difference/max": 0.29283052682876587, "sampling/sampling_logp_difference/mean": 0.05654953420162201, "step": 73, "step_time": 10.22246037199966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6605305559933186, "epoch": 0.00074, "grad_norm": 0.0020768321119248867, "kl": 0.05201199743896723, "learning_rate": 7.999999465373833e-06, "loss": -0.0, "step": 74, "step_time": 7.141134875000034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.707104254513979, "epoch": 0.00075, "frac_reward_zero_std": 0.75, "grad_norm": 0.0004946052795276046, "kl": 0.30471506575122476, "learning_rate": 7.999999436865376e-06, "loss": 0.0, "num_tokens": 1818515.0, "reward": 1.7000000476837158, "reward_std": 0.6221709847450256, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.6875, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.96875, "rewards/probe_shaping_dominance/std": 0.1767766922712326, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.1767766922712326, "rewards/rollout_reward_func/mean": -0.9375, "rewards/rollout_reward_func/std": 0.3535533845424652, "sampling/importance_sampling_ratio/max": 1.143739938735962, "sampling/importance_sampling_ratio/mean": 0.8724870085716248, "sampling/importance_sampling_ratio/min": 0.6202055811882019, "sampling/sampling_logp_difference/max": 0.41298508644104004, "sampling/sampling_logp_difference/mean": 0.06320011615753174, "step": 75, "step_time": 10.026756965999766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.7264066264033318, "epoch": 0.00076, "grad_norm": 0.0005243932246230543, "kl": 0.32161051593720913, "learning_rate": 7.999999407616439e-06, "loss": -0.0, "step": 76, "step_time": 6.015877203999935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6747354827821255, "epoch": 0.00077, "frac_reward_zero_std": 0.75, "grad_norm": 0.0009918597061187029, "kl": 0.06751795881427824, "learning_rate": 7.999999377627022e-06, "loss": -0.0, "num_tokens": 1864433.0, "reward": 2.293750047683716, "reward_std": 1.2077537775039673, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.6875, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.6875, "rewards/probe_shaping_dominance/std": 0.4709290862083435, "rewards/probe_terminal_raw/mean": 0.3125, "rewards/probe_terminal_raw/std": 0.4709290862083435, "rewards/rollout_reward_func/mean": -0.375, "rewards/rollout_reward_func/std": 0.941858172416687, "sampling/importance_sampling_ratio/max": 1.3293083906173706, "sampling/importance_sampling_ratio/mean": 0.9025354385375977, "sampling/importance_sampling_ratio/min": 0.5597118139266968, "sampling/sampling_logp_difference/max": 0.8906712532043457, "sampling/sampling_logp_difference/mean": 0.0869922935962677, "step": 77, "step_time": 10.032041373000084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.672482568770647, "epoch": 0.00078, "grad_norm": 0.001063486561179161, "kl": 0.09529352828394622, "learning_rate": 7.999999346897126e-06, "loss": -0.0, "step": 78, "step_time": 6.591032714999983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.40625, "completions/mean_terminated_length": 2.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9129854887723923, "epoch": 0.00079, "frac_reward_zero_std": 0.75, "grad_norm": 0.0032934746704995632, "kl": 0.28487967513501644, "learning_rate": 7.99999931542675e-06, "loss": -0.0, "num_tokens": 1911461.0, "reward": 1.6687500476837158, "reward_std": 1.0846248865127563, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.40625, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.84375, "rewards/probe_shaping_dominance/std": 0.3689020276069641, "rewards/probe_terminal_raw/mean": 0.15625, "rewards/probe_terminal_raw/std": 0.3689020276069641, "rewards/rollout_reward_func/mean": -0.6875, "rewards/rollout_reward_func/std": 0.7378040552139282, "sampling/importance_sampling_ratio/max": 1.3015565872192383, "sampling/importance_sampling_ratio/mean": 0.878451943397522, "sampling/importance_sampling_ratio/min": 0.25084182620048523, "sampling/sampling_logp_difference/max": 0.9859634637832642, "sampling/sampling_logp_difference/mean": 0.09893403947353363, "step": 79, "step_time": 10.137737221999942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9050725996494293, "epoch": 0.0008, "grad_norm": 0.0033992675598710775, "kl": 0.29830983486317564, "learning_rate": 7.999999283215897e-06, "loss": -0.0, "step": 80, "step_time": 6.156149169999935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7616499289870262, "epoch": 0.00081, "frac_reward_zero_std": 0.75, "grad_norm": 0.0003873612731695175, "kl": 0.2376087919692509, "learning_rate": 7.999999250264562e-06, "loss": -0.0001, "num_tokens": 1959182.0, "reward": 1.9500000476837158, "reward_std": 1.0776318311691284, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.8125, "rewards/probe_shaping_dominance/std": 0.3965577781200409, "rewards/probe_terminal_raw/mean": 0.1875, "rewards/probe_terminal_raw/std": 0.3965577781200409, "rewards/rollout_reward_func/mean": -0.625, "rewards/rollout_reward_func/std": 0.7931155562400818, "sampling/importance_sampling_ratio/max": 1.5489988327026367, "sampling/importance_sampling_ratio/mean": 0.9243887662887573, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2893295288085938, "sampling/sampling_logp_difference/mean": 0.0874212235212326, "step": 81, "step_time": 10.081119074000526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.7553209736943245, "epoch": 0.00082, "grad_norm": 0.0003225726541131735, "kl": 0.2503548744134605, "learning_rate": 7.999999216572749e-06, "loss": -0.0001, "step": 82, "step_time": 7.087644705999992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5698596127331257, "epoch": 0.00083, "frac_reward_zero_std": 0.5, "grad_norm": 0.005867909174412489, "kl": 0.5150133110582829, "learning_rate": 7.999999182140456e-06, "loss": -0.0, "num_tokens": 2004465.0, "reward": 2.325000047683716, "reward_std": 1.263635277748108, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.6875, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.65625, "rewards/probe_shaping_dominance/std": 0.4825586974620819, "rewards/probe_terminal_raw/mean": 0.34375, "rewards/probe_terminal_raw/std": 0.4825586974620819, "rewards/rollout_reward_func/mean": -0.3125, "rewards/rollout_reward_func/std": 0.9651173949241638, "sampling/importance_sampling_ratio/max": 0.9957227110862732, "sampling/importance_sampling_ratio/mean": 0.883823037147522, "sampling/importance_sampling_ratio/min": 0.38355496525764465, "sampling/sampling_logp_difference/max": 0.884765625, "sampling/sampling_logp_difference/mean": 0.05957601219415665, "step": 83, "step_time": 9.975995759000398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.5635711327195168, "epoch": 0.00084, "grad_norm": 0.0055901966989040375, "kl": 0.6353859202936292, "learning_rate": 7.999999146967684e-06, "loss": -0.0001, "step": 84, "step_time": 6.023201422999591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6848401352763176, "epoch": 0.00085, "frac_reward_zero_std": 0.75, "grad_norm": 0.0006412569200620055, "kl": 0.15268925488635432, "learning_rate": 7.999999111054434e-06, "loss": -0.0, "num_tokens": 2046568.0, "reward": 2.481250047683716, "reward_std": 1.3674646615982056, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5625, "rewards/probe_shaping_dominance/std": 0.504016101360321, "rewards/probe_terminal_raw/mean": 0.4375, "rewards/probe_terminal_raw/std": 0.504016101360321, "rewards/rollout_reward_func/mean": -0.125, "rewards/rollout_reward_func/std": 1.008032202720642, "sampling/importance_sampling_ratio/max": 1.0659711360931396, "sampling/importance_sampling_ratio/mean": 0.8968173861503601, "sampling/importance_sampling_ratio/min": 0.6035577058792114, "sampling/sampling_logp_difference/max": 0.39724525809288025, "sampling/sampling_logp_difference/mean": 0.06118996441364288, "step": 85, "step_time": 10.44943671900046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6844260990619659, "epoch": 0.00086, "grad_norm": 0.001066790777258575, "kl": 0.16099315127939917, "learning_rate": 7.999999074400703e-06, "loss": -0.0, "step": 86, "step_time": 5.960067805999643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6104786917567253, "epoch": 0.00087, "frac_reward_zero_std": 0.5, "grad_norm": 0.006237240973860025, "kl": 0.5770804272033274, "learning_rate": 7.999999037006494e-06, "loss": -0.0, "num_tokens": 2092772.0, "reward": 2.075000047683716, "reward_std": 1.0998533964157104, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.6875, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 1.9442269802093506, "sampling/importance_sampling_ratio/mean": 0.9282306432723999, "sampling/importance_sampling_ratio/min": 0.5401378870010376, "sampling/sampling_logp_difference/max": 0.6658430099487305, "sampling/sampling_logp_difference/mean": 0.06711626052856445, "step": 87, "step_time": 10.2343749260001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.6204207167029381, "epoch": 0.00088, "grad_norm": 0.006610149517655373, "kl": 0.5862025530077517, "learning_rate": 7.999998998871805e-06, "loss": -0.0, "step": 88, "step_time": 6.1646545170001446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8936571925878525, "epoch": 0.00089, "frac_reward_zero_std": 0.75, "grad_norm": 0.002412655623629689, "kl": 0.3142129776533693, "learning_rate": 7.999998959996637e-06, "loss": 0.0, "num_tokens": 2137465.0, "reward": 1.9500000476837158, "reward_std": 1.0776318311691284, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.8125, "rewards/probe_shaping_dominance/std": 0.3965577781200409, "rewards/probe_terminal_raw/mean": 0.1875, "rewards/probe_terminal_raw/std": 0.3965577781200409, "rewards/rollout_reward_func/mean": -0.625, "rewards/rollout_reward_func/std": 0.7931155562400818, "sampling/importance_sampling_ratio/max": 1.3561961650848389, "sampling/importance_sampling_ratio/mean": 0.889546275138855, "sampling/importance_sampling_ratio/min": 0.3530333340167999, "sampling/sampling_logp_difference/max": 0.656822681427002, "sampling/sampling_logp_difference/mean": 0.09530897438526154, "step": 89, "step_time": 10.901634533000333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8688658103346825, "epoch": 0.0009, "grad_norm": 0.002043940592557192, "kl": 0.29210204584524035, "learning_rate": 7.99999892038099e-06, "loss": 0.0, "step": 90, "step_time": 5.993553177000194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6592781916260719, "epoch": 0.00091, "frac_reward_zero_std": 0.5, "grad_norm": 0.004781234543770552, "kl": 1.3648682069979259, "learning_rate": 7.999998880024863e-06, "loss": -0.0, "num_tokens": 2184283.0, "reward": 1.9187500476837158, "reward_std": 0.999495804309845, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.84375, "rewards/probe_shaping_dominance/std": 0.3689020276069641, "rewards/probe_terminal_raw/mean": 0.15625, "rewards/probe_terminal_raw/std": 0.3689020276069641, "rewards/rollout_reward_func/mean": -0.6875, "rewards/rollout_reward_func/std": 0.7378040552139282, "sampling/importance_sampling_ratio/max": 1.1354765892028809, "sampling/importance_sampling_ratio/mean": 0.8667730689048767, "sampling/importance_sampling_ratio/min": 0.1865314096212387, "sampling/sampling_logp_difference/max": 1.308552861213684, "sampling/sampling_logp_difference/mean": 0.0986025482416153, "step": 91, "step_time": 9.94715041100062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.6524480134248734, "epoch": 0.00092, "grad_norm": 0.003115040250122547, "kl": 1.1155421955772908, "learning_rate": 7.999998838928257e-06, "loss": -0.0, "step": 92, "step_time": 6.535503675999735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7372243441641331, "epoch": 0.00093, "frac_reward_zero_std": 0.5, "grad_norm": 0.002905595349147916, "kl": 0.41144446027465165, "learning_rate": 7.999998797091172e-06, "loss": -0.0, "num_tokens": 2232643.0, "reward": 1.9500000476837158, "reward_std": 1.0776318311691284, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.8125, "rewards/probe_shaping_dominance/std": 0.3965577781200409, "rewards/probe_terminal_raw/mean": 0.1875, "rewards/probe_terminal_raw/std": 0.3965577781200409, "rewards/rollout_reward_func/mean": -0.625, "rewards/rollout_reward_func/std": 0.7931155562400818, "sampling/importance_sampling_ratio/max": 1.311123251914978, "sampling/importance_sampling_ratio/mean": 0.8911527991294861, "sampling/importance_sampling_ratio/min": 0.5693881511688232, "sampling/sampling_logp_difference/max": 0.5735888481140137, "sampling/sampling_logp_difference/mean": 0.07083894312381744, "step": 93, "step_time": 10.464223708999953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7358968108892441, "epoch": 0.00094, "grad_norm": 0.004680361598730087, "kl": 0.4150165840983391, "learning_rate": 7.999998754513608e-06, "loss": -0.0, "step": 94, "step_time": 6.073112994999974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7264605090022087, "epoch": 0.00095, "frac_reward_zero_std": 0.75, "grad_norm": 0.005375358276069164, "kl": 0.23870017855006154, "learning_rate": 7.999998711195565e-06, "loss": -0.0, "num_tokens": 2275802.0, "reward": 2.262500047683716, "reward_std": 1.3781123161315918, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.625, "rewards/probe_shaping_dominance/std": 0.49186936020851135, "rewards/probe_terminal_raw/mean": 0.375, "rewards/probe_terminal_raw/std": 0.49186936020851135, "rewards/rollout_reward_func/mean": -0.25, "rewards/rollout_reward_func/std": 0.9837387204170227, "sampling/importance_sampling_ratio/max": 1.2891932725906372, "sampling/importance_sampling_ratio/mean": 0.9060554504394531, "sampling/importance_sampling_ratio/min": 0.40840575098991394, "sampling/sampling_logp_difference/max": 0.8128976225852966, "sampling/sampling_logp_difference/mean": 0.0853162333369255, "step": 95, "step_time": 9.82769083400035 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.735051341354847, "epoch": 0.00096, "grad_norm": 0.0030660636257380247, "kl": 0.2500158410985023, "learning_rate": 7.999998667137043e-06, "loss": -0.0, "step": 96, "step_time": 6.971147591000317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.640817366540432, "epoch": 0.00097, "frac_reward_zero_std": 0.75, "grad_norm": 0.0021315033081918955, "kl": 0.6351375498416019, "learning_rate": 7.999998622338041e-06, "loss": -0.0, "num_tokens": 2315463.0, "reward": 2.6687498092651367, "reward_std": 1.7456743717193604, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 1.3762823343276978, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.59375, "rewards/probe_shaping_dominance/std": 0.49899089336395264, "rewards/probe_terminal_raw/mean": 0.40625, "rewards/probe_terminal_raw/std": 0.49899089336395264, "rewards/rollout_reward_func/mean": -0.1875, "rewards/rollout_reward_func/std": 0.9979817867279053, "sampling/importance_sampling_ratio/max": 1.2316614389419556, "sampling/importance_sampling_ratio/mean": 0.8954340219497681, "sampling/importance_sampling_ratio/min": 0.5406860709190369, "sampling/sampling_logp_difference/max": 0.47210049629211426, "sampling/sampling_logp_difference/mean": 0.05805978551506996, "step": 97, "step_time": 9.842610594000234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.637968648225069, "epoch": 0.00098, "grad_norm": 0.0014506971929222345, "kl": 0.670122139959858, "learning_rate": 7.999998576798562e-06, "loss": -0.0, "step": 98, "step_time": 5.918145998000227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6516329012811184, "epoch": 0.00099, "frac_reward_zero_std": 0.75, "grad_norm": 0.0019182172836735845, "kl": 0.6717661089580815, "learning_rate": 7.999998530518601e-06, "loss": -0.0001, "num_tokens": 2361282.0, "reward": 2.106250047683716, "reward_std": 1.322494626045227, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.6875, "rewards/probe_shaping_dominance/std": 0.4709290862083435, "rewards/probe_terminal_raw/mean": 0.3125, "rewards/probe_terminal_raw/std": 0.4709290862083435, "rewards/rollout_reward_func/mean": -0.375, "rewards/rollout_reward_func/std": 0.941858172416687, "sampling/importance_sampling_ratio/max": 1.2388545274734497, "sampling/importance_sampling_ratio/mean": 0.840278148651123, "sampling/importance_sampling_ratio/min": 0.172871395945549, "sampling/sampling_logp_difference/max": 1.687204122543335, "sampling/sampling_logp_difference/mean": 0.10819916427135468, "step": 99, "step_time": 10.378238312000121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.661732904613018, "epoch": 0.001, "grad_norm": 0.002160546835511923, "kl": 0.6096960648123968, "learning_rate": 7.999998483498162e-06, "loss": -0.0001, "step": 100, "step_time": 6.532955088000108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7278852015733719, "epoch": 0.00101, "frac_reward_zero_std": 1.0, "grad_norm": 0.00023665007029194385, "kl": 0.15003768350288738, "learning_rate": 7.999998435737244e-06, "loss": 0.0, "num_tokens": 2406084.0, "reward": 2.012500047683716, "reward_std": 1.2164862155914307, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0562355518341064, "sampling/importance_sampling_ratio/mean": 0.8963831663131714, "sampling/importance_sampling_ratio/min": 0.3585115075111389, "sampling/sampling_logp_difference/max": 0.5996613502502441, "sampling/sampling_logp_difference/mean": 0.069548100233078, "step": 101, "step_time": 9.839882647999957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7269566580653191, "epoch": 0.00102, "grad_norm": 0.0002451199106872082, "kl": 0.16160269570536911, "learning_rate": 7.999998387235846e-06, "loss": 0.0, "step": 102, "step_time": 5.938691667999592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5686472542583942, "epoch": 0.00103, "frac_reward_zero_std": 1.0, "grad_norm": 8.88900613063015e-05, "kl": 0.05235966534110048, "learning_rate": 7.99999833799397e-06, "loss": 0.0, "num_tokens": 2447237.0, "reward": 2.481250047683716, "reward_std": 1.5023503303527832, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.6354694366455078, "sampling/importance_sampling_ratio/mean": 0.95149165391922, "sampling/importance_sampling_ratio/min": 0.42863741517066956, "sampling/sampling_logp_difference/max": 0.6258417367935181, "sampling/sampling_logp_difference/mean": 0.05896550416946411, "step": 103, "step_time": 9.785490112999923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5708712860941887, "epoch": 0.00104, "grad_norm": 0.00012892860104329884, "kl": 0.06115663269883953, "learning_rate": 7.999998288011616e-06, "loss": 0.0, "step": 104, "step_time": 6.934970639000312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8786773160099983, "epoch": 0.00105, "frac_reward_zero_std": 0.75, "grad_norm": 0.008230560459196568, "kl": 0.3639178788289428, "learning_rate": 7.999998237288781e-06, "loss": -0.0001, "num_tokens": 2496418.0, "reward": 1.5750000476837158, "reward_std": 0.9069623351097107, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.4375, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.90625, "rewards/probe_shaping_dominance/std": 0.2961445748806, "rewards/probe_terminal_raw/mean": 0.09375, "rewards/probe_terminal_raw/std": 0.2961445748806, "rewards/rollout_reward_func/mean": -0.8125, "rewards/rollout_reward_func/std": 0.5922891497612, "sampling/importance_sampling_ratio/max": 1.8547699451446533, "sampling/importance_sampling_ratio/mean": 0.8809961080551147, "sampling/importance_sampling_ratio/min": 0.2464524656534195, "sampling/sampling_logp_difference/max": 1.2753207683563232, "sampling/sampling_logp_difference/mean": 0.11397504061460495, "step": 105, "step_time": 10.14871879899988 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.8598420843482018, "epoch": 0.00106, "grad_norm": 0.0009213939192704856, "kl": 0.3936503166332841, "learning_rate": 7.999998185825468e-06, "loss": -0.0001, "step": 106, "step_time": 6.146154757999739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.71875, "completions/mean_terminated_length": 2.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6813515201210976, "epoch": 0.00107, "frac_reward_zero_std": 0.75, "grad_norm": 0.002292017685249448, "kl": 0.9095189813524485, "learning_rate": 7.999998133621676e-06, "loss": -0.0, "num_tokens": 2544489.0, "reward": 2.043750047683716, "reward_std": 1.0273478031158447, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.71875, "rewards/probe_completion_length/std": 0.45680341124534607, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.8125, "rewards/probe_shaping_dominance/std": 0.3965577781200409, "rewards/probe_terminal_raw/mean": 0.1875, "rewards/probe_terminal_raw/std": 0.3965577781200409, "rewards/rollout_reward_func/mean": -0.625, "rewards/rollout_reward_func/std": 0.7931155562400818, "sampling/importance_sampling_ratio/max": 1.4959931373596191, "sampling/importance_sampling_ratio/mean": 0.8679620623588562, "sampling/importance_sampling_ratio/min": 0.15595608949661255, "sampling/sampling_logp_difference/max": 1.701085090637207, "sampling/sampling_logp_difference/mean": 0.10083653032779694, "step": 107, "step_time": 11.193133051999894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6780059337615967, "epoch": 0.00108, "grad_norm": 0.0036851251497864723, "kl": 0.9937728652730584, "learning_rate": 7.999998080677404e-06, "loss": -0.0, "step": 108, "step_time": 6.075748887000373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.814523309469223, "epoch": 0.00109, "frac_reward_zero_std": 1.0, "grad_norm": 8.425825944868848e-05, "kl": 0.12947031523799524, "learning_rate": 7.999998026992654e-06, "loss": 0.0, "num_tokens": 2594989.0, "reward": 1.4187500476837158, "reward_std": 0.507007360458374, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0389419794082642, "sampling/importance_sampling_ratio/mean": 0.8629075288772583, "sampling/importance_sampling_ratio/min": 0.5298696756362915, "sampling/sampling_logp_difference/max": 0.548180103302002, "sampling/sampling_logp_difference/mean": 0.07628992199897766, "step": 109, "step_time": 10.092573802000288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8251436538994312, "epoch": 0.0011, "grad_norm": 0.00012063535541528836, "kl": 0.13848416117252782, "learning_rate": 7.999997972567424e-06, "loss": 0.0, "step": 110, "step_time": 6.0987305060002654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.4712951257824898, "epoch": 0.00111, "frac_reward_zero_std": 0.5, "grad_norm": 0.005738819949328899, "kl": 3.548261895775795, "learning_rate": 7.999997917401717e-06, "loss": -0.0001, "num_tokens": 2641307.0, "reward": 3.012500047683716, "reward_std": 1.014014720916748, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.46875, "rewards/probe_shaping_dominance/std": 0.507007360458374, "rewards/probe_terminal_raw/mean": 0.53125, "rewards/probe_terminal_raw/std": 0.507007360458374, "rewards/rollout_reward_func/mean": 0.0625, "rewards/rollout_reward_func/std": 1.014014720916748, "sampling/importance_sampling_ratio/max": 1.3101725578308105, "sampling/importance_sampling_ratio/mean": 0.8800893425941467, "sampling/importance_sampling_ratio/min": 0.03823943808674812, "sampling/sampling_logp_difference/max": 3.1835083961486816, "sampling/sampling_logp_difference/mean": 0.1357693374156952, "step": 111, "step_time": 10.871187624000413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.463666919618845, "epoch": 0.00112, "grad_norm": 0.007913350127637386, "kl": 4.181354582309723, "learning_rate": 7.99999786149553e-06, "loss": -0.0001, "step": 112, "step_time": 5.8647342179997395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6317500248551369, "epoch": 0.00113, "frac_reward_zero_std": 1.0, "grad_norm": 7.839502359274775e-05, "kl": 0.5892274444922805, "learning_rate": 7.999997804848863e-06, "loss": 0.0, "num_tokens": 2688559.0, "reward": 2.606250047683716, "reward_std": 1.4052752256393433, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.4366681575775146, "sampling/importance_sampling_ratio/mean": 0.9554047584533691, "sampling/importance_sampling_ratio/min": 0.7517088055610657, "sampling/sampling_logp_difference/max": 0.6721935272216797, "sampling/sampling_logp_difference/mean": 0.06398595869541168, "step": 113, "step_time": 9.927891519000013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.632821474224329, "epoch": 0.00114, "grad_norm": 7.503158849431202e-05, "kl": 0.5771996614057571, "learning_rate": 7.999997747461717e-06, "loss": 0.0, "step": 114, "step_time": 6.027323633999913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8952479958534241, "epoch": 0.00115, "frac_reward_zero_std": 0.75, "grad_norm": 0.0037443714682012796, "kl": 0.4335987113881856, "learning_rate": 7.999997689334094e-06, "loss": -0.0001, "num_tokens": 2735416.0, "reward": 1.9187499284744263, "reward_std": 1.6161357164382935, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.71875, "rewards/probe_completion_length/std": 1.419549584388733, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.875, "rewards/probe_shaping_dominance/std": 0.33601075410842896, "rewards/probe_terminal_raw/mean": 0.125, "rewards/probe_terminal_raw/std": 0.33601075410842896, "rewards/rollout_reward_func/mean": -0.75, "rewards/rollout_reward_func/std": 0.6720215082168579, "sampling/importance_sampling_ratio/max": 1.2050693035125732, "sampling/importance_sampling_ratio/mean": 0.8156166672706604, "sampling/importance_sampling_ratio/min": 0.4705444872379303, "sampling/sampling_logp_difference/max": 0.7938392162322998, "sampling/sampling_logp_difference/mean": 0.10766139626502991, "step": 115, "step_time": 10.529442986999811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.8934030309319496, "epoch": 0.00116, "grad_norm": 0.0021822690032422543, "kl": 0.46727226325310767, "learning_rate": 7.99999763046599e-06, "loss": -0.0001, "step": 116, "step_time": 6.006565065000359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.629519946873188, "epoch": 0.00117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005409108707681298, "kl": 0.5161917522200383, "learning_rate": 7.999997570857409e-06, "loss": 0.0, "num_tokens": 2780743.0, "reward": 2.043750047683716, "reward_std": 1.2790968418121338, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.6652370095252991, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.601353645324707, "sampling/importance_sampling_ratio/mean": 0.8943063020706177, "sampling/importance_sampling_ratio/min": 0.17391638457775116, "sampling/sampling_logp_difference/max": 1.632702112197876, "sampling/sampling_logp_difference/mean": 0.10117563605308533, "step": 117, "step_time": 10.079833390999966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6213446594774723, "epoch": 0.00118, "grad_norm": 0.0004654082003980875, "kl": 0.48489440116100013, "learning_rate": 7.999997510508348e-06, "loss": 0.0, "step": 118, "step_time": 6.549988884000413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7453503087162971, "epoch": 0.00119, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014552050561178476, "kl": 0.22149313415866345, "learning_rate": 7.999997449418809e-06, "loss": 0.0, "num_tokens": 2825585.0, "reward": 2.325000047683716, "reward_std": 1.6800537109375, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 1.38540780544281, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.91416597366333, "sampling/importance_sampling_ratio/mean": 0.9072912335395813, "sampling/importance_sampling_ratio/min": 0.445117324590683, "sampling/sampling_logp_difference/max": 0.7076725959777832, "sampling/sampling_logp_difference/mean": 0.08132369816303253, "step": 119, "step_time": 10.666148662000296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7450893074274063, "epoch": 0.0012, "grad_norm": 0.00017443943943362683, "kl": 0.23418569331988692, "learning_rate": 7.99999738758879e-06, "loss": 0.0, "step": 120, "step_time": 6.12022116199978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.46727633848786354, "epoch": 0.00121, "frac_reward_zero_std": 0.75, "grad_norm": 0.003154071979224682, "kl": 1.0851212590932846, "learning_rate": 7.999997325018293e-06, "loss": -0.0001, "num_tokens": 2869873.0, "reward": 2.637500047683716, "reward_std": 1.2296733856201172, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5625, "rewards/probe_shaping_dominance/std": 0.504016101360321, "rewards/probe_terminal_raw/mean": 0.4375, "rewards/probe_terminal_raw/std": 0.504016101360321, "rewards/rollout_reward_func/mean": -0.125, "rewards/rollout_reward_func/std": 1.008032202720642, "sampling/importance_sampling_ratio/max": 1.017763614654541, "sampling/importance_sampling_ratio/mean": 0.8756238222122192, "sampling/importance_sampling_ratio/min": 0.3428362011909485, "sampling/sampling_logp_difference/max": 1.0764925479888916, "sampling/sampling_logp_difference/mean": 0.06550387293100357, "step": 121, "step_time": 9.955963402999714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45788635686039925, "epoch": 0.00122, "grad_norm": 0.0027381964027881622, "kl": 1.146961946040392, "learning_rate": 7.999997261707317e-06, "loss": -0.0001, "step": 122, "step_time": 6.473173019000114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.40625, "completions/mean_terminated_length": 2.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0169628709554672, "epoch": 0.00123, "frac_reward_zero_std": 1.0, "grad_norm": 0.00026922833058051765, "kl": 0.24311537444009446, "learning_rate": 7.999997197655861e-06, "loss": 0.0, "num_tokens": 2922019.0, "reward": 1.3562500476837158, "reward_std": 0.498990923166275, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.40625, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.919534683227539, "sampling/importance_sampling_ratio/mean": 0.8465455174446106, "sampling/importance_sampling_ratio/min": 0.2150496542453766, "sampling/sampling_logp_difference/max": 1.3104627132415771, "sampling/sampling_logp_difference/mean": 0.16094987094402313, "step": 123, "step_time": 10.31892864700012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0269659757614136, "epoch": 0.00124, "grad_norm": 0.0002643872576300055, "kl": 0.2424255390651524, "learning_rate": 7.999997132863928e-06, "loss": 0.0, "step": 124, "step_time": 6.228070854999714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.567756149917841, "epoch": 0.00125, "frac_reward_zero_std": 1.0, "grad_norm": 1.2914318176626693e-05, "kl": 0.0012493410226852575, "learning_rate": 7.999997067331516e-06, "loss": 0.0, "num_tokens": 2961809.0, "reward": 3.200000047683716, "reward_std": 1.319823980331421, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 0.4399413466453552, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 0.9578561782836914, "sampling/importance_sampling_ratio/mean": 0.8919152021408081, "sampling/importance_sampling_ratio/min": 0.5368828177452087, "sampling/sampling_logp_difference/max": 0.41883519291877747, "sampling/sampling_logp_difference/mean": 0.04870132356882095, "step": 125, "step_time": 9.758941899000092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5752867646515369, "epoch": 0.00126, "grad_norm": 9.876180229184683e-06, "kl": 0.0008715322792340885, "learning_rate": 7.999997001058625e-06, "loss": 0.0, "step": 126, "step_time": 6.379213361999746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5773117579519749, "epoch": 0.00127, "frac_reward_zero_std": 0.5, "grad_norm": 0.0019505616510286927, "kl": 1.3100847490131855, "learning_rate": 7.999996934045254e-06, "loss": -0.0, "num_tokens": 3007241.0, "reward": 2.481250047683716, "reward_std": 1.3674646615982056, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5625, "rewards/probe_shaping_dominance/std": 0.504016101360321, "rewards/probe_terminal_raw/mean": 0.4375, "rewards/probe_terminal_raw/std": 0.504016101360321, "rewards/rollout_reward_func/mean": -0.125, "rewards/rollout_reward_func/std": 1.008032202720642, "sampling/importance_sampling_ratio/max": 1.2050573825836182, "sampling/importance_sampling_ratio/mean": 0.9187490940093994, "sampling/importance_sampling_ratio/min": 0.21540533006191254, "sampling/sampling_logp_difference/max": 1.124786615371704, "sampling/sampling_logp_difference/mean": 0.08163493126630783, "step": 127, "step_time": 10.438749245999361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5682215988636017, "epoch": 0.00128, "grad_norm": 0.0021813081111758947, "kl": 1.3469352908432484, "learning_rate": 7.999996866291406e-06, "loss": -0.0, "step": 128, "step_time": 5.971626152999988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.643170677125454, "epoch": 0.00129, "frac_reward_zero_std": 1.0, "grad_norm": 0.00019623831030912697, "kl": 0.6019693883135915, "learning_rate": 7.999996797797079e-06, "loss": 0.0, "num_tokens": 3058344.0, "reward": 1.5750000476837158, "reward_std": 0.49186936020851135, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0693564414978027, "sampling/importance_sampling_ratio/mean": 0.9278706908226013, "sampling/importance_sampling_ratio/min": 0.32946839928627014, "sampling/sampling_logp_difference/max": 1.113986611366272, "sampling/sampling_logp_difference/mean": 0.08836956322193146, "step": 129, "step_time": 10.049561713999992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6441235989332199, "epoch": 0.0013, "grad_norm": 0.00011418814392527565, "kl": 0.5788391289788706, "learning_rate": 7.999996728562273e-06, "loss": 0.0, "step": 130, "step_time": 6.564315318000126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6178856827318668, "epoch": 0.00131, "frac_reward_zero_std": 1.0, "grad_norm": 6.785019650124013e-05, "kl": 0.7463820744305849, "learning_rate": 7.999996658586989e-06, "loss": 0.0, "num_tokens": 3104192.0, "reward": 2.043750047683716, "reward_std": 1.201058030128479, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.4856263399124146, "sampling/importance_sampling_ratio/mean": 0.9183324575424194, "sampling/importance_sampling_ratio/min": 0.5824882984161377, "sampling/sampling_logp_difference/max": 0.4804893732070923, "sampling/sampling_logp_difference/mean": 0.06271836906671524, "step": 131, "step_time": 10.31453646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6255888566374779, "epoch": 0.00132, "grad_norm": 7.66810990171507e-05, "kl": 0.7543911002576351, "learning_rate": 7.999996587871225e-06, "loss": 0.0, "step": 132, "step_time": 5.922291539999378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.29608798772096634, "epoch": 0.00133, "frac_reward_zero_std": 0.75, "grad_norm": 0.023734381422400475, "kl": 7.532367415726185, "learning_rate": 7.999996516414982e-06, "loss": 0.0, "num_tokens": 3145988.0, "reward": 3.762500047683716, "reward_std": 0.5922891497612, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.09375, "rewards/probe_shaping_dominance/std": 0.2961445748806, "rewards/probe_terminal_raw/mean": 0.90625, "rewards/probe_terminal_raw/std": 0.2961445748806, "rewards/rollout_reward_func/mean": 0.8125, "rewards/rollout_reward_func/std": 0.5922891497612, "sampling/importance_sampling_ratio/max": 1.0541738271713257, "sampling/importance_sampling_ratio/mean": 0.9211882948875427, "sampling/importance_sampling_ratio/min": 0.12494106590747833, "sampling/sampling_logp_difference/max": 1.975663185119629, "sampling/sampling_logp_difference/mean": 0.044852159917354584, "step": 133, "step_time": 10.01835433600013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.296497218310833, "epoch": 0.00134, "grad_norm": 0.00268841371871531, "kl": 2.1114935651421547, "learning_rate": 7.999996444218262e-06, "loss": -0.0, "step": 134, "step_time": 5.628559854000287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.3125, "completions/mean_terminated_length": 2.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1430947929620743, "epoch": 0.00135, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015971790708135813, "kl": 0.15770327649079263, "learning_rate": 7.999996371281063e-06, "loss": 0.0, "num_tokens": 3197833.0, "reward": 1.2625000476837158, "reward_std": 0.4709290862083435, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.3125, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2785383462905884, "sampling/importance_sampling_ratio/mean": 0.8237912654876709, "sampling/importance_sampling_ratio/min": 0.3784085810184479, "sampling/sampling_logp_difference/max": 0.7751260995864868, "sampling/sampling_logp_difference/mean": 0.12488892674446106, "step": 135, "step_time": 10.539870797000276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1564906761050224, "epoch": 0.00136, "grad_norm": 0.00014199405268300325, "kl": 0.14654893381521106, "learning_rate": 7.999996297603385e-06, "loss": 0.0, "step": 136, "step_time": 6.1141825849999805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6290507353842258, "epoch": 0.00137, "frac_reward_zero_std": 1.0, "grad_norm": 8.231111860368401e-05, "kl": 0.574861828237772, "learning_rate": 7.999996223185228e-06, "loss": 0.0, "num_tokens": 3245445.0, "reward": 2.606250047683716, "reward_std": 1.4052752256393433, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.1159268617630005, "sampling/importance_sampling_ratio/mean": 0.8925962448120117, "sampling/importance_sampling_ratio/min": 0.41961097717285156, "sampling/sampling_logp_difference/max": 0.7973945140838623, "sampling/sampling_logp_difference/mean": 0.07273562997579575, "step": 137, "step_time": 10.542406992999759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.635385774075985, "epoch": 0.00138, "grad_norm": 6.592683348571882e-05, "kl": 0.5743689931114204, "learning_rate": 7.999996148026594e-06, "loss": 0.0, "step": 138, "step_time": 6.578659082000513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7651711776852608, "epoch": 0.00139, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001584794808877632, "kl": 0.5461679399013519, "learning_rate": 7.99999607212748e-06, "loss": 0.0, "num_tokens": 3295589.0, "reward": 1.5750000476837158, "reward_std": 0.49186936020851135, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.483282208442688, "sampling/importance_sampling_ratio/mean": 0.9177201986312866, "sampling/importance_sampling_ratio/min": 0.32641756534576416, "sampling/sampling_logp_difference/max": 0.6357085704803467, "sampling/sampling_logp_difference/mean": 0.08419822156429291, "step": 139, "step_time": 10.432583765000118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8008819669485092, "epoch": 0.0014, "grad_norm": 0.00021447367907967418, "kl": 0.5468537542037666, "learning_rate": 7.999995995487888e-06, "loss": 0.0, "step": 140, "step_time": 6.285205011999778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.71875, "completions/mean_terminated_length": 2.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6883749663829803, "epoch": 0.00141, "frac_reward_zero_std": 0.75, "grad_norm": 0.0016245342558249831, "kl": 2.113200404914096, "learning_rate": 7.999995918107818e-06, "loss": -0.0, "num_tokens": 3342865.0, "reward": 2.606250047683716, "reward_std": 1.334634780883789, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.71875, "rewards/probe_completion_length/std": 0.45680341124534607, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.53125, "rewards/probe_shaping_dominance/std": 0.507007360458374, "rewards/probe_terminal_raw/mean": 0.46875, "rewards/probe_terminal_raw/std": 0.507007360458374, "rewards/rollout_reward_func/mean": -0.0625, "rewards/rollout_reward_func/std": 1.014014720916748, "sampling/importance_sampling_ratio/max": 1.1353306770324707, "sampling/importance_sampling_ratio/mean": 0.8638892769813538, "sampling/importance_sampling_ratio/min": 0.27192848920822144, "sampling/sampling_logp_difference/max": 1.2334420680999756, "sampling/sampling_logp_difference/mean": 0.08482334017753601, "step": 141, "step_time": 10.48996880799973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6942117437720299, "epoch": 0.00142, "grad_norm": 0.0024598841555416584, "kl": 2.2772381571121514, "learning_rate": 7.999995839987269e-06, "loss": -0.0, "step": 142, "step_time": 6.0210988920000545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2067401632666588, "epoch": 0.00143, "frac_reward_zero_std": 1.0, "grad_norm": 0.00035976560320705175, "kl": 0.1048080543987453, "learning_rate": 7.999995761126243e-06, "loss": 0.0, "num_tokens": 3391495.0, "reward": 1.0437500476837158, "reward_std": 0.29614460468292236, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.09375, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1019096374511719, "sampling/importance_sampling_ratio/mean": 0.861067533493042, "sampling/importance_sampling_ratio/min": 0.5161373019218445, "sampling/sampling_logp_difference/max": 0.41501492261886597, "sampling/sampling_logp_difference/mean": 0.09741053730249405, "step": 143, "step_time": 10.409878404999972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2020705938339233, "epoch": 0.00144, "grad_norm": 0.00014758903125766665, "kl": 0.09721950892708264, "learning_rate": 7.999995681524736e-06, "loss": 0.0, "step": 144, "step_time": 6.574293671999612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8218728005886078, "epoch": 0.00145, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010783881589304656, "kl": 0.5577464867383242, "learning_rate": 7.999995601182752e-06, "loss": 0.0, "num_tokens": 3438637.0, "reward": 2.262500047683716, "reward_std": 1.0606601238250732, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0564521551132202, "sampling/importance_sampling_ratio/mean": 0.79786217212677, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.278409481048584, "sampling/sampling_logp_difference/mean": 0.10636921972036362, "step": 145, "step_time": 10.219195225000021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8261116743087769, "epoch": 0.00146, "grad_norm": 9.589105320628732e-05, "kl": 0.5436595194041729, "learning_rate": 7.999995520100289e-06, "loss": 0.0, "step": 146, "step_time": 6.632304521999458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.662704985588789, "epoch": 0.00147, "frac_reward_zero_std": 1.0, "grad_norm": 2.6694644475355744e-05, "kl": 0.5263466238975525, "learning_rate": 7.999995438277348e-06, "loss": 0.0, "num_tokens": 3482380.0, "reward": 2.606250047683716, "reward_std": 1.4052752256393433, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.115870475769043, "sampling/importance_sampling_ratio/mean": 0.9159581661224365, "sampling/importance_sampling_ratio/min": 0.6554316282272339, "sampling/sampling_logp_difference/max": 0.27883535623550415, "sampling/sampling_logp_difference/mean": 0.047572724521160126, "step": 147, "step_time": 9.820527543000026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6697583869099617, "epoch": 0.00148, "grad_norm": 2.685715298866853e-05, "kl": 0.5248434137902223, "learning_rate": 7.99999535571393e-06, "loss": 0.0, "step": 148, "step_time": 5.959111133999841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.3125, "completions/mean_terminated_length": 2.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8875343278050423, "epoch": 0.00149, "frac_reward_zero_std": 1.0, "grad_norm": 5.001196768716909e-05, "kl": 0.0462939627468586, "learning_rate": 7.999995272410032e-06, "loss": 0.0, "num_tokens": 3528001.0, "reward": 1.7625000476837158, "reward_std": 1.3060035705566406, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.3125, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.5503108501434326, "sampling/importance_sampling_ratio/mean": 0.8569838404655457, "sampling/importance_sampling_ratio/min": 0.38676348328590393, "sampling/sampling_logp_difference/max": 0.8490326404571533, "sampling/sampling_logp_difference/mean": 0.10086904466152191, "step": 149, "step_time": 10.409084240000084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.885762520134449, "epoch": 0.0015, "grad_norm": 4.54061409982387e-05, "kl": 0.0408337813714752, "learning_rate": 7.999995188365656e-06, "loss": 0.0, "step": 150, "step_time": 6.53108840799996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.40625, "completions/mean_terminated_length": 2.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9287142679095268, "epoch": 0.00151, "frac_reward_zero_std": 1.0, "grad_norm": 2.5531895516905934e-05, "kl": 0.08757305558538064, "learning_rate": 7.999995103580802e-06, "loss": 0.0, "num_tokens": 3577744.0, "reward": 1.3562500476837158, "reward_std": 0.498990923166275, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.40625, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1747065782546997, "sampling/importance_sampling_ratio/mean": 0.8980759382247925, "sampling/importance_sampling_ratio/min": 0.6533486247062683, "sampling/sampling_logp_difference/max": 0.3130163550376892, "sampling/sampling_logp_difference/mean": 0.07302668690681458, "step": 151, "step_time": 10.05969518300003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9390483722090721, "epoch": 0.00152, "grad_norm": 2.0379042325657792e-05, "kl": 0.08184675365919247, "learning_rate": 7.99999501805547e-06, "loss": 0.0, "step": 152, "step_time": 6.642018062000034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9555944502353668, "epoch": 0.00153, "frac_reward_zero_std": 1.0, "grad_norm": 5.609483559965156e-05, "kl": 0.04718830151250586, "learning_rate": 7.99999493178966e-06, "loss": 0.0, "num_tokens": 3622932.0, "reward": 1.3875000476837158, "reward_std": 0.504016101360321, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.4375, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.049941062927246, "sampling/importance_sampling_ratio/mean": 0.8323773145675659, "sampling/importance_sampling_ratio/min": 0.476598858833313, "sampling/sampling_logp_difference/max": 0.45580363273620605, "sampling/sampling_logp_difference/mean": 0.10586726665496826, "step": 153, "step_time": 10.33890006899992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9704329445958138, "epoch": 0.00154, "grad_norm": 4.737279596156441e-05, "kl": 0.04786970978602767, "learning_rate": 7.99999484478337e-06, "loss": 0.0, "step": 154, "step_time": 5.966872542999681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7249070331454277, "epoch": 0.00155, "frac_reward_zero_std": 0.75, "grad_norm": 0.00405209232121706, "kl": 0.4083508696348872, "learning_rate": 7.999994757036603e-06, "loss": -0.0001, "num_tokens": 3671748.0, "reward": 1.9500000476837158, "reward_std": 1.1639753580093384, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 1.0744291543960571, "sampling/importance_sampling_ratio/mean": 0.8961950540542603, "sampling/importance_sampling_ratio/min": 0.4760172367095947, "sampling/sampling_logp_difference/max": 0.604301929473877, "sampling/sampling_logp_difference/mean": 0.06410501897335052, "step": 155, "step_time": 10.121234842999684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.7260234132409096, "epoch": 0.00156, "grad_norm": 0.0011063937563449144, "kl": 0.4271300343389157, "learning_rate": 7.999994668549356e-06, "loss": -0.0001, "step": 156, "step_time": 6.629027390999681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0310870260000229, "epoch": 0.00157, "frac_reward_zero_std": 0.75, "grad_norm": 0.0030120075680315495, "kl": 0.20033146999776363, "learning_rate": 7.999994579321633e-06, "loss": -0.0, "num_tokens": 3723638.0, "reward": 1.4812500476837158, "reward_std": 0.671271026134491, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.96875, "rewards/probe_shaping_dominance/std": 0.1767766922712326, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.1767766922712326, "rewards/rollout_reward_func/mean": -0.9375, "rewards/rollout_reward_func/std": 0.3535533845424652, "sampling/importance_sampling_ratio/max": 1.8720694780349731, "sampling/importance_sampling_ratio/mean": 0.896269679069519, "sampling/importance_sampling_ratio/min": 0.544737696647644, "sampling/sampling_logp_difference/max": 0.7200915813446045, "sampling/sampling_logp_difference/mean": 0.10810667276382446, "step": 157, "step_time": 10.497650911999926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.0275204181671143, "epoch": 0.00158, "grad_norm": 0.002701184945181012, "kl": 0.20881290454417467, "learning_rate": 7.999994489353432e-06, "loss": -0.0, "step": 158, "step_time": 6.045203006000065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7524381056427956, "epoch": 0.00159, "frac_reward_zero_std": 1.0, "grad_norm": 4.496976180234924e-05, "kl": 0.02527153951814398, "learning_rate": 7.999994398644752e-06, "loss": 0.0, "num_tokens": 3766015.0, "reward": 2.2624998092651367, "reward_std": 1.7121481895446777, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 1.4013242721557617, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0045019388198853, "sampling/importance_sampling_ratio/mean": 0.8471688032150269, "sampling/importance_sampling_ratio/min": 0.5022687315940857, "sampling/sampling_logp_difference/max": 0.5595995187759399, "sampling/sampling_logp_difference/mean": 0.06900060176849365, "step": 159, "step_time": 9.84526067000047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.744699664413929, "epoch": 0.0016, "grad_norm": 4.002218338428065e-05, "kl": 0.0263721200872169, "learning_rate": 7.999994307195594e-06, "loss": 0.0, "step": 160, "step_time": 6.48367115699989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.45632531866431236, "epoch": 0.00161, "frac_reward_zero_std": 1.0, "grad_norm": 2.4971124730654992e-05, "kl": 0.5871615024880157, "learning_rate": 7.99999421500596e-06, "loss": 0.0, "num_tokens": 3809262.0, "reward": 3.356250047683716, "reward_std": 1.0734140872955322, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 0.9640352725982666, "sampling/importance_sampling_ratio/mean": 0.9091618061065674, "sampling/importance_sampling_ratio/min": 0.5254960060119629, "sampling/sampling_logp_difference/max": 0.30299806594848633, "sampling/sampling_logp_difference/mean": 0.037094734609127045, "step": 161, "step_time": 10.210792510000601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45290883257985115, "epoch": 0.00162, "grad_norm": 2.5933706638170406e-05, "kl": 0.5865964656695724, "learning_rate": 7.999994122075845e-06, "loss": 0.0, "step": 162, "step_time": 5.857546231000242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.25, "completions/mean_terminated_length": 2.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.152751550078392, "epoch": 0.00163, "frac_reward_zero_std": 1.0, "grad_norm": 6.958712037885562e-05, "kl": 0.07499606953933835, "learning_rate": 7.999994028405253e-06, "loss": 0.0, "num_tokens": 3856949.0, "reward": 1.2000000476837158, "reward_std": 0.4399413466453552, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.25, "rewards/probe_completion_length/std": 0.4399413466453552, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1793038845062256, "sampling/importance_sampling_ratio/mean": 0.8708353042602539, "sampling/importance_sampling_ratio/min": 0.4095854163169861, "sampling/sampling_logp_difference/max": 0.46399402618408203, "sampling/sampling_logp_difference/mean": 0.10172684490680695, "step": 163, "step_time": 10.000568296000438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1484008505940437, "epoch": 0.00164, "grad_norm": 9.33783157961443e-05, "kl": 0.0766537832969334, "learning_rate": 7.999993933994183e-06, "loss": 0.0, "step": 164, "step_time": 6.573066442999789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8715492784976959, "epoch": 0.00165, "frac_reward_zero_std": 0.75, "grad_norm": 0.001674352795816958, "kl": 0.38867083785589784, "learning_rate": 7.999993838842636e-06, "loss": -0.0, "num_tokens": 3906527.0, "reward": 1.7937500476837158, "reward_std": 0.8466013669967651, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.90625, "rewards/probe_shaping_dominance/std": 0.2961445748806, "rewards/probe_terminal_raw/mean": 0.09375, "rewards/probe_terminal_raw/std": 0.2961445748806, "rewards/rollout_reward_func/mean": -0.8125, "rewards/rollout_reward_func/std": 0.5922891497612, "sampling/importance_sampling_ratio/max": 1.153779149055481, "sampling/importance_sampling_ratio/mean": 0.8230181336402893, "sampling/importance_sampling_ratio/min": 0.3351452946662903, "sampling/sampling_logp_difference/max": 0.924765944480896, "sampling/sampling_logp_difference/mean": 0.10521954298019409, "step": 165, "step_time": 10.572739006000347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.8823190033435822, "epoch": 0.00166, "grad_norm": 0.0011349389096722007, "kl": 0.3862094555515796, "learning_rate": 7.99999374295061e-06, "loss": -0.0, "step": 166, "step_time": 6.040446489000033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9422005489468575, "epoch": 0.00167, "frac_reward_zero_std": 0.75, "grad_norm": 0.0008965503075160086, "kl": 0.3014809291344136, "learning_rate": 7.999993646318106e-06, "loss": -0.0, "num_tokens": 3954615.0, "reward": 1.7625000476837158, "reward_std": 0.8590129613876343, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.90625, "rewards/probe_shaping_dominance/std": 0.2961445748806, "rewards/probe_terminal_raw/mean": 0.09375, "rewards/probe_terminal_raw/std": 0.2961445748806, "rewards/rollout_reward_func/mean": -0.8125, "rewards/rollout_reward_func/std": 0.5922891497612, "sampling/importance_sampling_ratio/max": 1.2705289125442505, "sampling/importance_sampling_ratio/mean": 0.8543938994407654, "sampling/importance_sampling_ratio/min": 0.325277715921402, "sampling/sampling_logp_difference/max": 0.7510074377059937, "sampling/sampling_logp_difference/mean": 0.1064271330833435, "step": 167, "step_time": 9.966102381999917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9336102455854416, "epoch": 0.00168, "grad_norm": 0.0007681242423132062, "kl": 0.29743986390531063, "learning_rate": 7.999993548945123e-06, "loss": -0.0, "step": 168, "step_time": 6.981794994000211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8915975391864777, "epoch": 0.00169, "frac_reward_zero_std": 0.75, "grad_norm": 0.0011564207961782813, "kl": 0.25107664382085204, "learning_rate": 7.999993450831664e-06, "loss": -0.0, "num_tokens": 4001506.0, "reward": 1.8250000476837158, "reward_std": 1.31369948387146, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.9482581615447998, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.84375, "rewards/probe_shaping_dominance/std": 0.3689020276069641, "rewards/probe_terminal_raw/mean": 0.15625, "rewards/probe_terminal_raw/std": 0.3689020276069641, "rewards/rollout_reward_func/mean": -0.6875, "rewards/rollout_reward_func/std": 0.7378040552139282, "sampling/importance_sampling_ratio/max": 1.938115119934082, "sampling/importance_sampling_ratio/mean": 0.9251878261566162, "sampling/importance_sampling_ratio/min": 0.5063843727111816, "sampling/sampling_logp_difference/max": 0.7367093563079834, "sampling/sampling_logp_difference/mean": 0.08683702349662781, "step": 169, "step_time": 9.905912688000399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8912236839532852, "epoch": 0.0017, "grad_norm": 0.0012997838202863932, "kl": 0.2457849751226604, "learning_rate": 7.999993351977727e-06, "loss": -0.0, "step": 170, "step_time": 6.000670364999905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.836796797811985, "epoch": 0.00171, "frac_reward_zero_std": 1.0, "grad_norm": 8.854575571604073e-05, "kl": 0.06441712872037897, "learning_rate": 7.999993252383311e-06, "loss": 0.0, "num_tokens": 4049332.0, "reward": 1.8875000476837158, "reward_std": 1.2684128284454346, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.4375, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.1459448337554932, "sampling/importance_sampling_ratio/mean": 0.8837835788726807, "sampling/importance_sampling_ratio/min": 0.5503401160240173, "sampling/sampling_logp_difference/max": 0.2968907952308655, "sampling/sampling_logp_difference/mean": 0.06938318908214569, "step": 171, "step_time": 10.424372221999874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8422264903783798, "epoch": 0.00172, "grad_norm": 6.102959014242515e-05, "kl": 0.05693867025547661, "learning_rate": 7.999993152048418e-06, "loss": 0.0, "step": 172, "step_time": 6.742195677000609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5464613921940327, "epoch": 0.00173, "frac_reward_zero_std": 1.0, "grad_norm": 0.00066503876587376, "kl": 0.9290937134064734, "learning_rate": 7.999993050973047e-06, "loss": 0.0, "num_tokens": 4094304.0, "reward": 2.262500047683716, "reward_std": 1.0606601238250732, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.245759129524231, "sampling/importance_sampling_ratio/mean": 0.8740517497062683, "sampling/importance_sampling_ratio/min": 0.12748770415782928, "sampling/sampling_logp_difference/max": 1.9313684701919556, "sampling/sampling_logp_difference/mean": 0.08258374035358429, "step": 173, "step_time": 10.055813986999965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5519991889595985, "epoch": 0.00174, "grad_norm": 0.0006888221250846982, "kl": 0.9350114449625835, "learning_rate": 7.999992949157197e-06, "loss": 0.0, "step": 174, "step_time": 5.969105376000243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.34375, "completions/mean_terminated_length": 2.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9456525817513466, "epoch": 0.00175, "frac_reward_zero_std": 0.75, "grad_norm": 0.0010870946571230888, "kl": 0.579621426644735, "learning_rate": 7.999992846600872e-06, "loss": -0.0, "num_tokens": 4143282.0, "reward": 1.7312500476837158, "reward_std": 1.2374368906021118, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.34375, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 1.0026894807815552, "sampling/importance_sampling_ratio/mean": 0.8730309009552002, "sampling/importance_sampling_ratio/min": 0.5163137316703796, "sampling/sampling_logp_difference/max": 0.3348468840122223, "sampling/sampling_logp_difference/mean": 0.07812627404928207, "step": 175, "step_time": 10.401969632999453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9536789208650589, "epoch": 0.00176, "grad_norm": 0.001607311423867941, "kl": 0.5831765849143267, "learning_rate": 7.999992743304069e-06, "loss": -0.0, "step": 176, "step_time": 6.50468187199931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.40625, "completions/mean_terminated_length": 2.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8540512844920158, "epoch": 0.00177, "frac_reward_zero_std": 1.0, "grad_norm": 5.113742736284621e-05, "kl": 0.5639435993507504, "learning_rate": 7.999992639266786e-06, "loss": 0.0, "num_tokens": 4188525.0, "reward": 1.8562500476837158, "reward_std": 1.2790968418121338, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.40625, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.5230607986450195, "sampling/importance_sampling_ratio/mean": 0.9213600754737854, "sampling/importance_sampling_ratio/min": 0.5386036038398743, "sampling/sampling_logp_difference/max": 0.5391008853912354, "sampling/sampling_logp_difference/mean": 0.07177218794822693, "step": 177, "step_time": 9.79140261700013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8499638438224792, "epoch": 0.00178, "grad_norm": 5.0212442147312686e-05, "kl": 0.5627318718470633, "learning_rate": 7.999992534489026e-06, "loss": 0.0, "step": 178, "step_time": 5.927821066999968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.15625, "completions/mean_terminated_length": 2.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.19364695250988, "epoch": 0.00179, "frac_reward_zero_std": 1.0, "grad_norm": 0.000155127709149383, "kl": 0.05882764467969537, "learning_rate": 7.99999242897079e-06, "loss": 0.0, "num_tokens": 4239646.0, "reward": 1.1062500476837158, "reward_std": 0.3689020574092865, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.15625, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3667725324630737, "sampling/importance_sampling_ratio/mean": 0.8795427680015564, "sampling/importance_sampling_ratio/min": 0.3527143597602844, "sampling/sampling_logp_difference/max": 0.6183974742889404, "sampling/sampling_logp_difference/mean": 0.1178746223449707, "step": 179, "step_time": 10.46284436499991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1964821219444275, "epoch": 0.0018, "grad_norm": 6.373682845151052e-05, "kl": 0.04784144414588809, "learning_rate": 7.999992322712075e-06, "loss": 0.0, "step": 180, "step_time": 6.581123827999818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2230155989527702, "epoch": 0.00181, "frac_reward_zero_std": 1.0, "grad_norm": 6.241205119295046e-05, "kl": 0.2899513263255358, "learning_rate": 7.999992215712882e-06, "loss": 0.0, "num_tokens": 4287392.0, "reward": 1.3875000476837158, "reward_std": 0.504016101360321, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.4375, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.687260389328003, "sampling/importance_sampling_ratio/mean": 0.8565247058868408, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2024495601654053, "sampling/sampling_logp_difference/mean": 0.1502533257007599, "step": 181, "step_time": 9.957160163000026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2253851145505905, "epoch": 0.00182, "grad_norm": 7.033378642518073e-05, "kl": 0.2906297470035497, "learning_rate": 7.999992107973214e-06, "loss": 0.0, "step": 182, "step_time": 6.031087762999505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9202814698219299, "epoch": 0.00183, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013524244423024356, "kl": 0.29891258222050965, "learning_rate": 7.999991999493065e-06, "loss": 0.0, "num_tokens": 4334719.0, "reward": 1.4500000476837158, "reward_std": 0.5080004930496216, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0915449857711792, "sampling/importance_sampling_ratio/mean": 0.8393715620040894, "sampling/importance_sampling_ratio/min": 0.21315446496009827, "sampling/sampling_logp_difference/max": 1.4369076490402222, "sampling/sampling_logp_difference/mean": 0.10889923572540283, "step": 183, "step_time": 10.379452743000002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9316936507821083, "epoch": 0.00184, "grad_norm": 0.0001468406117055565, "kl": 0.29929662309587, "learning_rate": 7.999991890272441e-06, "loss": 0.0, "step": 184, "step_time": 6.48263137099957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5239233821630478, "epoch": 0.00185, "frac_reward_zero_std": 0.75, "grad_norm": 0.0020477992948144674, "kl": 0.5273855591076426, "learning_rate": 7.999991780311339e-06, "loss": -0.0001, "num_tokens": 4379553.0, "reward": 3.231250047683716, "reward_std": 1.1139692068099976, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.3125, "rewards/probe_shaping_dominance/std": 0.4709290862083435, "rewards/probe_terminal_raw/mean": 0.6875, "rewards/probe_terminal_raw/std": 0.4709290862083435, "rewards/rollout_reward_func/mean": 0.375, "rewards/rollout_reward_func/std": 0.941858172416687, "sampling/importance_sampling_ratio/max": 1.0954153537750244, "sampling/importance_sampling_ratio/mean": 0.8618326783180237, "sampling/importance_sampling_ratio/min": 0.4107683002948761, "sampling/sampling_logp_difference/max": 0.7406651973724365, "sampling/sampling_logp_difference/mean": 0.06812501698732376, "step": 185, "step_time": 9.822880733999682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.5153519753366709, "epoch": 0.00186, "grad_norm": 0.001283274032175541, "kl": 0.5346019868738949, "learning_rate": 7.999991669609758e-06, "loss": -0.0001, "step": 186, "step_time": 5.9687305269994795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.71875, "completions/mean_terminated_length": 2.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6474461331963539, "epoch": 0.00187, "frac_reward_zero_std": 1.0, "grad_norm": 4.33456661994569e-05, "kl": 0.41244959738105536, "learning_rate": 7.999991558167702e-06, "loss": 0.0, "num_tokens": 4426775.0, "reward": 2.231250047683716, "reward_std": 1.0846248865127563, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.71875, "rewards/probe_completion_length/std": 0.45680341124534607, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.5267152786254883, "sampling/importance_sampling_ratio/mean": 0.9380771517753601, "sampling/importance_sampling_ratio/min": 0.6180238127708435, "sampling/sampling_logp_difference/max": 0.5256781578063965, "sampling/sampling_logp_difference/mean": 0.06567574292421341, "step": 187, "step_time": 10.448041156000272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6464254632592201, "epoch": 0.00188, "grad_norm": 4.453281871974468e-05, "kl": 0.41222386225126684, "learning_rate": 7.999991445985168e-06, "loss": 0.0, "step": 188, "step_time": 6.039322745000391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.40625, "completions/mean_terminated_length": 2.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0263744220137596, "epoch": 0.00189, "frac_reward_zero_std": 1.0, "grad_norm": 4.976044874638319e-05, "kl": 0.5344130313023925, "learning_rate": 7.999991333062156e-06, "loss": 0.0, "num_tokens": 4475338.0, "reward": 1.3875000476837158, "reward_std": 0.504016101360321, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.40625, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1248929500579834, "sampling/importance_sampling_ratio/mean": 0.8441773653030396, "sampling/importance_sampling_ratio/min": 0.3922726809978485, "sampling/sampling_logp_difference/max": 0.6367834806442261, "sampling/sampling_logp_difference/mean": 0.10147504508495331, "step": 189, "step_time": 10.450124996000795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0167877674102783, "epoch": 0.0019, "grad_norm": 5.3309948270907626e-05, "kl": 0.5326844826340675, "learning_rate": 7.999991219398665e-06, "loss": 0.0, "step": 190, "step_time": 6.544803051000599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5253524780273438, "epoch": 0.00191, "frac_reward_zero_std": 0.75, "grad_norm": 0.0043292418122291565, "kl": 2.8481499888002872, "learning_rate": 7.999991104994699e-06, "loss": -0.0001, "num_tokens": 4520792.0, "reward": 3.231250047683716, "reward_std": 1.1976959705352783, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.28125, "rewards/probe_shaping_dominance/std": 0.45680341124534607, "rewards/probe_terminal_raw/mean": 0.71875, "rewards/probe_terminal_raw/std": 0.45680341124534607, "rewards/rollout_reward_func/mean": 0.4375, "rewards/rollout_reward_func/std": 0.9136068224906921, "sampling/importance_sampling_ratio/max": 1.7498235702514648, "sampling/importance_sampling_ratio/mean": 0.9441710114479065, "sampling/importance_sampling_ratio/min": 0.24626334011554718, "sampling/sampling_logp_difference/max": 1.2287273406982422, "sampling/sampling_logp_difference/mean": 0.05860981345176697, "step": 191, "step_time": 9.974904008000067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.5309231318533421, "epoch": 0.00192, "grad_norm": 0.002293355530127883, "kl": 2.6455338932573795, "learning_rate": 7.999990989850255e-06, "loss": -0.0001, "step": 192, "step_time": 6.537967493999986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5067413002252579, "epoch": 0.00193, "frac_reward_zero_std": 0.5, "grad_norm": 0.002911539049819112, "kl": 2.6948545714840293, "learning_rate": 7.999990873965335e-06, "loss": -0.0, "num_tokens": 4566659.0, "reward": 3.075000047683716, "reward_std": 1.008032202720642, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.4375, "rewards/probe_shaping_dominance/std": 0.504016101360321, "rewards/probe_terminal_raw/mean": 0.5625, "rewards/probe_terminal_raw/std": 0.504016101360321, "rewards/rollout_reward_func/mean": 0.125, "rewards/rollout_reward_func/std": 1.008032202720642, "sampling/importance_sampling_ratio/max": 1.6099581718444824, "sampling/importance_sampling_ratio/mean": 0.8238017559051514, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4553546905517578, "sampling/sampling_logp_difference/mean": 0.12532293796539307, "step": 193, "step_time": 10.576807229999758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5161035768687725, "epoch": 0.00194, "grad_norm": 0.0030419155955314636, "kl": 2.718808156438172, "learning_rate": 7.999990757339936e-06, "loss": -0.0, "step": 194, "step_time": 6.058262799999284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5364239364862442, "epoch": 0.00195, "frac_reward_zero_std": 0.75, "grad_norm": 0.003811279311776161, "kl": 0.7552666682749987, "learning_rate": 7.99999063997406e-06, "loss": -0.0, "num_tokens": 4616206.0, "reward": 2.918750047683716, "reward_std": 1.2309025526046753, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.4375, "rewards/probe_shaping_dominance/std": 0.504016101360321, "rewards/probe_terminal_raw/mean": 0.5625, "rewards/probe_terminal_raw/std": 0.504016101360321, "rewards/rollout_reward_func/mean": 0.125, "rewards/rollout_reward_func/std": 1.008032202720642, "sampling/importance_sampling_ratio/max": 1.210990071296692, "sampling/importance_sampling_ratio/mean": 0.9246749877929688, "sampling/importance_sampling_ratio/min": 0.5606986880302429, "sampling/sampling_logp_difference/max": 0.39641594886779785, "sampling/sampling_logp_difference/mean": 0.04741666093468666, "step": 195, "step_time": 10.604286133999722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.5519651547074318, "epoch": 0.00196, "grad_norm": 0.0031288517639040947, "kl": 0.757812971714884, "learning_rate": 7.999990521867708e-06, "loss": -0.0, "step": 196, "step_time": 6.057004269001027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9043734967708588, "epoch": 0.00197, "frac_reward_zero_std": 1.0, "grad_norm": 5.1686216465895995e-05, "kl": 0.6568926060572267, "learning_rate": 7.999990403020877e-06, "loss": 0.0, "num_tokens": 4660047.0, "reward": 2.543750047683716, "reward_std": 1.4560080766677856, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.1121628284454346, "sampling/importance_sampling_ratio/mean": 0.8413404226303101, "sampling/importance_sampling_ratio/min": 0.46061256527900696, "sampling/sampling_logp_difference/max": 0.463133841753006, "sampling/sampling_logp_difference/mean": 0.08755563199520111, "step": 197, "step_time": 10.420948368001063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9122908115386963, "epoch": 0.00198, "grad_norm": 4.955950134899467e-05, "kl": 0.6581820687279105, "learning_rate": 7.99999028343357e-06, "loss": 0.0, "step": 198, "step_time": 5.955800602000636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0404513776302338, "epoch": 0.00199, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003683996037580073, "kl": 0.8756526485085487, "learning_rate": 7.999990163105786e-06, "loss": 0.0, "num_tokens": 4708237.0, "reward": 1.9812500476837158, "reward_std": 1.2309025526046753, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 2.5098929405212402, "sampling/importance_sampling_ratio/mean": 0.9004417657852173, "sampling/importance_sampling_ratio/min": 0.1496535837650299, "sampling/sampling_logp_difference/max": 1.4193356037139893, "sampling/sampling_logp_difference/mean": 0.14671702682971954, "step": 199, "step_time": 10.53082898600087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.051968514919281, "epoch": 0.002, "grad_norm": 0.00044793030247092247, "kl": 0.8906243778765202, "learning_rate": 7.999990042037526e-06, "loss": 0.0, "step": 200, "step_time": 6.516393788000187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1170097887516022, "epoch": 0.00201, "frac_reward_zero_std": 1.0, "grad_norm": 6.0482478147605434e-05, "kl": 0.7796681722393259, "learning_rate": 7.999989920228787e-06, "loss": 0.0, "num_tokens": 4754978.0, "reward": 2.012500047683716, "reward_std": 1.2164862155914307, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.2457513809204102, "sampling/importance_sampling_ratio/mean": 0.8713302612304688, "sampling/importance_sampling_ratio/min": 0.3511073887348175, "sampling/sampling_logp_difference/max": 0.9598789215087891, "sampling/sampling_logp_difference/mean": 0.10739913582801819, "step": 201, "step_time": 9.898047209000197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1049348339438438, "epoch": 0.00202, "grad_norm": 9.396205859957263e-05, "kl": 0.7901200173655525, "learning_rate": 7.999989797679573e-06, "loss": 0.0, "step": 202, "step_time": 6.52449063100039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.033333334140479565, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0378850102424622, "epoch": 0.00203, "frac_reward_zero_std": 0.75, "grad_norm": 0.0005378347705118358, "kl": 0.2997244680300355, "learning_rate": 7.99998967438988e-06, "loss": -0.0001, "num_tokens": 4805189.0, "reward": 1.5750000476837158, "reward_std": 0.7931155562400818, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9375, "rewards/probe_shaping_dominance/std": 0.24593468010425568, "rewards/probe_terminal_raw/mean": 0.0625, "rewards/probe_terminal_raw/std": 0.24593468010425568, "rewards/rollout_reward_func/mean": -0.875, "rewards/rollout_reward_func/std": 0.49186936020851135, "sampling/importance_sampling_ratio/max": 1.14525306224823, "sampling/importance_sampling_ratio/mean": 0.758590817451477, "sampling/importance_sampling_ratio/min": 0.21433056890964508, "sampling/sampling_logp_difference/max": 1.4624667167663574, "sampling/sampling_logp_difference/mean": 0.15886078774929047, "step": 203, "step_time": 9.993277525999929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 1.058366522192955, "epoch": 0.00204, "grad_norm": 0.0009679613285697997, "kl": 0.3104899302124977, "learning_rate": 7.999989550359713e-06, "loss": -0.0001, "step": 204, "step_time": 6.569216364000113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0512825474143028, "epoch": 0.00205, "frac_reward_zero_std": 1.0, "grad_norm": 8.101179992081597e-05, "kl": 0.15521670621819794, "learning_rate": 7.999989425589066e-06, "loss": 0.0, "num_tokens": 4852978.0, "reward": 2.012500047683716, "reward_std": 1.2164862155914307, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.1806155443191528, "sampling/importance_sampling_ratio/mean": 0.8150932192802429, "sampling/importance_sampling_ratio/min": 0.44537922739982605, "sampling/sampling_logp_difference/max": 0.5530131459236145, "sampling/sampling_logp_difference/mean": 0.11240445077419281, "step": 205, "step_time": 10.127766082000107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0513731241226196, "epoch": 0.00206, "grad_norm": 0.00010898512846324593, "kl": 0.16864246875047684, "learning_rate": 7.999989300077943e-06, "loss": 0.0, "step": 206, "step_time": 6.666827258999547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9041898250579834, "epoch": 0.00207, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006515422137454152, "kl": 1.24289670586586, "learning_rate": 7.999989173826344e-06, "loss": 0.0, "num_tokens": 4900008.0, "reward": 1.6687500476837158, "reward_std": 0.45680341124534607, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.87306547164917, "sampling/importance_sampling_ratio/mean": 0.8855913281440735, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.2157533168792725, "sampling/sampling_logp_difference/mean": 0.18908201158046722, "step": 207, "step_time": 9.946056094000596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8971953243017197, "epoch": 0.00208, "grad_norm": 0.0004797672445420176, "kl": 1.1996175777167082, "learning_rate": 7.999989046834267e-06, "loss": 0.0, "step": 208, "step_time": 6.549594394000906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9334496781229973, "epoch": 0.00209, "frac_reward_zero_std": 1.0, "grad_norm": 2.8083361030439846e-05, "kl": 0.10746793961152434, "learning_rate": 7.999988919101714e-06, "loss": 0.0, "num_tokens": 4947000.0, "reward": 2.1062498092651367, "reward_std": 1.7799850702285767, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 1.4280457496643066, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.4213379621505737, "sampling/importance_sampling_ratio/mean": 0.8671090006828308, "sampling/importance_sampling_ratio/min": 0.6377511024475098, "sampling/sampling_logp_difference/max": 0.518344521522522, "sampling/sampling_logp_difference/mean": 0.08076303452253342, "step": 209, "step_time": 9.98248077000062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9456727504730225, "epoch": 0.0021, "grad_norm": 3.3368298318237066e-05, "kl": 0.1082556719193235, "learning_rate": 7.999988790628685e-06, "loss": 0.0, "step": 210, "step_time": 6.5108358289990065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.3125, "completions/mean_terminated_length": 2.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2894764840602875, "epoch": 0.00211, "frac_reward_zero_std": 1.0, "grad_norm": 0.01579872891306877, "kl": 1.3035051634069532, "learning_rate": 7.999988661415179e-06, "loss": 0.0, "num_tokens": 4996332.0, "reward": 1.2625000476837158, "reward_std": 0.4709290862083435, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.3125, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0238986015319824, "sampling/importance_sampling_ratio/mean": 0.7401514053344727, "sampling/importance_sampling_ratio/min": 0.20138022303581238, "sampling/sampling_logp_difference/max": 1.3580923080444336, "sampling/sampling_logp_difference/mean": 0.17557558417320251, "step": 211, "step_time": 10.527099645000362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2880031913518906, "epoch": 0.00212, "grad_norm": 0.0018287961138412356, "kl": 1.0675127455033362, "learning_rate": 7.999988531461196e-06, "loss": 0.0, "step": 212, "step_time": 6.03448314499974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 6.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9502601698040962, "epoch": 0.00213, "frac_reward_zero_std": 0.75, "grad_norm": 0.005205703433603048, "kl": 1.4420340545475483, "learning_rate": 7.999988400766736e-06, "loss": 0.0, "num_tokens": 5047044.0, "reward": 12.856250762939453, "reward_std": 58.7901496887207, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 12.09375, "rewards/probe_completion_length/std": 58.92715835571289, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 0.625, "rewards/probe_shaping_dominance/std": 0.49186936020851135, "rewards/probe_terminal_raw/mean": 0.375, "rewards/probe_terminal_raw/std": 0.49186936020851135, "rewards/rollout_reward_func/mean": -0.25, "rewards/rollout_reward_func/std": 0.9837387204170227, "sampling/importance_sampling_ratio/max": 2.39088773727417, "sampling/importance_sampling_ratio/mean": 0.8575817942619324, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5442379713058472, "sampling/sampling_logp_difference/mean": 0.19435152411460876, "step": 213, "step_time": 11.199458510000568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.9396770298480988, "epoch": 0.00214, "grad_norm": 0.0025155353359878063, "kl": 1.3642384968698025, "learning_rate": 7.999988269331798e-06, "loss": -0.0, "step": 214, "step_time": 6.795581481000227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7093618139624596, "epoch": 0.00215, "frac_reward_zero_std": 1.0, "grad_norm": 4.147308936808258e-05, "kl": 0.8155974000692368, "learning_rate": 7.999988137156384e-06, "loss": 0.0, "num_tokens": 5092001.0, "reward": 2.575000047683716, "reward_std": 1.4312187433242798, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.1610689163208008, "sampling/importance_sampling_ratio/mean": 0.9294631481170654, "sampling/importance_sampling_ratio/min": 0.6643704175949097, "sampling/sampling_logp_difference/max": 0.2963399291038513, "sampling/sampling_logp_difference/mean": 0.05530760437250137, "step": 215, "step_time": 10.349047988000166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7076744586229324, "epoch": 0.00216, "grad_norm": 2.461934491293505e-05, "kl": 0.8128750696778297, "learning_rate": 7.999988004240496e-06, "loss": 0.0, "step": 216, "step_time": 5.973635649000244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9698906242847443, "epoch": 0.00217, "frac_reward_zero_std": 0.75, "grad_norm": 0.0015630932757630944, "kl": 0.5165037027036306, "learning_rate": 7.99998787058413e-06, "loss": -0.0001, "num_tokens": 5138251.0, "reward": 2.481250047683716, "reward_std": 1.6260851621627808, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 1.221035361289978, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.65625, "rewards/probe_shaping_dominance/std": 0.4825586974620819, "rewards/probe_terminal_raw/mean": 0.34375, "rewards/probe_terminal_raw/std": 0.4825586974620819, "rewards/rollout_reward_func/mean": -0.3125, "rewards/rollout_reward_func/std": 0.9651173949241638, "sampling/importance_sampling_ratio/max": 1.0233986377716064, "sampling/importance_sampling_ratio/mean": 0.8025634288787842, "sampling/importance_sampling_ratio/min": 0.2496911883354187, "sampling/sampling_logp_difference/max": 1.2798473834991455, "sampling/sampling_logp_difference/mean": 0.1027006059885025, "step": 217, "step_time": 9.982337998999355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9763199612498283, "epoch": 0.00218, "grad_norm": 0.001598120667040348, "kl": 0.5372840870841173, "learning_rate": 7.999987736187286e-06, "loss": -0.0001, "step": 218, "step_time": 6.55624300799991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5885202661156654, "epoch": 0.00219, "frac_reward_zero_std": 0.75, "grad_norm": 0.0022829018998891115, "kl": 1.3220407664775848, "learning_rate": 7.999987601049967e-06, "loss": -0.0001, "num_tokens": 5188054.0, "reward": 3.168750047683716, "reward_std": 1.2110878229141235, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.3125, "rewards/probe_shaping_dominance/std": 0.4709290862083435, "rewards/probe_terminal_raw/mean": 0.6875, "rewards/probe_terminal_raw/std": 0.4709290862083435, "rewards/rollout_reward_func/mean": 0.375, "rewards/rollout_reward_func/std": 0.941858172416687, "sampling/importance_sampling_ratio/max": 1.3682610988616943, "sampling/importance_sampling_ratio/mean": 0.8985254764556885, "sampling/importance_sampling_ratio/min": 0.30704811215400696, "sampling/sampling_logp_difference/max": 1.0124280452728271, "sampling/sampling_logp_difference/mean": 0.07173188030719757, "step": 219, "step_time": 10.571298075000414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.5978320278227329, "epoch": 0.0022, "grad_norm": 0.0007615772192366421, "kl": 1.3663252778351307, "learning_rate": 7.99998746517217e-06, "loss": -0.0001, "step": 220, "step_time": 6.068325715998981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.091103121638298, "epoch": 0.00221, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003277143114246428, "kl": 0.8019103519618511, "learning_rate": 7.999987328553901e-06, "loss": 0.0, "num_tokens": 5236858.0, "reward": 1.6062500476837158, "reward_std": 0.4825586974620819, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9474204778671265, "sampling/importance_sampling_ratio/mean": 0.8071853518486023, "sampling/importance_sampling_ratio/min": 0.23529821634292603, "sampling/sampling_logp_difference/max": 1.3313655853271484, "sampling/sampling_logp_difference/mean": 0.15911154448986053, "step": 221, "step_time": 10.23791552100056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.089771643280983, "epoch": 0.00222, "grad_norm": 0.00040844481554813683, "kl": 0.8609168156981468, "learning_rate": 7.999987191195152e-06, "loss": 0.0, "step": 222, "step_time": 7.202484536999691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9173014238476753, "epoch": 0.00223, "frac_reward_zero_std": 1.0, "grad_norm": 7.766550697851926e-05, "kl": 1.2079013511538506, "learning_rate": 7.999987053095927e-06, "loss": 0.0, "num_tokens": 5288141.0, "reward": 2.012500047683716, "reward_std": 1.2164862155914307, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.8275642395019531, "sampling/importance_sampling_ratio/mean": 0.8998434543609619, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3255605697631836, "sampling/sampling_logp_difference/mean": 0.11937170475721359, "step": 223, "step_time": 10.373311099000148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9247178286314011, "epoch": 0.00224, "grad_norm": 7.279904093593359e-05, "kl": 1.2060724347829819, "learning_rate": 7.999986914256228e-06, "loss": 0.0, "step": 224, "step_time": 6.244458222000503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7969616949558258, "epoch": 0.00225, "frac_reward_zero_std": 0.5, "grad_norm": 0.0032166424207389355, "kl": 1.2331771329045296, "learning_rate": 7.99998677467605e-06, "loss": -0.0001, "num_tokens": 5329787.0, "reward": 2.543750047683716, "reward_std": 1.2406911849975586, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.59375, "rewards/probe_shaping_dominance/std": 0.49899089336395264, "rewards/probe_terminal_raw/mean": 0.40625, "rewards/probe_terminal_raw/std": 0.49899089336395264, "rewards/rollout_reward_func/mean": -0.1875, "rewards/rollout_reward_func/std": 0.9979817867279053, "sampling/importance_sampling_ratio/max": 0.9643709063529968, "sampling/importance_sampling_ratio/mean": 0.7731209993362427, "sampling/importance_sampling_ratio/min": 0.25780153274536133, "sampling/sampling_logp_difference/max": 1.1979801654815674, "sampling/sampling_logp_difference/mean": 0.11933620274066925, "step": 225, "step_time": 10.426335748000838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7953629866242409, "epoch": 0.00226, "grad_norm": 0.0023331514094024897, "kl": 1.408123280853033, "learning_rate": 7.999986634355396e-06, "loss": -0.0001, "step": 226, "step_time": 6.432960845000707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5347747877240181, "epoch": 0.00227, "frac_reward_zero_std": 1.0, "grad_norm": 8.225801138905808e-05, "kl": 0.4508226178586483, "learning_rate": 7.999986493294267e-06, "loss": 0.0, "num_tokens": 5367612.0, "reward": 3.262500047683716, "reward_std": 1.2296733856201172, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0039564371109009, "sampling/importance_sampling_ratio/mean": 0.9031103849411011, "sampling/importance_sampling_ratio/min": 0.511532723903656, "sampling/sampling_logp_difference/max": 0.46492481231689453, "sampling/sampling_logp_difference/mean": 0.04437258839607239, "step": 227, "step_time": 9.742115944000489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5368616580963135, "epoch": 0.00228, "grad_norm": 5.892190893064253e-05, "kl": 0.4490237596037332, "learning_rate": 7.99998635149266e-06, "loss": 0.0, "step": 228, "step_time": 5.865072176000012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0168335363268852, "epoch": 0.00229, "frac_reward_zero_std": 1.0, "grad_norm": 3.180320345563814e-05, "kl": 0.3754228218458593, "learning_rate": 7.99998620895058e-06, "loss": 0.0, "num_tokens": 5417542.0, "reward": 2.012500047683716, "reward_std": 1.2164862155914307, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.1908074617385864, "sampling/importance_sampling_ratio/mean": 0.8601307272911072, "sampling/importance_sampling_ratio/min": 0.5950575470924377, "sampling/sampling_logp_difference/max": 0.333740234375, "sampling/sampling_logp_difference/mean": 0.08275139331817627, "step": 229, "step_time": 10.50062071100001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.01875402033329, "epoch": 0.0023, "grad_norm": 3.042643038497772e-05, "kl": 0.3755720476619899, "learning_rate": 7.999986065668021e-06, "loss": 0.0, "step": 230, "step_time": 6.5292883610009085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1066185683012009, "epoch": 0.00231, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006037249695509672, "kl": 0.5976095534861088, "learning_rate": 7.999985921644988e-06, "loss": 0.0, "num_tokens": 5464839.0, "reward": 1.9500000476837158, "reward_std": 1.2443419694900513, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0169601440429688, "sampling/importance_sampling_ratio/mean": 0.7551096677780151, "sampling/importance_sampling_ratio/min": 0.1823217123746872, "sampling/sampling_logp_difference/max": 1.54960298538208, "sampling/sampling_logp_difference/mean": 0.14190775156021118, "step": 231, "step_time": 9.871105007000097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1074366867542267, "epoch": 0.00232, "grad_norm": 0.00048756369506008923, "kl": 0.5469358395785093, "learning_rate": 7.999985776881479e-06, "loss": 0.0, "step": 232, "step_time": 6.5496248439994815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8052812218666077, "epoch": 0.00233, "frac_reward_zero_std": 1.0, "grad_norm": 0.000179827562533319, "kl": 0.8202984808012843, "learning_rate": 7.999985631377493e-06, "loss": 0.0, "num_tokens": 5508160.0, "reward": 2.075000047683716, "reward_std": 1.1845782995224, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 2.091932773590088, "sampling/importance_sampling_ratio/mean": 0.9047304391860962, "sampling/importance_sampling_ratio/min": 0.3661075830459595, "sampling/sampling_logp_difference/max": 0.8838050961494446, "sampling/sampling_logp_difference/mean": 0.12408536672592163, "step": 233, "step_time": 10.55139884099981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8096337169408798, "epoch": 0.00234, "grad_norm": 0.00014511325571220368, "kl": 0.8095588530413806, "learning_rate": 7.99998548513303e-06, "loss": 0.0, "step": 234, "step_time": 6.070462433999637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9569649696350098, "epoch": 0.00235, "frac_reward_zero_std": 1.0, "grad_norm": 9.280137601308525e-05, "kl": 0.7535966765135527, "learning_rate": 7.999985338148094e-06, "loss": 0.0, "num_tokens": 5553778.0, "reward": 1.9812500476837158, "reward_std": 1.2309025526046753, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.3018815517425537, "sampling/importance_sampling_ratio/mean": 0.8392325639724731, "sampling/importance_sampling_ratio/min": 0.22106604278087616, "sampling/sampling_logp_difference/max": 0.8340189456939697, "sampling/sampling_logp_difference/mean": 0.11187630146741867, "step": 235, "step_time": 9.83470956599922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9559657499194145, "epoch": 0.00236, "grad_norm": 6.163664511404932e-05, "kl": 0.7435445003211498, "learning_rate": 7.99998519042268e-06, "loss": 0.0, "step": 236, "step_time": 6.452000556000257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.28125, "completions/mean_terminated_length": 2.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.294971227645874, "epoch": 0.00237, "frac_reward_zero_std": 1.0, "grad_norm": 4.2004070564871654e-05, "kl": 0.04399583343183622, "learning_rate": 7.99998504195679e-06, "loss": 0.0, "num_tokens": 5602651.0, "reward": 1.2312500476837158, "reward_std": 0.45680341124534607, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.28125, "rewards/probe_completion_length/std": 0.45680341124534607, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0553680658340454, "sampling/importance_sampling_ratio/mean": 0.7829026579856873, "sampling/importance_sampling_ratio/min": 0.41982340812683105, "sampling/sampling_logp_difference/max": 0.5661625862121582, "sampling/sampling_logp_difference/mean": 0.12211545556783676, "step": 237, "step_time": 10.069317695000791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.300062656402588, "epoch": 0.00238, "grad_norm": 3.1589595892000943e-05, "kl": 0.039544737548567355, "learning_rate": 7.999984892750425e-06, "loss": 0.0, "step": 238, "step_time": 6.534631725000509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9069351330399513, "epoch": 0.00239, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010841053153853863, "kl": 0.9024822637438774, "learning_rate": 7.999984742803586e-06, "loss": 0.0, "num_tokens": 5651408.0, "reward": 2.106250047683716, "reward_std": 1.1670026779174805, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0072801113128662, "sampling/importance_sampling_ratio/mean": 0.7536620497703552, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7853782176971436, "sampling/sampling_logp_difference/mean": 0.13082578778266907, "step": 239, "step_time": 10.10397600000033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9182785451412201, "epoch": 0.0024, "grad_norm": 0.00010150086745852605, "kl": 0.898620568215847, "learning_rate": 7.999984592116268e-06, "loss": 0.0, "step": 240, "step_time": 6.599010469999939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.791859045624733, "epoch": 0.00241, "frac_reward_zero_std": 1.0, "grad_norm": 0.001330450875684619, "kl": 0.6041602776385844, "learning_rate": 7.999984440688477e-06, "loss": 0.0, "num_tokens": 5695231.0, "reward": 2.762500047683716, "reward_std": 1.2556325197219849, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.0298597812652588, "sampling/importance_sampling_ratio/mean": 0.7487383484840393, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.494455337524414, "sampling/sampling_logp_difference/mean": 0.15715257823467255, "step": 241, "step_time": 9.925578503999986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7927238866686821, "epoch": 0.00242, "grad_norm": 0.0007401671609841287, "kl": 0.46895526768639684, "learning_rate": 7.999984288520209e-06, "loss": 0.0, "step": 242, "step_time": 5.904219341000498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8608187511563301, "epoch": 0.00243, "frac_reward_zero_std": 1.0, "grad_norm": 3.881949669448659e-05, "kl": 0.20748848652510787, "learning_rate": 7.999984135611465e-06, "loss": 0.0, "num_tokens": 5739390.0, "reward": 2.543750047683716, "reward_std": 1.4560080766677856, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.1807578802108765, "sampling/importance_sampling_ratio/mean": 0.9090617895126343, "sampling/importance_sampling_ratio/min": 0.38658732175827026, "sampling/sampling_logp_difference/max": 0.6620534062385559, "sampling/sampling_logp_difference/mean": 0.07685399055480957, "step": 243, "step_time": 10.37353355400046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8746257722377777, "epoch": 0.00244, "grad_norm": 4.032689321320504e-05, "kl": 0.20645966165466234, "learning_rate": 7.999983981962246e-06, "loss": 0.0, "step": 244, "step_time": 6.476325880000331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.375, "completions/mean_terminated_length": 2.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4716509729623795, "epoch": 0.00245, "frac_reward_zero_std": 1.0, "grad_norm": 9.890943329082802e-05, "kl": 0.11096427217125893, "learning_rate": 7.999983827572551e-06, "loss": 0.0, "num_tokens": 5786717.0, "reward": 1.3875000476837158, "reward_std": 1.435438632965088, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.4375, "rewards/probe_completion_length/std": 1.4354385137557983, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2329347133636475, "sampling/importance_sampling_ratio/mean": 0.7989503145217896, "sampling/importance_sampling_ratio/min": 0.3760721981525421, "sampling/sampling_logp_difference/max": 0.6286892890930176, "sampling/sampling_logp_difference/mean": 0.14064887166023254, "step": 245, "step_time": 9.838691988000392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.481234386563301, "epoch": 0.00246, "grad_norm": 0.00011490120232338086, "kl": 0.1183417895808816, "learning_rate": 7.999983672442382e-06, "loss": 0.0, "step": 246, "step_time": 5.9876724660002765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.71875, "completions/mean_terminated_length": 2.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9683929979801178, "epoch": 0.00247, "frac_reward_zero_std": 0.75, "grad_norm": 0.0005364255630411208, "kl": 1.5163855329155922, "learning_rate": 7.999983516571737e-06, "loss": -0.0, "num_tokens": 5834566.0, "reward": 2.606250047683716, "reward_std": 1.334634780883789, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.71875, "rewards/probe_completion_length/std": 0.45680341124534607, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.53125, "rewards/probe_shaping_dominance/std": 0.507007360458374, "rewards/probe_terminal_raw/mean": 0.46875, "rewards/probe_terminal_raw/std": 0.507007360458374, "rewards/rollout_reward_func/mean": -0.0625, "rewards/rollout_reward_func/std": 1.014014720916748, "sampling/importance_sampling_ratio/max": 1.0176223516464233, "sampling/importance_sampling_ratio/mean": 0.8093540668487549, "sampling/importance_sampling_ratio/min": 0.3403654098510742, "sampling/sampling_logp_difference/max": 0.8563747406005859, "sampling/sampling_logp_difference/mean": 0.11184030771255493, "step": 247, "step_time": 10.412933766999686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9714526236057281, "epoch": 0.00248, "grad_norm": 0.001109355129301548, "kl": 1.3958311006426811, "learning_rate": 7.999983359960615e-06, "loss": -0.0, "step": 248, "step_time": 6.410530790000394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2722000256180763, "epoch": 0.00249, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002374774921918288, "kl": 0.2879834126215428, "learning_rate": 7.999983202609019e-06, "loss": 0.0, "num_tokens": 5885261.0, "reward": 1.3875000476837158, "reward_std": 0.504016101360321, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.4375, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.276147484779358, "sampling/importance_sampling_ratio/mean": 0.7094168066978455, "sampling/importance_sampling_ratio/min": 0.21389688551425934, "sampling/sampling_logp_difference/max": 1.3515539169311523, "sampling/sampling_logp_difference/mean": 0.18311890959739685, "step": 249, "step_time": 10.01058061699996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.271938718855381, "epoch": 0.0025, "grad_norm": 0.0001647109747864306, "kl": 0.30384280625730753, "learning_rate": 7.999983044516948e-06, "loss": 0.0, "step": 250, "step_time": 6.048756611999579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.25, "completions/mean_terminated_length": 2.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4384316056966782, "epoch": 0.00251, "frac_reward_zero_std": 1.0, "grad_norm": 5.8330115280114114e-05, "kl": 0.07054687337949872, "learning_rate": 7.999982885684401e-06, "loss": 0.0, "num_tokens": 5931755.0, "reward": 1.2000000476837158, "reward_std": 0.4399413466453552, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.25, "rewards/probe_completion_length/std": 0.4399413466453552, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0516539812088013, "sampling/importance_sampling_ratio/mean": 0.7808082103729248, "sampling/importance_sampling_ratio/min": 0.33390164375305176, "sampling/sampling_logp_difference/max": 0.6143656969070435, "sampling/sampling_logp_difference/mean": 0.13979840278625488, "step": 251, "step_time": 10.83073114400031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4351339936256409, "epoch": 0.00252, "grad_norm": 4.7220288251992315e-05, "kl": 0.06005410762736574, "learning_rate": 7.99998272611138e-06, "loss": 0.0, "step": 252, "step_time": 5.9603065350002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2312592715024948, "epoch": 0.00253, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005001653335057199, "kl": 0.42731712557724677, "learning_rate": 7.999982565797882e-06, "loss": 0.0, "num_tokens": 5983713.0, "reward": 1.6999999284744263, "reward_std": 1.2443419694900513, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.6875, "rewards/probe_completion_length/std": 1.2556324005126953, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4038161039352417, "sampling/importance_sampling_ratio/mean": 0.8100531697273254, "sampling/importance_sampling_ratio/min": 0.29241448640823364, "sampling/sampling_logp_difference/max": 1.844306468963623, "sampling/sampling_logp_difference/mean": 0.14470317959785461, "step": 253, "step_time": 10.147666580999612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2324032559990883, "epoch": 0.00254, "grad_norm": 0.0008306861855089664, "kl": 0.47986630973173305, "learning_rate": 7.999982404743908e-06, "loss": 0.0, "step": 254, "step_time": 6.12800545500113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7071742042899132, "epoch": 0.00255, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011028751032426953, "kl": 1.5195309668779373, "learning_rate": 7.999982242949461e-06, "loss": 0.0, "num_tokens": 6031353.0, "reward": 2.325000047683716, "reward_std": 1.1845781803131104, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.7071067690849304, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.6566476821899414, "sampling/importance_sampling_ratio/mean": 0.869540810585022, "sampling/importance_sampling_ratio/min": 0.24219129979610443, "sampling/sampling_logp_difference/max": 0.9972515106201172, "sampling/sampling_logp_difference/mean": 0.09803014993667603, "step": 255, "step_time": 10.103539768000246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7113880515098572, "epoch": 0.00256, "grad_norm": 0.00011303301289444789, "kl": 1.5209232168272138, "learning_rate": 7.999982080414539e-06, "loss": 0.0, "step": 256, "step_time": 7.0594580389997645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.795683853328228, "epoch": 0.00257, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001374085695715621, "kl": 0.541247180350183, "learning_rate": 7.999981917139141e-06, "loss": 0.0, "num_tokens": 6079938.0, "reward": 2.731250047683716, "reward_std": 1.2885193824768066, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.2407677173614502, "sampling/importance_sampling_ratio/mean": 0.8522495031356812, "sampling/importance_sampling_ratio/min": 0.46599096059799194, "sampling/sampling_logp_difference/max": 0.549491286277771, "sampling/sampling_logp_difference/mean": 0.07480097562074661, "step": 257, "step_time": 10.19285303900051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7980589717626572, "epoch": 0.00258, "grad_norm": 0.0001367518270853907, "kl": 0.5404217857867479, "learning_rate": 7.999981753123268e-06, "loss": 0.0, "step": 258, "step_time": 6.162642582999524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5748703218996525, "epoch": 0.00259, "frac_reward_zero_std": 1.0, "grad_norm": 9.582992061041296e-05, "kl": 1.202084738528356, "learning_rate": 7.999981588366921e-06, "loss": 0.0, "num_tokens": 6122880.0, "reward": 2.762500047683716, "reward_std": 1.2556325197219849, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.2011796236038208, "sampling/importance_sampling_ratio/mean": 0.7956522703170776, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9468035697937012, "sampling/sampling_logp_difference/mean": 0.13236525654792786, "step": 259, "step_time": 9.850775819999853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5715639665722847, "epoch": 0.0026, "grad_norm": 9.380384290125221e-05, "kl": 1.196174212731421, "learning_rate": 7.999981422870099e-06, "loss": 0.0, "step": 260, "step_time": 6.849544752000384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5606408379971981, "epoch": 0.00261, "frac_reward_zero_std": 0.75, "grad_norm": 0.0020095854997634888, "kl": 0.7049203501082957, "learning_rate": 7.999981256632802e-06, "loss": -0.0001, "num_tokens": 6166733.0, "reward": 3.075000047683716, "reward_std": 1.263635277748108, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.34375, "rewards/probe_shaping_dominance/std": 0.4825586974620819, "rewards/probe_terminal_raw/mean": 0.65625, "rewards/probe_terminal_raw/std": 0.4825586974620819, "rewards/rollout_reward_func/mean": 0.3125, "rewards/rollout_reward_func/std": 0.9651173949241638, "sampling/importance_sampling_ratio/max": 1.6943975687026978, "sampling/importance_sampling_ratio/mean": 0.94061279296875, "sampling/importance_sampling_ratio/min": 0.6277596354484558, "sampling/sampling_logp_difference/max": 0.5732210874557495, "sampling/sampling_logp_difference/mean": 0.058817073702812195, "step": 261, "step_time": 9.924657725000088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5698106549680233, "epoch": 0.00262, "grad_norm": 0.0022092636208981276, "kl": 0.7005162222776562, "learning_rate": 7.999981089655028e-06, "loss": -0.0001, "step": 262, "step_time": 5.950440025000717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0628485679626465, "epoch": 0.00263, "frac_reward_zero_std": 0.75, "grad_norm": 0.0022156224586069584, "kl": 1.1021189391613007, "learning_rate": 7.999980921936782e-06, "loss": -0.0001, "num_tokens": 6216735.0, "reward": 1.7625000476837158, "reward_std": 0.9651173949241638, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.875, "rewards/probe_shaping_dominance/std": 0.33601075410842896, "rewards/probe_terminal_raw/mean": 0.125, "rewards/probe_terminal_raw/std": 0.33601075410842896, "rewards/rollout_reward_func/mean": -0.75, "rewards/rollout_reward_func/std": 0.6720215082168579, "sampling/importance_sampling_ratio/max": 1.2917293310165405, "sampling/importance_sampling_ratio/mean": 0.7865704298019409, "sampling/importance_sampling_ratio/min": 0.22512774169445038, "sampling/sampling_logp_difference/max": 1.3956575393676758, "sampling/sampling_logp_difference/mean": 0.15481485426425934, "step": 263, "step_time": 10.660381645999678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0550967678427696, "epoch": 0.00264, "grad_norm": 0.002029229188337922, "kl": 1.2448055073618889, "learning_rate": 7.999980753478058e-06, "loss": -0.0001, "step": 264, "step_time": 6.621740929000225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.34375, "completions/mean_terminated_length": 2.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3302894234657288, "epoch": 0.00265, "frac_reward_zero_std": 1.0, "grad_norm": 4.390978574519977e-05, "kl": 0.07602045580279082, "learning_rate": 7.999980584278861e-06, "loss": 0.0, "num_tokens": 6266793.0, "reward": 1.3562500476837158, "reward_std": 1.2664244174957275, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.40625, "rewards/probe_completion_length/std": 1.266424298286438, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5775710344314575, "sampling/importance_sampling_ratio/mean": 0.8080694079399109, "sampling/importance_sampling_ratio/min": 0.4113946259021759, "sampling/sampling_logp_difference/max": 0.6071059703826904, "sampling/sampling_logp_difference/mean": 0.13547483086585999, "step": 265, "step_time": 10.054258211000615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3213010132312775, "epoch": 0.00266, "grad_norm": 4.630487455870025e-05, "kl": 0.0754888579249382, "learning_rate": 7.999980414339192e-06, "loss": 0.0, "step": 266, "step_time": 6.096907744999498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5773092843592167, "epoch": 0.00267, "frac_reward_zero_std": 0.75, "grad_norm": 0.0018685347167775035, "kl": 1.856453113257885, "learning_rate": 7.999980243659046e-06, "loss": -0.0, "num_tokens": 6315035.0, "reward": 3.168750047683716, "reward_std": 1.2885193824768066, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.28125, "rewards/probe_shaping_dominance/std": 0.45680341124534607, "rewards/probe_terminal_raw/mean": 0.71875, "rewards/probe_terminal_raw/std": 0.45680341124534607, "rewards/rollout_reward_func/mean": 0.4375, "rewards/rollout_reward_func/std": 0.9136068224906921, "sampling/importance_sampling_ratio/max": 1.2453944683074951, "sampling/importance_sampling_ratio/mean": 0.9141055345535278, "sampling/importance_sampling_ratio/min": 0.3399631381034851, "sampling/sampling_logp_difference/max": 0.9761523008346558, "sampling/sampling_logp_difference/mean": 0.06499834358692169, "step": 267, "step_time": 9.843675738999991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5740438252687454, "epoch": 0.00268, "grad_norm": 0.0016455126460641623, "kl": 1.8581569492816925, "learning_rate": 7.999980072238424e-06, "loss": -0.0001, "step": 268, "step_time": 6.936017353999887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7434339039027691, "epoch": 0.00269, "frac_reward_zero_std": 1.0, "grad_norm": 4.207367601338774e-05, "kl": 0.4917968454919901, "learning_rate": 7.999979900077329e-06, "loss": 0.0, "num_tokens": 6363326.0, "reward": 2.637500047683716, "reward_std": 1.3781123161315918, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.6875, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.243372917175293, "sampling/importance_sampling_ratio/mean": 0.9188703298568726, "sampling/importance_sampling_ratio/min": 0.5505338311195374, "sampling/sampling_logp_difference/max": 0.6038958430290222, "sampling/sampling_logp_difference/mean": 0.07901027053594589, "step": 269, "step_time": 9.94298315099968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7558643519878387, "epoch": 0.0027, "grad_norm": 4.379342863103375e-05, "kl": 0.48969965358264744, "learning_rate": 7.99997972717576e-06, "loss": 0.0, "step": 270, "step_time": 6.032589478999853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6409019902348518, "epoch": 0.00271, "frac_reward_zero_std": 1.0, "grad_norm": 0.020250845700502396, "kl": 5.129798948764801, "learning_rate": 7.999979553533716e-06, "loss": 0.0001, "num_tokens": 6407150.0, "reward": 2.762500047683716, "reward_std": 1.2556325197219849, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.0568947792053223, "sampling/importance_sampling_ratio/mean": 0.7568554282188416, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.129774332046509, "sampling/sampling_logp_difference/mean": 0.1845780611038208, "step": 271, "step_time": 10.397699974999341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6460685841739178, "epoch": 0.00272, "grad_norm": 0.001033518579788506, "kl": 1.4755041021853685, "learning_rate": 7.999979379151197e-06, "loss": 0.0, "step": 272, "step_time": 6.4973694099994646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0658483356237411, "epoch": 0.00273, "frac_reward_zero_std": 1.0, "grad_norm": 9.214817691827193e-05, "kl": 0.6959873335435987, "learning_rate": 7.999979204028205e-06, "loss": 0.0, "num_tokens": 6453348.0, "reward": 2.075000047683716, "reward_std": 1.660742163658142, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 1.263635277748108, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0120024681091309, "sampling/importance_sampling_ratio/mean": 0.8527206778526306, "sampling/importance_sampling_ratio/min": 0.5133639574050903, "sampling/sampling_logp_difference/max": 0.5644892454147339, "sampling/sampling_logp_difference/mean": 0.08288981765508652, "step": 273, "step_time": 9.938741726000444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0469078943133354, "epoch": 0.00274, "grad_norm": 0.00013808686344418675, "kl": 0.6965378280729055, "learning_rate": 7.999979028164737e-06, "loss": 0.0, "step": 274, "step_time": 6.027978434999113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4376696143299341, "epoch": 0.00275, "frac_reward_zero_std": 1.0, "grad_norm": 1.7920627215062268e-05, "kl": 1.6866260170936584, "learning_rate": 7.999978851560795e-06, "loss": 0.0, "num_tokens": 6502294.0, "reward": 3.262500047683716, "reward_std": 1.2296733856201172, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.3865877389907837, "sampling/importance_sampling_ratio/mean": 0.9549223184585571, "sampling/importance_sampling_ratio/min": 0.7004026770591736, "sampling/sampling_logp_difference/max": 0.33465075492858887, "sampling/sampling_logp_difference/mean": 0.03543006628751755, "step": 275, "step_time": 9.919397402000413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43271936662495136, "epoch": 0.00276, "grad_norm": 1.5806304872967303e-05, "kl": 1.6843811720609665, "learning_rate": 7.999978674216379e-06, "loss": 0.0, "step": 276, "step_time": 6.94891806600026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.586757205426693, "epoch": 0.00277, "frac_reward_zero_std": 1.0, "grad_norm": 0.00038270672666840255, "kl": 2.193971574306488, "learning_rate": 7.99997849613149e-06, "loss": 0.0, "num_tokens": 6549161.0, "reward": 1.7625000476837158, "reward_std": 0.3965577781200409, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.461033821105957, "sampling/importance_sampling_ratio/mean": 0.9305300712585449, "sampling/importance_sampling_ratio/min": 0.20976446568965912, "sampling/sampling_logp_difference/max": 1.4805474281311035, "sampling/sampling_logp_difference/mean": 0.15049991011619568, "step": 277, "step_time": 10.054510989999471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5988273657858372, "epoch": 0.00278, "grad_norm": 0.00021034348173998296, "kl": 2.174222081899643, "learning_rate": 7.999978317306126e-06, "loss": 0.0, "step": 278, "step_time": 6.037595613000121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9894053190946579, "epoch": 0.00279, "frac_reward_zero_std": 1.0, "grad_norm": 7.872794958529994e-05, "kl": 0.7821534425020218, "learning_rate": 7.999978137740288e-06, "loss": 0.0, "num_tokens": 6597198.0, "reward": 2.1374998092651367, "reward_std": 1.7677668333053589, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.6875, "rewards/probe_completion_length/std": 1.4241578578948975, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.947765588760376, "sampling/importance_sampling_ratio/mean": 0.8623788356781006, "sampling/importance_sampling_ratio/min": 0.31989023089408875, "sampling/sampling_logp_difference/max": 0.7898147106170654, "sampling/sampling_logp_difference/mean": 0.11040355265140533, "step": 279, "step_time": 10.506534798000303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.00410046428442, "epoch": 0.0028, "grad_norm": 5.26364310644567e-05, "kl": 0.7675415091216564, "learning_rate": 7.999977957433975e-06, "loss": 0.0, "step": 280, "step_time": 6.0566636830008065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.3125, "completions/mean_terminated_length": 2.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.015853799879551, "epoch": 0.00281, "frac_reward_zero_std": 1.0, "grad_norm": 5.767720358562656e-05, "kl": 0.06597586660063826, "learning_rate": 7.999977776387188e-06, "loss": 0.0, "num_tokens": 6641048.0, "reward": 1.7625000476837158, "reward_std": 1.3060035705566406, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.3125, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 0.9529492855072021, "sampling/importance_sampling_ratio/mean": 0.8520058393478394, "sampling/importance_sampling_ratio/min": 0.516437292098999, "sampling/sampling_logp_difference/max": 0.35334205627441406, "sampling/sampling_logp_difference/mean": 0.07898004353046417, "step": 281, "step_time": 10.406129021000197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0222055688500404, "epoch": 0.00282, "grad_norm": 4.6576886234106496e-05, "kl": 0.06345673595205881, "learning_rate": 7.999977594599927e-06, "loss": 0.0, "step": 282, "step_time": 6.00192527199988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9479651227593422, "epoch": 0.00283, "frac_reward_zero_std": 1.0, "grad_norm": 3.634697350207716e-05, "kl": 0.46159003078355454, "learning_rate": 7.999977412072193e-06, "loss": 0.0, "num_tokens": 6684990.0, "reward": 1.9187500476837158, "reward_std": 1.256836175918579, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.1589446067810059, "sampling/importance_sampling_ratio/mean": 0.8635474443435669, "sampling/importance_sampling_ratio/min": 0.5721397995948792, "sampling/sampling_logp_difference/max": 0.4582357704639435, "sampling/sampling_logp_difference/mean": 0.08887867629528046, "step": 283, "step_time": 10.391508174999217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9357596710324287, "epoch": 0.00284, "grad_norm": 3.538501914590597e-05, "kl": 0.46022001723758876, "learning_rate": 7.999977228803984e-06, "loss": 0.0, "step": 284, "step_time": 5.972688897000353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.71875, "completions/mean_terminated_length": 2.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8306822814047337, "epoch": 0.00285, "frac_reward_zero_std": 0.75, "grad_norm": 0.003344170982018113, "kl": 1.2514954134821892, "learning_rate": 7.999977044795302e-06, "loss": 0.0, "num_tokens": 6730915.0, "reward": 2.106250047683716, "reward_std": 1.0809009075164795, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.71875, "rewards/probe_completion_length/std": 0.45680341124534607, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 1.9693899154663086, "sampling/importance_sampling_ratio/mean": 0.9473754167556763, "sampling/importance_sampling_ratio/min": 0.4411812424659729, "sampling/sampling_logp_difference/max": 0.7108652591705322, "sampling/sampling_logp_difference/mean": 0.10070252418518066, "step": 285, "step_time": 10.50476976299933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8169671483337879, "epoch": 0.00286, "grad_norm": 0.002965368563309312, "kl": 1.2458821088075638, "learning_rate": 7.999976860046145e-06, "loss": -0.0, "step": 286, "step_time": 6.519596415000251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9309164360165596, "epoch": 0.00287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011960415868088603, "kl": 1.297960703726858, "learning_rate": 7.999976674556518e-06, "loss": 0.0, "num_tokens": 6779611.0, "reward": 1.4500000476837158, "reward_std": 0.5080004930496216, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1808512210845947, "sampling/importance_sampling_ratio/mean": 0.9273348450660706, "sampling/importance_sampling_ratio/min": 0.3587171137332916, "sampling/sampling_logp_difference/max": 1.1489166021347046, "sampling/sampling_logp_difference/mean": 0.13752076029777527, "step": 287, "step_time": 10.203225437000128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9222960993647575, "epoch": 0.00288, "grad_norm": 0.0010410655522719026, "kl": 1.2635090134572238, "learning_rate": 7.999976488326414e-06, "loss": 0.0, "step": 288, "step_time": 6.195064060999812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.28125, "completions/mean_terminated_length": 2.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.526524841785431, "epoch": 0.00289, "frac_reward_zero_std": 1.0, "grad_norm": 8.704458741704002e-05, "kl": 0.1819338989444077, "learning_rate": 7.999976301355836e-06, "loss": 0.0, "num_tokens": 6830139.0, "reward": 1.2312500476837158, "reward_std": 0.45680341124534607, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.28125, "rewards/probe_completion_length/std": 0.45680341124534607, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6263999938964844, "sampling/importance_sampling_ratio/mean": 0.797622799873352, "sampling/importance_sampling_ratio/min": 0.3925057351589203, "sampling/sampling_logp_difference/max": 0.7666944265365601, "sampling/sampling_logp_difference/mean": 0.1322537660598755, "step": 289, "step_time": 10.485197891000098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5026581287384033, "epoch": 0.0029, "grad_norm": 9.270982991438359e-05, "kl": 0.18431306723505259, "learning_rate": 7.999976113644787e-06, "loss": 0.0, "step": 290, "step_time": 6.567377654000666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7679154798388481, "epoch": 0.00291, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001906748948385939, "kl": 1.2085663825273514, "learning_rate": 7.999975925193261e-06, "loss": 0.0, "num_tokens": 6878792.0, "reward": 2.731250047683716, "reward_std": 1.2885193824768066, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.0082029104232788, "sampling/importance_sampling_ratio/mean": 0.889763593673706, "sampling/importance_sampling_ratio/min": 0.6102039217948914, "sampling/sampling_logp_difference/max": 0.4290947914123535, "sampling/sampling_logp_difference/mean": 0.05696694180369377, "step": 291, "step_time": 10.025515152000025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.755341375246644, "epoch": 0.00292, "grad_norm": 9.211798169417307e-05, "kl": 1.1880728974938393, "learning_rate": 7.999975736001263e-06, "loss": 0.0, "step": 292, "step_time": 6.049688758999764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0296694338321686, "epoch": 0.00293, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004035626188851893, "kl": 0.7810260709375143, "learning_rate": 7.999975546068793e-06, "loss": 0.0, "num_tokens": 6929365.0, "reward": 1.4812500476837158, "reward_std": 0.507007360458374, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1541950702667236, "sampling/importance_sampling_ratio/mean": 0.9323853254318237, "sampling/importance_sampling_ratio/min": 0.3904179334640503, "sampling/sampling_logp_difference/max": 0.9739227294921875, "sampling/sampling_logp_difference/mean": 0.1474728286266327, "step": 293, "step_time": 10.618863860999681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0283247902989388, "epoch": 0.00294, "grad_norm": 0.0003429663192946464, "kl": 0.720265094190836, "learning_rate": 7.999975355395847e-06, "loss": 0.0, "step": 294, "step_time": 6.128017419000116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8973243683576584, "epoch": 0.00295, "frac_reward_zero_std": 0.75, "grad_norm": 0.002437910996377468, "kl": 0.7489821650087833, "learning_rate": 7.999975163982429e-06, "loss": -0.0001, "num_tokens": 6978906.0, "reward": 2.200000047683716, "reward_std": 1.4142135381698608, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.625, "rewards/probe_shaping_dominance/std": 0.49186936020851135, "rewards/probe_terminal_raw/mean": 0.375, "rewards/probe_terminal_raw/std": 0.49186936020851135, "rewards/rollout_reward_func/mean": -0.25, "rewards/rollout_reward_func/std": 0.9837387204170227, "sampling/importance_sampling_ratio/max": 1.274229884147644, "sampling/importance_sampling_ratio/mean": 0.8626682162284851, "sampling/importance_sampling_ratio/min": 0.45520147681236267, "sampling/sampling_logp_difference/max": 0.44555020332336426, "sampling/sampling_logp_difference/mean": 0.09587711840867996, "step": 295, "step_time": 10.517878631000258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8956578373908997, "epoch": 0.00296, "grad_norm": 0.0023497892543673515, "kl": 0.7520033298060298, "learning_rate": 7.999974971828538e-06, "loss": -0.0001, "step": 296, "step_time": 6.085854749000191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8559794425964355, "epoch": 0.00297, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006165934028103948, "kl": 1.2970306277275085, "learning_rate": 7.999974778934173e-06, "loss": 0.0, "num_tokens": 7022843.0, "reward": 2.075000047683716, "reward_std": 1.1845782995224, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 2.3478376865386963, "sampling/importance_sampling_ratio/mean": 0.9364867210388184, "sampling/importance_sampling_ratio/min": 0.2075859159231186, "sampling/sampling_logp_difference/max": 1.4937734603881836, "sampling/sampling_logp_difference/mean": 0.11856293678283691, "step": 297, "step_time": 10.372231002998888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8517452776432037, "epoch": 0.00298, "grad_norm": 0.0012263483367860317, "kl": 1.4815890714526176, "learning_rate": 7.999974585299335e-06, "loss": 0.0, "step": 298, "step_time": 6.441055234999112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1535784602165222, "epoch": 0.00299, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010754216782515869, "kl": 0.1399077856913209, "learning_rate": 7.999974390924023e-06, "loss": 0.0, "num_tokens": 7074516.0, "reward": 1.4500000476837158, "reward_std": 0.5080004930496216, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 0.9696638584136963, "sampling/importance_sampling_ratio/mean": 0.8000353574752808, "sampling/importance_sampling_ratio/min": 0.4501090347766876, "sampling/sampling_logp_difference/max": 0.7076842188835144, "sampling/sampling_logp_difference/mean": 0.11227491497993469, "step": 299, "step_time": 10.031227658000262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1581177338957787, "epoch": 0.003, "grad_norm": 0.00011685074423439801, "kl": 0.1419827735517174, "learning_rate": 7.999974195808239e-06, "loss": 0.0, "step": 300, "step_time": 6.074399898000138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8323883041739464, "epoch": 0.00301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001984036498470232, "kl": 0.813261678442359, "learning_rate": 7.999973999951982e-06, "loss": 0.0, "num_tokens": 7125615.0, "reward": 2.043750047683716, "reward_std": 1.201058030128479, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0074903964996338, "sampling/importance_sampling_ratio/mean": 0.8131163716316223, "sampling/importance_sampling_ratio/min": 0.4442375898361206, "sampling/sampling_logp_difference/max": 0.6795529127120972, "sampling/sampling_logp_difference/mean": 0.09874939173460007, "step": 301, "step_time": 11.23406893099991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8443499356508255, "epoch": 0.00302, "grad_norm": 0.00015355952200479805, "kl": 0.8130504302680492, "learning_rate": 7.99997380335525e-06, "loss": 0.0, "step": 302, "step_time": 6.22487125599946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6427240371704102, "epoch": 0.00303, "frac_reward_zero_std": 1.0, "grad_norm": 0.000308831047732383, "kl": 1.0533999186009169, "learning_rate": 7.999973606018048e-06, "loss": 0.0, "num_tokens": 7174736.0, "reward": 2.231250047683716, "reward_std": 1.0846248865127563, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.6875, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.09375, "rewards/probe_invalid_count/std": 0.2961445748806, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.6255303621292114, "sampling/importance_sampling_ratio/mean": 0.8998515009880066, "sampling/importance_sampling_ratio/min": 0.24930788576602936, "sampling/sampling_logp_difference/max": 1.3098429441452026, "sampling/sampling_logp_difference/mean": 0.11029675602912903, "step": 303, "step_time": 9.959219268000197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6381810642778873, "epoch": 0.00304, "grad_norm": 0.00026818981859833, "kl": 1.0389648918062449, "learning_rate": 7.99997340794037e-06, "loss": 0.0, "step": 304, "step_time": 6.502279902000737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9642986953258514, "epoch": 0.00305, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017586298054084182, "kl": 0.37446144194109365, "learning_rate": 7.999973209122222e-06, "loss": 0.0, "num_tokens": 7224003.0, "reward": 1.4187500476837158, "reward_std": 0.507007360458374, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.28328537940979, "sampling/importance_sampling_ratio/mean": 0.8521016836166382, "sampling/importance_sampling_ratio/min": 0.4148902893066406, "sampling/sampling_logp_difference/max": 0.7877531051635742, "sampling/sampling_logp_difference/mean": 0.10069666802883148, "step": 305, "step_time": 10.471455122999942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9792665392160416, "epoch": 0.00306, "grad_norm": 0.00018103634647559375, "kl": 0.3658354175131535, "learning_rate": 7.999973009563599e-06, "loss": 0.0, "step": 306, "step_time": 6.058177401000194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8190870024263859, "epoch": 0.00307, "frac_reward_zero_std": 0.75, "grad_norm": 0.002435210859403014, "kl": 0.9355106446892023, "learning_rate": 7.999972809264505e-06, "loss": -0.0001, "num_tokens": 7271665.0, "reward": 2.293750047683716, "reward_std": 1.2853862047195435, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.65625, "rewards/probe_shaping_dominance/std": 0.4825586974620819, "rewards/probe_terminal_raw/mean": 0.34375, "rewards/probe_terminal_raw/std": 0.4825586974620819, "rewards/rollout_reward_func/mean": -0.3125, "rewards/rollout_reward_func/std": 0.9651173949241638, "sampling/importance_sampling_ratio/max": 1.3618639707565308, "sampling/importance_sampling_ratio/mean": 0.8423999547958374, "sampling/importance_sampling_ratio/min": 0.15370477735996246, "sampling/sampling_logp_difference/max": 1.2758878469467163, "sampling/sampling_logp_difference/mean": 0.1437636762857437, "step": 307, "step_time": 10.07711297600008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.8193341009318829, "epoch": 0.00308, "grad_norm": 0.002486324403434992, "kl": 0.9421810358762741, "learning_rate": 7.999972608224937e-06, "loss": -0.0001, "step": 308, "step_time": 6.035029341999234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.375, "completions/mean_terminated_length": 2.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7972683534026146, "epoch": 0.00309, "frac_reward_zero_std": 1.0, "grad_norm": 2.102801408909727e-05, "kl": 0.66733174957335, "learning_rate": 7.999972406444895e-06, "loss": 0.0, "num_tokens": 7324209.0, "reward": 1.3250000476837158, "reward_std": 0.49186936020851135, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.375, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0272186994552612, "sampling/importance_sampling_ratio/mean": 0.8698651790618896, "sampling/importance_sampling_ratio/min": 0.6035609245300293, "sampling/sampling_logp_difference/max": 0.34899115562438965, "sampling/sampling_logp_difference/mean": 0.06759645789861679, "step": 309, "step_time": 11.134619106000173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8096836917102337, "epoch": 0.0031, "grad_norm": 1.849373620643746e-05, "kl": 0.6643700585700572, "learning_rate": 7.999972203924383e-06, "loss": 0.0, "step": 310, "step_time": 6.1093775609997465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1299720034003258, "epoch": 0.00311, "frac_reward_zero_std": 1.0, "grad_norm": 0.003075299086049199, "kl": 1.807578288950026, "learning_rate": 7.999972000663396e-06, "loss": 0.0, "num_tokens": 7369434.0, "reward": 1.5750000476837158, "reward_std": 1.263635277748108, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 1.263635277748108, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1367149353027344, "sampling/importance_sampling_ratio/mean": 0.816227912902832, "sampling/importance_sampling_ratio/min": 0.17019084095954895, "sampling/sampling_logp_difference/max": 1.5584993362426758, "sampling/sampling_logp_difference/mean": 0.14971722662448883, "step": 311, "step_time": 9.711283419999745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1384611800312996, "epoch": 0.00312, "grad_norm": 0.001805527019314468, "kl": 1.2573999939486384, "learning_rate": 7.999971796661938e-06, "loss": 0.0, "step": 312, "step_time": 6.865537390999634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8577270582318306, "epoch": 0.00313, "frac_reward_zero_std": 0.75, "grad_norm": 0.0013959156349301338, "kl": 1.3633378157392144, "learning_rate": 7.999971591920007e-06, "loss": -0.0, "num_tokens": 7419095.0, "reward": 1.9187500476837158, "reward_std": 1.1773227453231812, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 1.8378033638000488, "sampling/importance_sampling_ratio/mean": 0.9367517232894897, "sampling/importance_sampling_ratio/min": 0.4363749027252197, "sampling/sampling_logp_difference/max": 0.8578743934631348, "sampling/sampling_logp_difference/mean": 0.12448535114526749, "step": 313, "step_time": 10.324483344001237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8521794825792313, "epoch": 0.00314, "grad_norm": 0.0013546322006732225, "kl": 1.3631921615451574, "learning_rate": 7.999971386437603e-06, "loss": -0.0, "step": 314, "step_time": 6.133231628999511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.21875, "completions/mean_terminated_length": 2.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2760131061077118, "epoch": 0.00315, "frac_reward_zero_std": 1.0, "grad_norm": 6.7212684371043e-05, "kl": 0.1668471217271872, "learning_rate": 7.999971180214728e-06, "loss": 0.0, "num_tokens": 7469149.0, "reward": 1.1687500476837158, "reward_std": 0.420013427734375, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.21875, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3021711111068726, "sampling/importance_sampling_ratio/mean": 0.8215183019638062, "sampling/importance_sampling_ratio/min": 0.38338208198547363, "sampling/sampling_logp_difference/max": 0.8493318557739258, "sampling/sampling_logp_difference/mean": 0.13118374347686768, "step": 315, "step_time": 10.013548799000091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2984453290700912, "epoch": 0.00316, "grad_norm": 5.114684608997777e-05, "kl": 0.16854519414482638, "learning_rate": 7.99997097325138e-06, "loss": 0.0, "step": 316, "step_time": 7.113780104000853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0801144316792488, "epoch": 0.00317, "frac_reward_zero_std": 1.0, "grad_norm": 8.528330363333225e-05, "kl": 0.07406117697246373, "learning_rate": 7.999970765547559e-06, "loss": 0.0, "num_tokens": 7517551.0, "reward": 2.012500047683716, "reward_std": 1.2164862155914307, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.1232194900512695, "sampling/importance_sampling_ratio/mean": 0.8244851231575012, "sampling/importance_sampling_ratio/min": 0.3947485089302063, "sampling/sampling_logp_difference/max": 0.4756275415420532, "sampling/sampling_logp_difference/mean": 0.10443481802940369, "step": 317, "step_time": 9.911432873999729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0896883979439735, "epoch": 0.00318, "grad_norm": 8.261961193056777e-05, "kl": 0.08147628034930676, "learning_rate": 7.999970557103267e-06, "loss": 0.0, "step": 318, "step_time": 6.029059536999739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.34375, "completions/mean_terminated_length": 2.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1932886689901352, "epoch": 0.00319, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011847085261251777, "kl": 0.7054581567645073, "learning_rate": 7.999970347918501e-06, "loss": 0.0, "num_tokens": 7565287.0, "reward": 1.7937500476837158, "reward_std": 1.297873616218567, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.34375, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.246423363685608, "sampling/importance_sampling_ratio/mean": 0.8327354192733765, "sampling/importance_sampling_ratio/min": 0.40081343054771423, "sampling/sampling_logp_difference/max": 0.558934211730957, "sampling/sampling_logp_difference/mean": 0.11713166534900665, "step": 319, "step_time": 9.851570212000752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.194627471268177, "epoch": 0.0032, "grad_norm": 0.00012471656373236328, "kl": 0.6935346499085426, "learning_rate": 7.999970137993264e-06, "loss": 0.0, "step": 320, "step_time": 7.014572012999906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5268306843936443, "epoch": 0.00321, "frac_reward_zero_std": 1.0, "grad_norm": 1.4538780305883847e-05, "kl": 0.13023399263329338, "learning_rate": 7.999969927327556e-06, "loss": 0.0, "num_tokens": 7605921.0, "reward": 3.231250047683716, "reward_std": 1.2759405374526978, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.2444671392440796, "sampling/importance_sampling_ratio/mean": 0.9207519292831421, "sampling/importance_sampling_ratio/min": 0.4403969347476959, "sampling/sampling_logp_difference/max": 0.6484390497207642, "sampling/sampling_logp_difference/mean": 0.043011896312236786, "step": 321, "step_time": 9.745958758999222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5186222679913044, "epoch": 0.00322, "grad_norm": 1.4087959243624937e-05, "kl": 0.13184529542922974, "learning_rate": 7.999969715921373e-06, "loss": 0.0, "step": 322, "step_time": 5.857027678000122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.45947896130383015, "epoch": 0.00323, "frac_reward_zero_std": 1.0, "grad_norm": 3.156924503855407e-05, "kl": 1.6311049610376358, "learning_rate": 7.999969503774719e-06, "loss": 0.0, "num_tokens": 7647859.0, "reward": 3.325000047683716, "reward_std": 1.1288018226623535, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 0.995955228805542, "sampling/importance_sampling_ratio/mean": 0.9161585569381714, "sampling/importance_sampling_ratio/min": 0.4671463966369629, "sampling/sampling_logp_difference/max": 0.5933887958526611, "sampling/sampling_logp_difference/mean": 0.040462058037519455, "step": 323, "step_time": 10.309850834999452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4685290865600109, "epoch": 0.00324, "grad_norm": 7.684806769248098e-05, "kl": 1.6309725642204285, "learning_rate": 7.999969290887594e-06, "loss": 0.0, "step": 324, "step_time": 6.386313178999899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.375, "completions/mean_terminated_length": 2.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2863778471946716, "epoch": 0.00325, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012449390487745404, "kl": 0.10761683306191117, "learning_rate": 7.999969077259998e-06, "loss": 0.0, "num_tokens": 7696966.0, "reward": 1.3875000476837158, "reward_std": 1.435438632965088, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.4375, "rewards/probe_completion_length/std": 1.4354385137557983, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1330184936523438, "sampling/importance_sampling_ratio/mean": 0.7982755899429321, "sampling/importance_sampling_ratio/min": 0.30686163902282715, "sampling/sampling_logp_difference/max": 1.0647956132888794, "sampling/sampling_logp_difference/mean": 0.1156022697687149, "step": 325, "step_time": 9.99423157000001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.289127841591835, "epoch": 0.00326, "grad_norm": 9.207100083585829e-05, "kl": 0.09634128154721111, "learning_rate": 7.999968862891929e-06, "loss": 0.0, "step": 326, "step_time": 6.051717358000587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.71875, "completions/mean_terminated_length": 2.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.097517840564251, "epoch": 0.00327, "frac_reward_zero_std": 0.75, "grad_norm": 0.0018292975146323442, "kl": 0.41003574151545763, "learning_rate": 7.999968647783389e-06, "loss": -0.0001, "num_tokens": 7743961.0, "reward": 1.9500000476837158, "reward_std": 1.4591203927993774, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 1.2443419694900513, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.875, "rewards/probe_shaping_dominance/std": 0.33601075410842896, "rewards/probe_terminal_raw/mean": 0.125, "rewards/probe_terminal_raw/std": 0.33601075410842896, "rewards/rollout_reward_func/mean": -0.75, "rewards/rollout_reward_func/std": 0.6720215082168579, "sampling/importance_sampling_ratio/max": 1.4463415145874023, "sampling/importance_sampling_ratio/mean": 0.7840434312820435, "sampling/importance_sampling_ratio/min": 0.21838130056858063, "sampling/sampling_logp_difference/max": 1.3481109142303467, "sampling/sampling_logp_difference/mean": 0.14009910821914673, "step": 327, "step_time": 10.658869628999582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0987923666834831, "epoch": 0.00328, "grad_norm": 0.0015403992729261518, "kl": 0.40840206295251846, "learning_rate": 7.999968431934376e-06, "loss": -0.0001, "step": 328, "step_time": 6.608717246999731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3225805312395096, "epoch": 0.00329, "frac_reward_zero_std": 1.0, "grad_norm": 0.000299303384963423, "kl": 0.4579396164044738, "learning_rate": 7.999968215344892e-06, "loss": 0.0, "num_tokens": 7794251.0, "reward": 1.4187500476837158, "reward_std": 0.507007360458374, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.700522541999817, "sampling/importance_sampling_ratio/mean": 0.7628653049468994, "sampling/importance_sampling_ratio/min": 0.2527129054069519, "sampling/sampling_logp_difference/max": 1.2428406476974487, "sampling/sampling_logp_difference/mean": 0.17843830585479736, "step": 329, "step_time": 10.212800845000402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.32346860319376, "epoch": 0.0033, "grad_norm": 0.00023284112103283405, "kl": 0.41496767988428473, "learning_rate": 7.999967998014936e-06, "loss": 0.0, "step": 330, "step_time": 6.193185579000328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0128037929534912, "epoch": 0.00331, "frac_reward_zero_std": 1.0, "grad_norm": 5.9650203183991835e-05, "kl": 0.9195960871875286, "learning_rate": 7.999967779944508e-06, "loss": 0.0, "num_tokens": 7844694.0, "reward": 1.9187500476837158, "reward_std": 1.256836175918579, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.2084201574325562, "sampling/importance_sampling_ratio/mean": 0.8935514688491821, "sampling/importance_sampling_ratio/min": 0.5958609580993652, "sampling/sampling_logp_difference/max": 0.49711179733276367, "sampling/sampling_logp_difference/mean": 0.08623553812503815, "step": 331, "step_time": 10.570847360000243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0223295539617538, "epoch": 0.00332, "grad_norm": 4.8721845814725384e-05, "kl": 0.9137447625398636, "learning_rate": 7.99996756113361e-06, "loss": 0.0, "step": 332, "step_time": 6.144921876000353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6741441264748573, "epoch": 0.00333, "frac_reward_zero_std": 1.0, "grad_norm": 1.6808338841656223e-05, "kl": 0.7198268654756248, "learning_rate": 7.999967341582239e-06, "loss": 0.0, "num_tokens": 7890635.0, "reward": 2.512500047683716, "reward_std": 1.4797013998031616, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 0.9704124927520752, "sampling/importance_sampling_ratio/mean": 0.8803048133850098, "sampling/importance_sampling_ratio/min": 0.6561123728752136, "sampling/sampling_logp_difference/max": 0.5014957785606384, "sampling/sampling_logp_difference/mean": 0.05533091723918915, "step": 333, "step_time": 10.435302833999685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.681053951382637, "epoch": 0.00334, "grad_norm": 2.799690446408931e-05, "kl": 0.7224257413763553, "learning_rate": 7.999967121290396e-06, "loss": 0.0, "step": 334, "step_time": 6.085340915999041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7493787109851837, "epoch": 0.00335, "frac_reward_zero_std": 1.0, "grad_norm": 8.788624836597592e-05, "kl": 0.3644955639792897, "learning_rate": 7.999966900258084e-06, "loss": 0.0, "num_tokens": 7935553.0, "reward": 2.731250047683716, "reward_std": 1.2885193824768066, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.079957365989685, "sampling/importance_sampling_ratio/mean": 0.87494957447052, "sampling/importance_sampling_ratio/min": 0.37640491127967834, "sampling/sampling_logp_difference/max": 0.6546320915222168, "sampling/sampling_logp_difference/mean": 0.07020466029644012, "step": 335, "step_time": 10.833335889999944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7399843037128448, "epoch": 0.00336, "grad_norm": 4.674637602875009e-05, "kl": 0.3563638808336691, "learning_rate": 7.9999666784853e-06, "loss": 0.0, "step": 336, "step_time": 6.176430282999718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2268180549144745, "epoch": 0.00337, "frac_reward_zero_std": 0.75, "grad_norm": 0.0032034514006227255, "kl": 0.92481086589396, "learning_rate": 7.999966455972044e-06, "loss": -0.0001, "num_tokens": 7985896.0, "reward": 1.8875000476837158, "reward_std": 1.1053389310836792, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.8125, "rewards/probe_shaping_dominance/std": 0.3965577781200409, "rewards/probe_terminal_raw/mean": 0.1875, "rewards/probe_terminal_raw/std": 0.3965577781200409, "rewards/rollout_reward_func/mean": -0.625, "rewards/rollout_reward_func/std": 0.7931155562400818, "sampling/importance_sampling_ratio/max": 1.6343834400177002, "sampling/importance_sampling_ratio/mean": 0.828697919845581, "sampling/importance_sampling_ratio/min": 0.12531737983226776, "sampling/sampling_logp_difference/max": 1.9917540550231934, "sampling/sampling_logp_difference/mean": 0.15161240100860596, "step": 337, "step_time": 11.238573729999644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 1.2293306812644005, "epoch": 0.00338, "grad_norm": 0.0005875456845387816, "kl": 0.9996783956885338, "learning_rate": 7.999966232718316e-06, "loss": -0.0001, "step": 338, "step_time": 6.34771773500006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9276796728372574, "epoch": 0.00339, "frac_reward_zero_std": 1.0, "grad_norm": 4.245446325512603e-05, "kl": 0.5849672869662754, "learning_rate": 7.999966008724119e-06, "loss": 0.0, "num_tokens": 8037066.0, "reward": 1.8875000476837158, "reward_std": 1.2684128284454346, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.4375, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.435063362121582, "sampling/importance_sampling_ratio/mean": 0.895423173904419, "sampling/importance_sampling_ratio/min": 0.6500217914581299, "sampling/sampling_logp_difference/max": 0.41063392162323, "sampling/sampling_logp_difference/mean": 0.08168335258960724, "step": 339, "step_time": 11.09809919000054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9489234611392021, "epoch": 0.0034, "grad_norm": 5.4233478294918314e-05, "kl": 0.5783893242478371, "learning_rate": 7.99996578398945e-06, "loss": 0.0, "step": 340, "step_time": 6.86810050899885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6301294788718224, "epoch": 0.00341, "frac_reward_zero_std": 1.0, "grad_norm": 3.1229348678607494e-05, "kl": 0.5702050784602761, "learning_rate": 7.99996555851431e-06, "loss": 0.0, "num_tokens": 8084129.0, "reward": 2.731250047683716, "reward_std": 1.2885193824768066, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.1891618967056274, "sampling/importance_sampling_ratio/mean": 0.8978214263916016, "sampling/importance_sampling_ratio/min": 0.6014400124549866, "sampling/sampling_logp_difference/max": 0.3437986373901367, "sampling/sampling_logp_difference/mean": 0.05562354624271393, "step": 341, "step_time": 10.276972778999607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6289165541529655, "epoch": 0.00342, "grad_norm": 0.00010118436330230907, "kl": 0.5909891685005277, "learning_rate": 7.999965332298698e-06, "loss": 0.0, "step": 342, "step_time": 6.750384622999718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.680240910500288, "epoch": 0.00343, "frac_reward_zero_std": 1.0, "grad_norm": 9.979560854844749e-05, "kl": 1.108576413244009, "learning_rate": 7.999965105342615e-06, "loss": 0.0, "num_tokens": 8130707.0, "reward": 2.793750047683716, "reward_std": 1.221035361289978, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 0.974475622177124, "sampling/importance_sampling_ratio/mean": 0.7939993143081665, "sampling/importance_sampling_ratio/min": 0.2770652174949646, "sampling/sampling_logp_difference/max": 0.9215399026870728, "sampling/sampling_logp_difference/mean": 0.10923172533512115, "step": 343, "step_time": 10.298724081999353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6709529925137758, "epoch": 0.00344, "grad_norm": 0.00011715908476617187, "kl": 1.1041308715939522, "learning_rate": 7.999964877646064e-06, "loss": 0.0, "step": 344, "step_time": 6.713455954999972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.934910848736763, "epoch": 0.00345, "frac_reward_zero_std": 1.0, "grad_norm": 0.001362826325930655, "kl": 0.9546709419228137, "learning_rate": 7.99996464920904e-06, "loss": 0.0, "num_tokens": 8179015.0, "reward": 2.012500047683716, "reward_std": 1.2164862155914307, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.38411545753479, "sampling/importance_sampling_ratio/mean": 0.8699967861175537, "sampling/importance_sampling_ratio/min": 0.14834560453891754, "sampling/sampling_logp_difference/max": 1.8495113849639893, "sampling/sampling_logp_difference/mean": 0.11092771589756012, "step": 345, "step_time": 10.603762035999353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9600491523742676, "epoch": 0.00346, "grad_norm": 0.0016778368735685945, "kl": 1.0116195227019489, "learning_rate": 7.999964420031546e-06, "loss": 0.0, "step": 346, "step_time": 6.956354311000268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5329532325267792, "epoch": 0.00347, "frac_reward_zero_std": 0.75, "grad_norm": 0.0027003376744687557, "kl": 1.3288137391209602, "learning_rate": 7.99996419011358e-06, "loss": -0.0001, "num_tokens": 8224667.0, "reward": 3.168750047683716, "reward_std": 1.2110878229141235, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.3125, "rewards/probe_shaping_dominance/std": 0.4709290862083435, "rewards/probe_terminal_raw/mean": 0.6875, "rewards/probe_terminal_raw/std": 0.4709290862083435, "rewards/rollout_reward_func/mean": 0.375, "rewards/rollout_reward_func/std": 0.941858172416687, "sampling/importance_sampling_ratio/max": 1.0700640678405762, "sampling/importance_sampling_ratio/mean": 0.8882616758346558, "sampling/importance_sampling_ratio/min": 0.14675666391849518, "sampling/sampling_logp_difference/max": 1.6794652938842773, "sampling/sampling_logp_difference/mean": 0.06773966550827026, "step": 347, "step_time": 10.328664960999504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.5438846126198769, "epoch": 0.00348, "grad_norm": 0.0006248099962249398, "kl": 1.3313154578208923, "learning_rate": 7.999963959455145e-06, "loss": -0.0001, "step": 348, "step_time": 6.798959563998778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8449730277061462, "epoch": 0.00349, "frac_reward_zero_std": 0.75, "grad_norm": 0.0066831219010055065, "kl": 1.3654840737581253, "learning_rate": 7.999963728056238e-06, "loss": -0.0, "num_tokens": 8270891.0, "reward": 2.012500047683716, "reward_std": 1.16224205493927, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 1.4426002502441406, "sampling/importance_sampling_ratio/mean": 0.8822833299636841, "sampling/importance_sampling_ratio/min": 0.3499508798122406, "sampling/sampling_logp_difference/max": 1.0837316513061523, "sampling/sampling_logp_difference/mean": 0.0948411375284195, "step": 349, "step_time": 10.381334160999359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.846816748380661, "epoch": 0.0035, "grad_norm": 0.0034231310710310936, "kl": 1.6555369943380356, "learning_rate": 7.99996349591686e-06, "loss": -0.0, "step": 350, "step_time": 6.824553742999797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4471484422683716, "epoch": 0.00351, "frac_reward_zero_std": 1.0, "grad_norm": 0.00028458540327847004, "kl": 0.40467185433954, "learning_rate": 7.999963263037014e-06, "loss": 0.0, "num_tokens": 8316234.0, "reward": 1.4812500476837158, "reward_std": 1.6061248779296875, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 1.6061248779296875, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0914900302886963, "sampling/importance_sampling_ratio/mean": 0.7615956664085388, "sampling/importance_sampling_ratio/min": 0.32088056206703186, "sampling/sampling_logp_difference/max": 0.9977098703384399, "sampling/sampling_logp_difference/mean": 0.14794917404651642, "step": 351, "step_time": 10.446612559999267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4311248511075974, "epoch": 0.00352, "grad_norm": 0.0003620213537942618, "kl": 0.4597238264977932, "learning_rate": 7.999963029416695e-06, "loss": 0.0, "step": 352, "step_time": 6.943062976000419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0510557070374489, "epoch": 0.00353, "frac_reward_zero_std": 0.75, "grad_norm": 0.0010256932582706213, "kl": 1.2628090418875217, "learning_rate": 7.999962795055906e-06, "loss": -0.0001, "num_tokens": 8362968.0, "reward": 2.137500047683716, "reward_std": 1.533233880996704, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.6720215082168579, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.65625, "rewards/probe_shaping_dominance/std": 0.4825586974620819, "rewards/probe_terminal_raw/mean": 0.34375, "rewards/probe_terminal_raw/std": 0.4825586974620819, "rewards/rollout_reward_func/mean": -0.3125, "rewards/rollout_reward_func/std": 0.9651173949241638, "sampling/importance_sampling_ratio/max": 1.4461380243301392, "sampling/importance_sampling_ratio/mean": 0.8395605087280273, "sampling/importance_sampling_ratio/min": 0.3192848265171051, "sampling/sampling_logp_difference/max": 1.0445311069488525, "sampling/sampling_logp_difference/mean": 0.13624471426010132, "step": 353, "step_time": 10.658984269000484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0730230584740639, "epoch": 0.00354, "grad_norm": 0.0011227383511140943, "kl": 1.2259647622704506, "learning_rate": 7.99996255995465e-06, "loss": -0.0001, "step": 354, "step_time": 6.383262848999493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0429575219750404, "epoch": 0.00355, "frac_reward_zero_std": 1.0, "grad_norm": 4.014531441498548e-05, "kl": 0.059757897251984105, "learning_rate": 7.99996232411292e-06, "loss": 0.0, "num_tokens": 8408151.0, "reward": 2.168750047683716, "reward_std": 1.621118187904358, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.71875, "rewards/probe_completion_length/std": 1.2504031658172607, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 0.9531012773513794, "sampling/importance_sampling_ratio/mean": 0.827174961566925, "sampling/importance_sampling_ratio/min": 0.5229939222335815, "sampling/sampling_logp_difference/max": 0.48770320415496826, "sampling/sampling_logp_difference/mean": 0.0860244557261467, "step": 355, "step_time": 10.83670291099952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0519123449921608, "epoch": 0.00356, "grad_norm": 3.611388456192799e-05, "kl": 0.0671477972791763, "learning_rate": 7.999962087530722e-06, "loss": 0.0, "step": 356, "step_time": 6.801647416999458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.876899354159832, "epoch": 0.00357, "frac_reward_zero_std": 1.0, "grad_norm": 4.189152241451666e-05, "kl": 0.5517611491377465, "learning_rate": 7.999961850208053e-06, "loss": 0.0, "num_tokens": 8453997.0, "reward": 2.575000047683716, "reward_std": 1.4312187433242798, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.1144719123840332, "sampling/importance_sampling_ratio/mean": 0.8704279661178589, "sampling/importance_sampling_ratio/min": 0.2846354842185974, "sampling/sampling_logp_difference/max": 0.721602737903595, "sampling/sampling_logp_difference/mean": 0.09897022694349289, "step": 357, "step_time": 10.110818112000743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8830019608139992, "epoch": 0.00358, "grad_norm": 4.22624361817725e-05, "kl": 0.5523047639580909, "learning_rate": 7.999961612144914e-06, "loss": 0.0, "step": 358, "step_time": 5.968289966999691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9959534220397472, "epoch": 0.00359, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002864841080736369, "kl": 0.8776353914290667, "learning_rate": 7.999961373341304e-06, "loss": 0.0, "num_tokens": 8503781.0, "reward": 2.137500047683716, "reward_std": 1.1482806205749512, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 2.968733310699463, "sampling/importance_sampling_ratio/mean": 0.8962721824645996, "sampling/importance_sampling_ratio/min": 0.31244829297065735, "sampling/sampling_logp_difference/max": 1.1493885517120361, "sampling/sampling_logp_difference/mean": 0.12804019451141357, "step": 359, "step_time": 10.358135880999725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0032146386802197, "epoch": 0.0036, "grad_norm": 0.0002744219091255218, "kl": 0.8691087402403355, "learning_rate": 7.999961133797226e-06, "loss": 0.0, "step": 360, "step_time": 5.9020768589998625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5623603612184525, "epoch": 0.00361, "frac_reward_zero_std": 1.0, "grad_norm": 9.27875516936183e-05, "kl": 1.2790136635303497, "learning_rate": 7.999960893512676e-06, "loss": 0.0, "num_tokens": 8546449.0, "reward": 2.325000047683716, "reward_std": 1.008032202720642, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.3561346530914307, "sampling/importance_sampling_ratio/mean": 0.8485797643661499, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5407872200012207, "sampling/sampling_logp_difference/mean": 0.09200026839971542, "step": 361, "step_time": 10.702817508000408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5627776719629765, "epoch": 0.00362, "grad_norm": 8.150070789270103e-05, "kl": 1.2784964963793755, "learning_rate": 7.999960652487659e-06, "loss": 0.0, "step": 362, "step_time": 6.0848739280008886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.968632735311985, "epoch": 0.00363, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008145206957124174, "kl": 0.6598832843301352, "learning_rate": 7.99996041072217e-06, "loss": 0.0, "num_tokens": 8592159.0, "reward": 1.9187500476837158, "reward_std": 1.256836175918579, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.374538540840149, "sampling/importance_sampling_ratio/mean": 0.8541547060012817, "sampling/importance_sampling_ratio/min": 0.17510434985160828, "sampling/sampling_logp_difference/max": 1.6452641487121582, "sampling/sampling_logp_difference/mean": 0.14628112316131592, "step": 363, "step_time": 10.549635234000107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9661797136068344, "epoch": 0.00364, "grad_norm": 0.0006351504125632346, "kl": 0.5736912706051953, "learning_rate": 7.999960168216212e-06, "loss": 0.0, "step": 364, "step_time": 6.1295828570000594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9562252089381218, "epoch": 0.00365, "frac_reward_zero_std": 0.75, "grad_norm": 0.001311680767685175, "kl": 1.6579833626747131, "learning_rate": 7.999959924969784e-06, "loss": -0.0, "num_tokens": 8642179.0, "reward": 2.043750047683716, "reward_std": 1.2790968418121338, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.71875, "rewards/probe_shaping_dominance/std": 0.45680341124534607, "rewards/probe_terminal_raw/mean": 0.28125, "rewards/probe_terminal_raw/std": 0.45680341124534607, "rewards/rollout_reward_func/mean": -0.4375, "rewards/rollout_reward_func/std": 0.9136068224906921, "sampling/importance_sampling_ratio/max": 1.3930572271347046, "sampling/importance_sampling_ratio/mean": 0.7577755451202393, "sampling/importance_sampling_ratio/min": 0.16704365611076355, "sampling/sampling_logp_difference/max": 1.0883684158325195, "sampling/sampling_logp_difference/mean": 0.18352536857128143, "step": 365, "step_time": 11.436037391999434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9706911072134972, "epoch": 0.00366, "grad_norm": 0.0013973768800497055, "kl": 1.7851417511701584, "learning_rate": 7.999959680982886e-06, "loss": -0.0, "step": 366, "step_time": 6.348716901999978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.535181537270546, "epoch": 0.00367, "frac_reward_zero_std": 0.75, "grad_norm": 0.001643925323151052, "kl": 0.6322375678769276, "learning_rate": 7.99995943625552e-06, "loss": -0.0001, "num_tokens": 8685529.0, "reward": 3.293750047683716, "reward_std": 1.0035220384597778, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.3125, "rewards/probe_shaping_dominance/std": 0.4709290862083435, "rewards/probe_terminal_raw/mean": 0.6875, "rewards/probe_terminal_raw/std": 0.4709290862083435, "rewards/rollout_reward_func/mean": 0.375, "rewards/rollout_reward_func/std": 0.941858172416687, "sampling/importance_sampling_ratio/max": 1.264983892440796, "sampling/importance_sampling_ratio/mean": 0.8649332523345947, "sampling/importance_sampling_ratio/min": 0.20083816349506378, "sampling/sampling_logp_difference/max": 1.3320589065551758, "sampling/sampling_logp_difference/mean": 0.08798055350780487, "step": 367, "step_time": 10.462185017000138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5350116603076458, "epoch": 0.00368, "grad_norm": 0.0019252891652286053, "kl": 0.6243522961935923, "learning_rate": 7.999959190787684e-06, "loss": -0.0001, "step": 368, "step_time": 5.9952414070003215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0593371242284775, "epoch": 0.00369, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002375837357249111, "kl": 0.7576817069202662, "learning_rate": 7.999958944579377e-06, "loss": 0.0, "num_tokens": 8733838.0, "reward": 2.106250047683716, "reward_std": 1.1670026779174805, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 0.9666203856468201, "sampling/importance_sampling_ratio/mean": 0.7411911487579346, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8964768052101135, "sampling/sampling_logp_difference/mean": 0.13502906262874603, "step": 369, "step_time": 10.525359173999732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0495215579867363, "epoch": 0.0037, "grad_norm": 0.0002475433284416795, "kl": 0.7588968193158507, "learning_rate": 7.999958697630603e-06, "loss": 0.0, "step": 370, "step_time": 6.018097386999671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9190730340778828, "epoch": 0.00371, "frac_reward_zero_std": 0.75, "grad_norm": 0.001725654467009008, "kl": 1.4933142140507698, "learning_rate": 7.999958449941359e-06, "loss": -0.0001, "num_tokens": 8782213.0, "reward": 2.356250047683716, "reward_std": 1.4560080766677856, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.5625, "rewards/probe_shaping_dominance/std": 0.504016101360321, "rewards/probe_terminal_raw/mean": 0.4375, "rewards/probe_terminal_raw/std": 0.504016101360321, "rewards/rollout_reward_func/mean": -0.125, "rewards/rollout_reward_func/std": 1.008032202720642, "sampling/importance_sampling_ratio/max": 2.0596885681152344, "sampling/importance_sampling_ratio/mean": 0.850114107131958, "sampling/importance_sampling_ratio/min": 0.1749039888381958, "sampling/sampling_logp_difference/max": 2.175316333770752, "sampling/sampling_logp_difference/mean": 0.1391027569770813, "step": 371, "step_time": 10.462862956000663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9161196202039719, "epoch": 0.00372, "grad_norm": 0.0019953493028879166, "kl": 1.440907794982195, "learning_rate": 7.999958201511645e-06, "loss": -0.0001, "step": 372, "step_time": 6.057245024000167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.30112432315945625, "epoch": 0.00373, "frac_reward_zero_std": 1.0, "grad_norm": 4.154292128077941e-07, "kl": 0.9947994146496058, "learning_rate": 7.999957952341462e-06, "loss": 0.0, "num_tokens": 8826771.0, "reward": 3.950000047683716, "reward_std": 0.0, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 1.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": 1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 0.9597049951553345, "sampling/importance_sampling_ratio/mean": 0.9405255317687988, "sampling/importance_sampling_ratio/min": 0.9263921976089478, "sampling/sampling_logp_difference/max": 0.07755201309919357, "sampling/sampling_logp_difference/mean": 0.020689481869339943, "step": 373, "step_time": 9.649506622000445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2976001650094986, "epoch": 0.00374, "grad_norm": 3.7091297144797863e-07, "kl": 0.9948522942140698, "learning_rate": 7.99995770243081e-06, "loss": 0.0, "step": 374, "step_time": 5.476371095999639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.179700344800949, "epoch": 0.00375, "frac_reward_zero_std": 1.0, "grad_norm": 0.00019490659178700298, "kl": 0.7850398123264313, "learning_rate": 7.999957451779688e-06, "loss": 0.0, "num_tokens": 8874930.0, "reward": 2.043750047683716, "reward_std": 1.201058030128479, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.468320369720459, "sampling/importance_sampling_ratio/mean": 0.819076657295227, "sampling/importance_sampling_ratio/min": 0.35633477568626404, "sampling/sampling_logp_difference/max": 0.8690546751022339, "sampling/sampling_logp_difference/mean": 0.14318270981311798, "step": 375, "step_time": 10.502751606999482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1702759563922882, "epoch": 0.00376, "grad_norm": 0.00013393217523116618, "kl": 0.81276436150074, "learning_rate": 7.9999572003881e-06, "loss": 0.0, "step": 376, "step_time": 6.032545493999351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7177925407886505, "epoch": 0.00377, "frac_reward_zero_std": 1.0, "grad_norm": 4.9635516916168854e-05, "kl": 1.2584875486791134, "learning_rate": 7.99995694825604e-06, "loss": 0.0, "num_tokens": 8918234.0, "reward": 2.606250047683716, "reward_std": 1.4052752256393433, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.3882752656936646, "sampling/importance_sampling_ratio/mean": 0.9102308750152588, "sampling/importance_sampling_ratio/min": 0.6210145354270935, "sampling/sampling_logp_difference/max": 0.4975299835205078, "sampling/sampling_logp_difference/mean": 0.060513950884342194, "step": 377, "step_time": 10.323000849000437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7326309494674206, "epoch": 0.00378, "grad_norm": 4.1395982407266274e-05, "kl": 1.2528069280087948, "learning_rate": 7.999956695383513e-06, "loss": 0.0, "step": 378, "step_time": 5.978144651000093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4752974957227707, "epoch": 0.00379, "frac_reward_zero_std": 1.0, "grad_norm": 7.717848347965628e-05, "kl": 0.4324369365349412, "learning_rate": 7.999956441770516e-06, "loss": 0.0, "num_tokens": 8965080.0, "reward": 1.5750000476837158, "reward_std": 1.338029384613037, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 1.2916449308395386, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9374521970748901, "sampling/importance_sampling_ratio/mean": 0.8539052605628967, "sampling/importance_sampling_ratio/min": 0.258626252412796, "sampling/sampling_logp_difference/max": 0.9988803863525391, "sampling/sampling_logp_difference/mean": 0.19715960323810577, "step": 379, "step_time": 10.432353575000889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4704746305942535, "epoch": 0.0038, "grad_norm": 0.00011949010513490066, "kl": 0.44164562225341797, "learning_rate": 7.999956187417052e-06, "loss": 0.0, "step": 380, "step_time": 6.4899310639998475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.111838586628437, "epoch": 0.00381, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014719787577632815, "kl": 0.6005670647136867, "learning_rate": 7.999955932323117e-06, "loss": 0.0, "num_tokens": 9015299.0, "reward": 1.9812500476837158, "reward_std": 1.2309025526046753, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.1842180490493774, "sampling/importance_sampling_ratio/mean": 0.839226484298706, "sampling/importance_sampling_ratio/min": 0.42126908898353577, "sampling/sampling_logp_difference/max": 0.5560178756713867, "sampling/sampling_logp_difference/mean": 0.10737180709838867, "step": 381, "step_time": 9.803125550000004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1229261308908463, "epoch": 0.00382, "grad_norm": 0.0001319620496360585, "kl": 0.5978384953923523, "learning_rate": 7.999955676488715e-06, "loss": 0.0, "step": 382, "step_time": 5.941128261000358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0769974440336227, "epoch": 0.00383, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012360778055153787, "kl": 0.4884014050476253, "learning_rate": 7.999955419913844e-06, "loss": 0.0, "num_tokens": 9061810.0, "reward": 2.137500047683716, "reward_std": 1.1482806205749512, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.6875, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 0.9834169745445251, "sampling/importance_sampling_ratio/mean": 0.7719845771789551, "sampling/importance_sampling_ratio/min": 0.2768201529979706, "sampling/sampling_logp_difference/max": 1.0129671096801758, "sampling/sampling_logp_difference/mean": 0.1313181072473526, "step": 383, "step_time": 10.252918843000316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.08458661288023, "epoch": 0.00384, "grad_norm": 0.00014907358854543418, "kl": 0.5057441159151495, "learning_rate": 7.999955162598504e-06, "loss": 0.0, "step": 384, "step_time": 7.167399140999805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.542071346193552, "epoch": 0.00385, "frac_reward_zero_std": 1.0, "grad_norm": 3.077173460042104e-05, "kl": 1.9730855226516724, "learning_rate": 7.999954904542697e-06, "loss": 0.0, "num_tokens": 9106137.0, "reward": 3.293750047683716, "reward_std": 1.180742621421814, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0693261623382568, "sampling/importance_sampling_ratio/mean": 0.9118248820304871, "sampling/importance_sampling_ratio/min": 0.41071459650993347, "sampling/sampling_logp_difference/max": 0.7213708162307739, "sampling/sampling_logp_difference/mean": 0.04966055974364281, "step": 385, "step_time": 9.818418909998854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5439686328172684, "epoch": 0.00386, "grad_norm": 2.3318221792578697e-05, "kl": 1.966349296271801, "learning_rate": 7.999954645746422e-06, "loss": 0.0, "step": 386, "step_time": 5.921195296999485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3177487701177597, "epoch": 0.00387, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017945896252058446, "kl": 0.47732366155833006, "learning_rate": 7.999954386209677e-06, "loss": 0.0, "num_tokens": 9154765.0, "reward": 1.4500000476837158, "reward_std": 0.6720215082168579, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.5670737028121948, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1261528730392456, "sampling/importance_sampling_ratio/mean": 0.7251231670379639, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.286957859992981, "sampling/sampling_logp_difference/mean": 0.18756341934204102, "step": 387, "step_time": 11.11347648400033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.313660904765129, "epoch": 0.00388, "grad_norm": 0.00017108296742662787, "kl": 0.4758234744949732, "learning_rate": 7.999954125932465e-06, "loss": 0.0, "step": 388, "step_time": 6.096910730000218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7469111457467079, "epoch": 0.00389, "frac_reward_zero_std": 1.0, "grad_norm": 2.495557964721229e-05, "kl": 0.037764198847071384, "learning_rate": 7.999953864914783e-06, "loss": 0.0, "num_tokens": 9194799.0, "reward": 2.481250047683716, "reward_std": 1.5023503303527832, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 0.9957536458969116, "sampling/importance_sampling_ratio/mean": 0.8884516358375549, "sampling/importance_sampling_ratio/min": 0.6219885349273682, "sampling/sampling_logp_difference/max": 0.3152664601802826, "sampling/sampling_logp_difference/mean": 0.054477110505104065, "step": 389, "step_time": 9.634475558999384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.749498151242733, "epoch": 0.0039, "grad_norm": 2.757420043053571e-05, "kl": 0.037885259749600664, "learning_rate": 7.999953603156633e-06, "loss": 0.0, "step": 390, "step_time": 5.825650949000192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 2.6875, "completions/mean_terminated_length": 2.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2955705225467682, "epoch": 0.00391, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012024679745081812, "kl": 0.3169508697465062, "learning_rate": 7.999953340658018e-06, "loss": 0.0, "num_tokens": 9237578.0, "reward": 2.1999998092651367, "reward_std": 1.8837162256240845, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 1.5862311124801636, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.169108271598816, "sampling/importance_sampling_ratio/mean": 0.8330661058425903, "sampling/importance_sampling_ratio/min": 0.5545997023582458, "sampling/sampling_logp_difference/max": 0.614017903804779, "sampling/sampling_logp_difference/mean": 0.11316787451505661, "step": 391, "step_time": 10.490885173000152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.303221918642521, "epoch": 0.00392, "grad_norm": 0.00011566941975615919, "kl": 0.31426724046468735, "learning_rate": 7.999953077418933e-06, "loss": 0.0, "step": 392, "step_time": 6.56834959400021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.375, "completions/mean_terminated_length": 2.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.460183471441269, "epoch": 0.00393, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010010498954216018, "kl": 0.25794385001063347, "learning_rate": 7.99995281343938e-06, "loss": 0.0, "num_tokens": 9290253.0, "reward": 1.3250000476837158, "reward_std": 0.49186939001083374, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.375, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4612799882888794, "sampling/importance_sampling_ratio/mean": 0.8141236305236816, "sampling/importance_sampling_ratio/min": 0.5086011290550232, "sampling/sampling_logp_difference/max": 0.46626272797584534, "sampling/sampling_logp_difference/mean": 0.14415019750595093, "step": 393, "step_time": 10.023167942000327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4618552178144455, "epoch": 0.00394, "grad_norm": 9.292273171013221e-05, "kl": 0.24823577469214797, "learning_rate": 7.99995254871936e-06, "loss": 0.0, "step": 394, "step_time": 6.091329966000103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5105515196919441, "epoch": 0.00395, "frac_reward_zero_std": 0.75, "grad_norm": 0.006318738218396902, "kl": 1.4926518842112273, "learning_rate": 7.999952283258871e-06, "loss": -0.0, "num_tokens": 9339783.0, "reward": 3.325000047683716, "reward_std": 1.0701221227645874, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.28125, "rewards/probe_shaping_dominance/std": 0.45680341124534607, "rewards/probe_terminal_raw/mean": 0.71875, "rewards/probe_terminal_raw/std": 0.45680341124534607, "rewards/rollout_reward_func/mean": 0.4375, "rewards/rollout_reward_func/std": 0.9136068224906921, "sampling/importance_sampling_ratio/max": 1.946527123451233, "sampling/importance_sampling_ratio/mean": 0.998159646987915, "sampling/importance_sampling_ratio/min": 0.5635565519332886, "sampling/sampling_logp_difference/max": 0.6543045043945312, "sampling/sampling_logp_difference/mean": 0.07140873372554779, "step": 395, "step_time": 10.683865035999588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.5092885829508305, "epoch": 0.00396, "grad_norm": 0.0005902937264181674, "kl": 1.4889302601804957, "learning_rate": 7.999952017057914e-06, "loss": -0.0001, "step": 396, "step_time": 6.604787075999866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8198611810803413, "epoch": 0.00397, "frac_reward_zero_std": 1.0, "grad_norm": 6.475277041317895e-05, "kl": 0.7255684435367584, "learning_rate": 7.99995175011649e-06, "loss": 0.0, "num_tokens": 9383200.0, "reward": 2.7312498092651367, "reward_std": 1.73641037940979, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 1.2374368906021118, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.1218085289001465, "sampling/importance_sampling_ratio/mean": 0.8827304840087891, "sampling/importance_sampling_ratio/min": 0.5214709639549255, "sampling/sampling_logp_difference/max": 0.4110407829284668, "sampling/sampling_logp_difference/mean": 0.07405085116624832, "step": 397, "step_time": 9.918148451999514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8066371604800224, "epoch": 0.00398, "grad_norm": 6.86766070430167e-05, "kl": 0.7211511321365833, "learning_rate": 7.9999514824346e-06, "loss": 0.0, "step": 398, "step_time": 6.492818946999705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6338056772947311, "epoch": 0.00399, "frac_reward_zero_std": 0.75, "grad_norm": 0.0021156487055122852, "kl": 2.406707100570202, "learning_rate": 7.999951214012241e-06, "loss": -0.0001, "num_tokens": 9432820.0, "reward": 2.481250047683716, "reward_std": 1.2947629690170288, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 0.59375, "rewards/probe_shaping_dominance/std": 0.49899089336395264, "rewards/probe_terminal_raw/mean": 0.40625, "rewards/probe_terminal_raw/std": 0.49899089336395264, "rewards/rollout_reward_func/mean": -0.1875, "rewards/rollout_reward_func/std": 0.9979817867279053, "sampling/importance_sampling_ratio/max": 2.8168559074401855, "sampling/importance_sampling_ratio/mean": 0.9036719799041748, "sampling/importance_sampling_ratio/min": 0.24805206060409546, "sampling/sampling_logp_difference/max": 1.526820421218872, "sampling/sampling_logp_difference/mean": 0.1296796202659607, "step": 399, "step_time": 10.381847932000255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 0.6298884674906731, "epoch": 0.004, "grad_norm": 0.003479708917438984, "kl": 2.6866486370563507, "learning_rate": 7.999950944849416e-06, "loss": -0.0001, "step": 400, "step_time": 6.760044167000615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0361065343022346, "epoch": 0.00401, "frac_reward_zero_std": 1.0, "grad_norm": 4.91031605633907e-05, "kl": 0.47253769740927964, "learning_rate": 7.999950674946121e-06, "loss": 0.0, "num_tokens": 9478722.0, "reward": 2.4187498092651367, "reward_std": 1.9997479915618896, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 1.7867680788040161, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.11195707321167, "sampling/importance_sampling_ratio/mean": 0.844825029373169, "sampling/importance_sampling_ratio/min": 0.6442330479621887, "sampling/sampling_logp_difference/max": 0.2771162986755371, "sampling/sampling_logp_difference/mean": 0.07445387542247772, "step": 401, "step_time": 10.186995967000257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.030918937176466, "epoch": 0.00402, "grad_norm": 3.9547725464217365e-05, "kl": 0.46755533292889595, "learning_rate": 7.99995040430236e-06, "loss": 0.0, "step": 402, "step_time": 6.485445361999609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2354515939950943, "epoch": 0.00403, "frac_reward_zero_std": 1.0, "grad_norm": 7.260632264660671e-05, "kl": 0.7972908793017268, "learning_rate": 7.999950132918132e-06, "loss": 0.0, "num_tokens": 9529957.0, "reward": 1.9812500476837158, "reward_std": 1.2309025526046753, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.3521214723587036, "sampling/importance_sampling_ratio/mean": 0.8476842045783997, "sampling/importance_sampling_ratio/min": 0.43482327461242676, "sampling/sampling_logp_difference/max": 0.671419620513916, "sampling/sampling_logp_difference/mean": 0.12039721012115479, "step": 403, "step_time": 10.79022957999996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2416410520672798, "epoch": 0.00404, "grad_norm": 6.640212814090773e-05, "kl": 0.8009253116324544, "learning_rate": 7.999949860793436e-06, "loss": 0.0, "step": 404, "step_time": 6.121063306999531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.40625, "completions/mean_terminated_length": 2.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.044917955994606, "epoch": 0.00405, "frac_reward_zero_std": 1.0, "grad_norm": 4.647417881642468e-05, "kl": 0.5192130440846086, "learning_rate": 7.999949587928276e-06, "loss": 0.0, "num_tokens": 9577996.0, "reward": 1.8562500476837158, "reward_std": 1.2790968418121338, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.40625, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.008124828338623, "sampling/importance_sampling_ratio/mean": 0.868937075138092, "sampling/importance_sampling_ratio/min": 0.5071920156478882, "sampling/sampling_logp_difference/max": 0.4627190828323364, "sampling/sampling_logp_difference/mean": 0.07959282398223877, "step": 405, "step_time": 10.07913560499992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.054791770875454, "epoch": 0.00406, "grad_norm": 4.776316927745938e-05, "kl": 0.5234422187786549, "learning_rate": 7.999949314322646e-06, "loss": 0.0, "step": 406, "step_time": 6.5588358670011075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.40625, "completions/mean_terminated_length": 2.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2371145486831665, "epoch": 0.00407, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010027150710811839, "kl": 0.8442691564559937, "learning_rate": 7.999949039976548e-06, "loss": 0.0, "num_tokens": 9626057.0, "reward": 1.3562500476837158, "reward_std": 0.498990923166275, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.40625, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8078845739364624, "sampling/importance_sampling_ratio/mean": 0.8824479579925537, "sampling/importance_sampling_ratio/min": 0.4061150550842285, "sampling/sampling_logp_difference/max": 0.8688634037971497, "sampling/sampling_logp_difference/mean": 0.1303093582391739, "step": 407, "step_time": 9.787462137999682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.231427051126957, "epoch": 0.00408, "grad_norm": 8.325048111146316e-05, "kl": 0.8341128025203943, "learning_rate": 7.999948764889987e-06, "loss": 0.0, "step": 408, "step_time": 6.426104604999637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5369663275778294, "epoch": 0.00409, "frac_reward_zero_std": 1.0, "grad_norm": 2.07056673389161e-05, "kl": 1.0741528943181038, "learning_rate": 7.999948489062955e-06, "loss": 0.0, "num_tokens": 9666963.0, "reward": 2.700000047683716, "reward_std": 1.319823980331421, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 0.4399413466453552, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.2189736366271973, "sampling/importance_sampling_ratio/mean": 0.9475693702697754, "sampling/importance_sampling_ratio/min": 0.6639173030853271, "sampling/sampling_logp_difference/max": 0.38153910636901855, "sampling/sampling_logp_difference/mean": 0.044145338237285614, "step": 409, "step_time": 9.87107176499967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5364666283130646, "epoch": 0.0041, "grad_norm": 2.00495524040889e-05, "kl": 1.0751704201102257, "learning_rate": 7.99994821249546e-06, "loss": 0.0, "step": 410, "step_time": 6.38307224800019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.375, "completions/mean_terminated_length": 2.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.059143915772438, "epoch": 0.00411, "frac_reward_zero_std": 1.0, "grad_norm": 4.802541297976859e-05, "kl": 0.8314541056752205, "learning_rate": 7.999947935187496e-06, "loss": 0.0, "num_tokens": 9717854.0, "reward": 1.8250000476837158, "reward_std": 1.2889105081558228, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.375, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.2114936113357544, "sampling/importance_sampling_ratio/mean": 0.8912780284881592, "sampling/importance_sampling_ratio/min": 0.5582963824272156, "sampling/sampling_logp_difference/max": 0.5062055587768555, "sampling/sampling_logp_difference/mean": 0.10437913984060287, "step": 411, "step_time": 9.990031543999521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0532640889286995, "epoch": 0.00412, "grad_norm": 4.7814017307246104e-05, "kl": 0.8273545559495687, "learning_rate": 7.999947657139067e-06, "loss": 0.0, "step": 412, "step_time": 6.57262487800017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.46875, "completions/mean_terminated_length": 2.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2248899713158607, "epoch": 0.00413, "frac_reward_zero_std": 1.0, "grad_norm": 6.309987657004967e-05, "kl": 0.09391478716861457, "learning_rate": 7.99994737835017e-06, "loss": 0.0, "num_tokens": 9764095.0, "reward": 1.9187500476837158, "reward_std": 1.256836175918579, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.1762675046920776, "sampling/importance_sampling_ratio/mean": 0.8028596639633179, "sampling/importance_sampling_ratio/min": 0.4906952977180481, "sampling/sampling_logp_difference/max": 0.42553746700286865, "sampling/sampling_logp_difference/mean": 0.11625593155622482, "step": 413, "step_time": 9.962155038000674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2161956802010536, "epoch": 0.00414, "grad_norm": 5.9610069001792e-05, "kl": 0.09792823957832297, "learning_rate": 7.999947098820806e-06, "loss": 0.0, "step": 414, "step_time": 6.5541509539998515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5474282167851925, "epoch": 0.00415, "frac_reward_zero_std": 1.0, "grad_norm": 7.157572690630332e-05, "kl": 1.5889837145805359, "learning_rate": 7.999946818550977e-06, "loss": 0.0, "num_tokens": 9810805.0, "reward": 2.262500047683716, "reward_std": 1.0606601238250732, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.550018310546875, "sampling/importance_sampling_ratio/mean": 0.966625452041626, "sampling/importance_sampling_ratio/min": 0.3172820806503296, "sampling/sampling_logp_difference/max": 1.058587670326233, "sampling/sampling_logp_difference/mean": 0.06538467109203339, "step": 415, "step_time": 10.29786402399941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5420740880072117, "epoch": 0.00416, "grad_norm": 7.944254321046174e-05, "kl": 1.5934259369969368, "learning_rate": 7.99994653754068e-06, "loss": 0.0, "step": 416, "step_time": 5.919829822999418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.34375, "completions/mean_terminated_length": 2.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3573927953839302, "epoch": 0.00417, "frac_reward_zero_std": 1.0, "grad_norm": 5.489736213348806e-05, "kl": 0.4752029323717579, "learning_rate": 7.999946255789918e-06, "loss": 0.0, "num_tokens": 9858587.0, "reward": 1.2937500476837158, "reward_std": 0.4825587272644043, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.34375, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2800050973892212, "sampling/importance_sampling_ratio/mean": 0.776732325553894, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6876821517944336, "sampling/sampling_logp_difference/mean": 0.16653087735176086, "step": 417, "step_time": 10.134290047998547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.338614284992218, "epoch": 0.00418, "grad_norm": 8.523750875610858e-05, "kl": 0.48709472338669, "learning_rate": 7.99994597329869e-06, "loss": 0.0, "step": 418, "step_time": 6.640404371999466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8219548910856247, "epoch": 0.00419, "frac_reward_zero_std": 1.0, "grad_norm": 2.453891465847846e-05, "kl": 0.6859103422611952, "learning_rate": 7.999945690066996e-06, "loss": 0.0, "num_tokens": 9903235.0, "reward": 2.543750047683716, "reward_std": 1.4560080766677856, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 0.9690582752227783, "sampling/importance_sampling_ratio/mean": 0.8696466088294983, "sampling/importance_sampling_ratio/min": 0.528867781162262, "sampling/sampling_logp_difference/max": 0.45124274492263794, "sampling/sampling_logp_difference/mean": 0.07117698341608047, "step": 419, "step_time": 9.838608534000741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8214314952492714, "epoch": 0.0042, "grad_norm": 3.332663618493825e-05, "kl": 0.6894950913265347, "learning_rate": 7.999945406094835e-06, "loss": 0.0, "step": 420, "step_time": 6.389929472001313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8140515983104706, "epoch": 0.00421, "frac_reward_zero_std": 1.0, "grad_norm": 1.6370302546420135e-05, "kl": 0.49950144518516026, "learning_rate": 7.999945121382207e-06, "loss": 0.0, "num_tokens": 9951581.0, "reward": 2.606250047683716, "reward_std": 1.4052752256393433, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.0606497526168823, "sampling/importance_sampling_ratio/mean": 0.8690915107727051, "sampling/importance_sampling_ratio/min": 0.5077239274978638, "sampling/sampling_logp_difference/max": 0.46195554733276367, "sampling/sampling_logp_difference/mean": 0.07101378589868546, "step": 421, "step_time": 10.472032985000624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7896543815732002, "epoch": 0.00422, "grad_norm": 9.226971997122746e-06, "kl": 0.49870440473023336, "learning_rate": 7.999944835929116e-06, "loss": 0.0, "step": 422, "step_time": 6.057825643999422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1121821850538254, "epoch": 0.00423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004775114357471466, "kl": 1.3636024817824364, "learning_rate": 7.999944549735557e-06, "loss": 0.0, "num_tokens": 10000036.0, "reward": 1.9812500476837158, "reward_std": 1.2309025526046753, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.3543111085891724, "sampling/importance_sampling_ratio/mean": 0.8356016874313354, "sampling/importance_sampling_ratio/min": 0.18780741095542908, "sampling/sampling_logp_difference/max": 1.3883202075958252, "sampling/sampling_logp_difference/mean": 0.15372709929943085, "step": 423, "step_time": 9.897093682999184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.121554672718048, "epoch": 0.00424, "grad_norm": 0.00040099461330100894, "kl": 1.33827693015337, "learning_rate": 7.999944262801533e-06, "loss": 0.0, "step": 424, "step_time": 6.476370014000167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.602243434637785, "epoch": 0.00425, "frac_reward_zero_std": 1.0, "grad_norm": 5.09244782733731e-05, "kl": 1.7651392817497253, "learning_rate": 7.999943975127043e-06, "loss": 0.0, "num_tokens": 10046052.0, "reward": 2.731250047683716, "reward_std": 1.2885193824768066, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 0.4399413466453552, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.8853356838226318, "sampling/importance_sampling_ratio/mean": 0.9488242268562317, "sampling/importance_sampling_ratio/min": 0.2765178978443146, "sampling/sampling_logp_difference/max": 1.041520118713379, "sampling/sampling_logp_difference/mean": 0.07927368581295013, "step": 425, "step_time": 10.415064779000204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6055986508727074, "epoch": 0.00426, "grad_norm": 3.977708183811046e-05, "kl": 1.755710482597351, "learning_rate": 7.999943686712088e-06, "loss": 0.0, "step": 426, "step_time": 5.944202355000925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.40625, "completions/mean_terminated_length": 2.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4355954229831696, "epoch": 0.00427, "frac_reward_zero_std": 1.0, "grad_norm": 0.000142176344525069, "kl": 0.18295252230018377, "learning_rate": 7.999943397556666e-06, "loss": 0.0, "num_tokens": 10090652.0, "reward": 1.4187500476837158, "reward_std": 1.43649160861969, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.46875, "rewards/probe_completion_length/std": 1.43649160861969, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3885198831558228, "sampling/importance_sampling_ratio/mean": 0.830295979976654, "sampling/importance_sampling_ratio/min": 0.39266979694366455, "sampling/sampling_logp_difference/max": 0.7006630897521973, "sampling/sampling_logp_difference/mean": 0.13479085266590118, "step": 427, "step_time": 9.895013823001136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.435707375407219, "epoch": 0.00428, "grad_norm": 0.00015517303836531937, "kl": 0.18490907363593578, "learning_rate": 7.99994310766078e-06, "loss": 0.0, "step": 428, "step_time": 6.982020691000798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.71875, "completions/mean_terminated_length": 2.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1079173982143402, "epoch": 0.00429, "frac_reward_zero_std": 1.0, "grad_norm": 4.5960969146108255e-05, "kl": 0.755518133752048, "learning_rate": 7.999942817024428e-06, "loss": 0.0, "num_tokens": 10137753.0, "reward": 2.231250047683716, "reward_std": 1.727096676826477, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 1.4081416130065918, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.081230640411377, "sampling/importance_sampling_ratio/mean": 0.8246882557868958, "sampling/importance_sampling_ratio/min": 0.5850779414176941, "sampling/sampling_logp_difference/max": 0.6071332693099976, "sampling/sampling_logp_difference/mean": 0.08669533580541611, "step": 429, "step_time": 9.993113438999899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1078582182526588, "epoch": 0.0043, "grad_norm": 7.557954086223617e-05, "kl": 0.7598963440395892, "learning_rate": 7.99994252564761e-06, "loss": 0.0, "step": 430, "step_time": 6.045752523999909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3.09375, "completions/mean_terminated_length": 3.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.42129435390233994, "epoch": 0.00431, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011003066174453124, "kl": 0.6489775236696005, "learning_rate": 7.999942233530327e-06, "loss": 0.0, "num_tokens": 10175885.0, "reward": 3.5749998092651367, "reward_std": 1.2636353969573975, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.125, "rewards/probe_completion_length/std": 1.099853277206421, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.341364860534668, "sampling/importance_sampling_ratio/mean": 0.9086952209472656, "sampling/importance_sampling_ratio/min": 0.49460649490356445, "sampling/sampling_logp_difference/max": 0.5881896615028381, "sampling/sampling_logp_difference/mean": 0.047025080770254135, "step": 431, "step_time": 9.545812129002115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4164751283824444, "epoch": 0.00432, "grad_norm": 0.000159382849233225, "kl": 0.6406664741225541, "learning_rate": 7.999941940672578e-06, "loss": 0.0, "step": 432, "step_time": 6.606084928999735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8442269414663315, "epoch": 0.00433, "frac_reward_zero_std": 1.0, "grad_norm": 3.960997491958551e-05, "kl": 0.9814293906092644, "learning_rate": 7.999941647074366e-06, "loss": 0.0, "num_tokens": 10219910.0, "reward": 2.543750047683716, "reward_std": 1.4560080766677856, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 0.9716951847076416, "sampling/importance_sampling_ratio/mean": 0.8736358284950256, "sampling/importance_sampling_ratio/min": 0.4931846559047699, "sampling/sampling_logp_difference/max": 0.4490457773208618, "sampling/sampling_logp_difference/mean": 0.06978647410869598, "step": 433, "step_time": 9.839958128001854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8495101407170296, "epoch": 0.00434, "grad_norm": 3.293993358965963e-05, "kl": 0.9773873090744019, "learning_rate": 7.999941352735688e-06, "loss": 0.0, "step": 434, "step_time": 5.947311551000894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.577320072799921, "epoch": 0.00435, "frac_reward_zero_std": 0.75, "grad_norm": 0.008683150634169579, "kl": 5.158851059153676, "learning_rate": 7.999941057656543e-06, "loss": -0.0, "num_tokens": 10264421.0, "reward": 3.075000047683716, "reward_std": 1.263635277748108, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.34375, "rewards/probe_shaping_dominance/std": 0.4825586974620819, "rewards/probe_terminal_raw/mean": 0.65625, "rewards/probe_terminal_raw/std": 0.4825586974620819, "rewards/rollout_reward_func/mean": 0.3125, "rewards/rollout_reward_func/std": 0.9651173949241638, "sampling/importance_sampling_ratio/max": 1.0832629203796387, "sampling/importance_sampling_ratio/mean": 0.8609358072280884, "sampling/importance_sampling_ratio/min": 0.0714401826262474, "sampling/sampling_logp_difference/max": 1.9287629127502441, "sampling/sampling_logp_difference/mean": 0.108085498213768, "step": 435, "step_time": 9.750276677000329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.574884194880724, "epoch": 0.00436, "grad_norm": 0.005062948912382126, "kl": 3.17036358686164, "learning_rate": 7.999940761836937e-06, "loss": -0.0001, "step": 436, "step_time": 6.396997322999596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0359537526965141, "epoch": 0.00437, "frac_reward_zero_std": 1.0, "grad_norm": 3.464319524937309e-05, "kl": 0.48364219442009926, "learning_rate": 7.999940465276862e-06, "loss": 0.0, "num_tokens": 10312773.0, "reward": 1.9500000476837158, "reward_std": 1.2443419694900513, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 0.9713358879089355, "sampling/importance_sampling_ratio/mean": 0.8230075240135193, "sampling/importance_sampling_ratio/min": 0.5460324883460999, "sampling/sampling_logp_difference/max": 0.39858847856521606, "sampling/sampling_logp_difference/mean": 0.08530294895172119, "step": 437, "step_time": 10.383505194999088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0389421209692955, "epoch": 0.00438, "grad_norm": 5.601278826361522e-05, "kl": 0.4953901164699346, "learning_rate": 7.999940167976326e-06, "loss": 0.0, "step": 438, "step_time": 5.959065071000623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8178563863039017, "epoch": 0.00439, "frac_reward_zero_std": 0.75, "grad_norm": 0.0030496560502797365, "kl": 1.5864214673638344, "learning_rate": 7.999939869935323e-06, "loss": -0.0001, "num_tokens": 10361704.0, "reward": 2.106250047683716, "reward_std": 0.9873188138008118, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 0.4399413466453552, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.8125, "rewards/probe_shaping_dominance/std": 0.3965577781200409, "rewards/probe_terminal_raw/mean": 0.1875, "rewards/probe_terminal_raw/std": 0.3965577781200409, "rewards/rollout_reward_func/mean": -0.625, "rewards/rollout_reward_func/std": 0.7931155562400818, "sampling/importance_sampling_ratio/max": 2.53995418548584, "sampling/importance_sampling_ratio/mean": 0.8853044509887695, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1533396244049072, "sampling/sampling_logp_difference/mean": 0.18097969889640808, "step": 439, "step_time": 10.668017594998673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.834572046995163, "epoch": 0.0044, "grad_norm": 0.003595987567678094, "kl": 1.560511738061905, "learning_rate": 7.999939571153855e-06, "loss": -0.0001, "step": 440, "step_time": 6.122170336001545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0095400363206863, "epoch": 0.00441, "frac_reward_zero_std": 0.75, "grad_norm": 0.0025006576906889677, "kl": 1.4991996549069881, "learning_rate": 7.999939271631924e-06, "loss": -0.0001, "num_tokens": 10413488.0, "reward": 2.043750047683716, "reward_std": 0.9283830523490906, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.84375, "rewards/probe_shaping_dominance/std": 0.3689020276069641, "rewards/probe_terminal_raw/mean": 0.15625, "rewards/probe_terminal_raw/std": 0.3689020276069641, "rewards/rollout_reward_func/mean": -0.6875, "rewards/rollout_reward_func/std": 0.7378040552139282, "sampling/importance_sampling_ratio/max": 1.7598189115524292, "sampling/importance_sampling_ratio/mean": 0.8456007242202759, "sampling/importance_sampling_ratio/min": 0.22255338728427887, "sampling/sampling_logp_difference/max": 1.387916088104248, "sampling/sampling_logp_difference/mean": 0.17741605639457703, "step": 441, "step_time": 10.27434647300106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.0087963938713074, "epoch": 0.00442, "grad_norm": 0.0019897983875125647, "kl": 1.8581994846463203, "learning_rate": 7.999938971369529e-06, "loss": -0.0001, "step": 442, "step_time": 6.660500067999237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9083375707268715, "epoch": 0.00443, "frac_reward_zero_std": 0.75, "grad_norm": 0.0012449432397261262, "kl": 0.7166382949799299, "learning_rate": 7.999938670366667e-06, "loss": -0.0001, "num_tokens": 10460910.0, "reward": 2.450000047683716, "reward_std": 1.3912166357040405, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5625, "rewards/probe_shaping_dominance/std": 0.504016101360321, "rewards/probe_terminal_raw/mean": 0.4375, "rewards/probe_terminal_raw/std": 0.504016101360321, "rewards/rollout_reward_func/mean": -0.125, "rewards/rollout_reward_func/std": 1.008032202720642, "sampling/importance_sampling_ratio/max": 1.1054221391677856, "sampling/importance_sampling_ratio/mean": 0.8311811089515686, "sampling/importance_sampling_ratio/min": 0.3434506058692932, "sampling/sampling_logp_difference/max": 1.0333223342895508, "sampling/sampling_logp_difference/mean": 0.09627163410186768, "step": 443, "step_time": 10.282398681000814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9153599664568901, "epoch": 0.00444, "grad_norm": 0.0012570996768772602, "kl": 0.7166626800899394, "learning_rate": 7.999938368623343e-06, "loss": -0.0001, "step": 444, "step_time": 5.903002717001982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.057725302875042, "epoch": 0.00445, "frac_reward_zero_std": 0.5, "grad_norm": 0.0031768870539963245, "kl": 1.6499460637569427, "learning_rate": 7.999938066139555e-06, "loss": -0.0001, "num_tokens": 10509423.0, "reward": 2.231250047683716, "reward_std": 1.3255400657653809, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.65625, "rewards/probe_shaping_dominance/std": 0.4825586974620819, "rewards/probe_terminal_raw/mean": 0.34375, "rewards/probe_terminal_raw/std": 0.4825586974620819, "rewards/rollout_reward_func/mean": -0.3125, "rewards/rollout_reward_func/std": 0.9651173949241638, "sampling/importance_sampling_ratio/max": 1.0972373485565186, "sampling/importance_sampling_ratio/mean": 0.781715989112854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.737563133239746, "sampling/sampling_logp_difference/mean": 0.16636095941066742, "step": 445, "step_time": 10.328625375999763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.0501547306776047, "epoch": 0.00446, "grad_norm": 0.0023717940784990788, "kl": 1.9352448098361492, "learning_rate": 7.9999377629153e-06, "loss": -0.0001, "step": 446, "step_time": 7.223937839999053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0991413369774818, "epoch": 0.00447, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013539350766222924, "kl": 0.8366762921214104, "learning_rate": 7.999937458950583e-06, "loss": 0.0, "num_tokens": 10556571.0, "reward": 1.9812500476837158, "reward_std": 1.2309025526046753, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.53125, "rewards/probe_completion_length/std": 0.507007360458374, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.5306068658828735, "sampling/importance_sampling_ratio/mean": 0.896122932434082, "sampling/importance_sampling_ratio/min": 0.3056519329547882, "sampling/sampling_logp_difference/max": 1.0416702032089233, "sampling/sampling_logp_difference/mean": 0.12326531112194061, "step": 447, "step_time": 10.101042933999452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0788709297776222, "epoch": 0.00448, "grad_norm": 0.00021080716396681964, "kl": 0.8710470348596573, "learning_rate": 7.999937154245402e-06, "loss": 0.0, "step": 448, "step_time": 6.108521285000279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5352600961923599, "epoch": 0.00449, "frac_reward_zero_std": 1.0, "grad_norm": 1.1860423001053277e-05, "kl": 0.4157614109572023, "learning_rate": 7.999936848799756e-06, "loss": 0.0, "num_tokens": 10598609.0, "reward": 3.262500047683716, "reward_std": 1.2296733856201172, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 0.9666202664375305, "sampling/importance_sampling_ratio/mean": 0.9112852215766907, "sampling/importance_sampling_ratio/min": 0.6785628199577332, "sampling/sampling_logp_difference/max": 0.21126577258110046, "sampling/sampling_logp_difference/mean": 0.03843056410551071, "step": 449, "step_time": 9.731360219000635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5475796833634377, "epoch": 0.0045, "grad_norm": 9.530402167001739e-06, "kl": 0.4151203960645944, "learning_rate": 7.999936542613647e-06, "loss": 0.0, "step": 450, "step_time": 6.8167696039990915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4937551617622375, "epoch": 0.00451, "frac_reward_zero_std": 0.75, "grad_norm": 0.0027509112842381, "kl": 0.8361366703175008, "learning_rate": 7.999936235687075e-06, "loss": -0.0001, "num_tokens": 10644998.0, "reward": 2.043750047683716, "reward_std": 1.7479827404022217, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 1.4280457496643066, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 2.1693289279937744, "sampling/importance_sampling_ratio/mean": 0.8219958543777466, "sampling/importance_sampling_ratio/min": 0.3935191333293915, "sampling/sampling_logp_difference/max": 0.8192603588104248, "sampling/sampling_logp_difference/mean": 0.15191489458084106, "step": 451, "step_time": 9.84771198999988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4842835664749146, "epoch": 0.00452, "grad_norm": 0.0019148472929373384, "kl": 0.8515774551779032, "learning_rate": 7.999935928020036e-06, "loss": -0.0001, "step": 452, "step_time": 5.956881039999644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2700788527727127, "epoch": 0.00453, "frac_reward_zero_std": 1.0, "grad_norm": 7.074048335198313e-05, "kl": 0.7058316059410572, "learning_rate": 7.999935619612536e-06, "loss": 0.0, "num_tokens": 10694855.0, "reward": 1.9500000476837158, "reward_std": 1.2443419694900513, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.1525969505310059, "sampling/importance_sampling_ratio/mean": 0.8169641494750977, "sampling/importance_sampling_ratio/min": 0.5228918194770813, "sampling/sampling_logp_difference/max": 0.5732545852661133, "sampling/sampling_logp_difference/mean": 0.11736132204532623, "step": 453, "step_time": 9.987618331999329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.278110608458519, "epoch": 0.00454, "grad_norm": 7.446208473993465e-05, "kl": 0.7188287526369095, "learning_rate": 7.999935310464572e-06, "loss": 0.0, "step": 454, "step_time": 7.042841277000662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6826481185853481, "epoch": 0.00455, "frac_reward_zero_std": 0.75, "grad_norm": 0.0026605785824358463, "kl": 3.3382283598184586, "learning_rate": 7.999935000576144e-06, "loss": -0.0, "num_tokens": 10740810.0, "reward": 2.731250047683716, "reward_std": 1.2110878229141235, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.53125, "rewards/probe_shaping_dominance/std": 0.507007360458374, "rewards/probe_terminal_raw/mean": 0.46875, "rewards/probe_terminal_raw/std": 0.507007360458374, "rewards/rollout_reward_func/mean": -0.0625, "rewards/rollout_reward_func/std": 1.014014720916748, "sampling/importance_sampling_ratio/max": 1.809893012046814, "sampling/importance_sampling_ratio/mean": 0.8945350050926208, "sampling/importance_sampling_ratio/min": 0.2569253146648407, "sampling/sampling_logp_difference/max": 1.2167439460754395, "sampling/sampling_logp_difference/mean": 0.08854126930236816, "step": 455, "step_time": 10.07339215400043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6866127476096153, "epoch": 0.00456, "grad_norm": 0.001385181793011725, "kl": 3.1702629774808884, "learning_rate": 7.999934689947254e-06, "loss": -0.0, "step": 456, "step_time": 6.053322766999372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.31620290130376816, "epoch": 0.00457, "frac_reward_zero_std": 1.0, "grad_norm": 5.6734403187874705e-05, "kl": 1.541536584496498, "learning_rate": 7.999934378577899e-06, "loss": 0.0, "num_tokens": 10783603.0, "reward": 2.950000047683716, "reward_std": 1.0160009860992432, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.371078372001648, "sampling/importance_sampling_ratio/mean": 0.9306595921516418, "sampling/importance_sampling_ratio/min": 0.1436803787946701, "sampling/sampling_logp_difference/max": 1.6921932697296143, "sampling/sampling_logp_difference/mean": 0.06238972768187523, "step": 457, "step_time": 10.153244150002138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31905546225607395, "epoch": 0.00458, "grad_norm": 4.7455170715693384e-05, "kl": 1.537596296519041, "learning_rate": 7.999934066468082e-06, "loss": 0.0, "step": 458, "step_time": 6.367441510999015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.675567265599966, "epoch": 0.00459, "frac_reward_zero_std": 1.0, "grad_norm": 6.736534123774618e-05, "kl": 1.7775992900133133, "learning_rate": 7.9999337536178e-06, "loss": 0.0, "num_tokens": 10828138.0, "reward": 2.731250047683716, "reward_std": 1.2885193824768066, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.9140326976776123, "sampling/importance_sampling_ratio/mean": 0.9304972887039185, "sampling/importance_sampling_ratio/min": 0.31738898158073425, "sampling/sampling_logp_difference/max": 0.7054009437561035, "sampling/sampling_logp_difference/mean": 0.08023737370967865, "step": 459, "step_time": 9.88763274899975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6782685145735741, "epoch": 0.0046, "grad_norm": 6.407535693142563e-05, "kl": 1.7728114277124405, "learning_rate": 7.999933440027056e-06, "loss": 0.0, "step": 460, "step_time": 5.971275758998672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.65625, "completions/mean_terminated_length": 2.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.009953811764717, "epoch": 0.00461, "frac_reward_zero_std": 1.0, "grad_norm": 0.00026355075533501804, "kl": 0.9404683746397495, "learning_rate": 7.999933125695849e-06, "loss": 0.0, "num_tokens": 10876467.0, "reward": 2.106250047683716, "reward_std": 1.1670026779174805, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 0.4825586974620819, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 2.7634549140930176, "sampling/importance_sampling_ratio/mean": 0.867573082447052, "sampling/importance_sampling_ratio/min": 0.36806216835975647, "sampling/sampling_logp_difference/max": 1.037782907485962, "sampling/sampling_logp_difference/mean": 0.11394397169351578, "step": 461, "step_time": 11.22426544300015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0309009402990341, "epoch": 0.00462, "grad_norm": 0.00029655409161932766, "kl": 0.9389306157827377, "learning_rate": 7.99993281062418e-06, "loss": 0.0, "step": 462, "step_time": 6.238317131001168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9029425978660583, "epoch": 0.00463, "frac_reward_zero_std": 1.0, "grad_norm": 3.5131532058585435e-05, "kl": 0.7519008315866813, "learning_rate": 7.999932494812047e-06, "loss": 0.0, "num_tokens": 10918254.0, "reward": 2.012500047683716, "reward_std": 1.2164862155914307, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 2.604414224624634, "sampling/importance_sampling_ratio/mean": 0.9013475775718689, "sampling/importance_sampling_ratio/min": 0.5035668611526489, "sampling/sampling_logp_difference/max": 0.8525474071502686, "sampling/sampling_logp_difference/mean": 0.09090504050254822, "step": 463, "step_time": 9.717574374998549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9116902351379395, "epoch": 0.00464, "grad_norm": 3.008692874573171e-05, "kl": 0.7550733370007947, "learning_rate": 7.999932178259451e-06, "loss": 0.0, "step": 464, "step_time": 6.374117108998689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7433590814471245, "epoch": 0.00465, "frac_reward_zero_std": 1.0, "grad_norm": 0.008980808779597282, "kl": 4.237264834344387, "learning_rate": 7.999931860966393e-06, "loss": 0.0001, "num_tokens": 10967792.0, "reward": 1.7000000476837158, "reward_std": 0.4399413466453552, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.75, "rewards/probe_completion_length/std": 0.4399413466453552, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.894777774810791, "sampling/importance_sampling_ratio/mean": 0.8646867871284485, "sampling/importance_sampling_ratio/min": 0.03391490504145622, "sampling/sampling_logp_difference/max": 3.4479689598083496, "sampling/sampling_logp_difference/mean": 0.24308747053146362, "step": 465, "step_time": 10.31518953199975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7749368958175182, "epoch": 0.00466, "grad_norm": 0.001255774055607617, "kl": 2.4550288692116737, "learning_rate": 7.999931542932872e-06, "loss": 0.0, "step": 466, "step_time": 6.806484487000489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4020432382822037, "epoch": 0.00467, "frac_reward_zero_std": 1.0, "grad_norm": 4.1622817661846057e-05, "kl": 0.062206802947912365, "learning_rate": 7.999931224158886e-06, "loss": 0.0, "num_tokens": 11014060.0, "reward": 1.4500000476837158, "reward_std": 1.436842441558838, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 1.436842441558838, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.131276249885559, "sampling/importance_sampling_ratio/mean": 0.7747564315795898, "sampling/importance_sampling_ratio/min": 0.3255142569541931, "sampling/sampling_logp_difference/max": 0.6411696076393127, "sampling/sampling_logp_difference/mean": 0.135621577501297, "step": 467, "step_time": 9.957253848999244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4081729501485825, "epoch": 0.00468, "grad_norm": 4.6337001549545676e-05, "kl": 0.06316643842728809, "learning_rate": 7.999930904644442e-06, "loss": 0.0, "step": 468, "step_time": 6.577253157999621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 37.96875, "completions/mean_terminated_length": 37.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1360742896795273, "epoch": 0.00469, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010231900523649529, "kl": 1.0736553617753088, "learning_rate": 7.999930584389531e-06, "loss": 0.0, "num_tokens": 11066179.0, "reward": 104.69999694824219, "reward_std": 579.4194946289062, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 104.25, "rewards/probe_completion_length/std": 579.5098876953125, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.8770849704742432, "sampling/importance_sampling_ratio/mean": 0.8900884389877319, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.822144031524658, "sampling/sampling_logp_difference/mean": 0.27112311124801636, "step": 469, "step_time": 23.543196358000387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1283305957913399, "epoch": 0.0047, "grad_norm": 0.00010479523916728795, "kl": 1.07857805211097, "learning_rate": 7.999930263394161e-06, "loss": 0.0, "step": 470, "step_time": 9.207572682000318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6140802390873432, "epoch": 0.00471, "frac_reward_zero_std": 1.0, "grad_norm": 8.397161582252011e-05, "kl": 1.2529713734984398, "learning_rate": 7.999929941658327e-06, "loss": 0.0, "num_tokens": 11108801.0, "reward": 2.762500047683716, "reward_std": 1.2556325197219849, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 2.2854928970336914, "sampling/importance_sampling_ratio/mean": 0.9871470928192139, "sampling/importance_sampling_ratio/min": 0.4138258695602417, "sampling/sampling_logp_difference/max": 0.8604846000671387, "sampling/sampling_logp_difference/mean": 0.08210620284080505, "step": 471, "step_time": 9.92915231400002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6049374490976334, "epoch": 0.00472, "grad_norm": 5.9758218412753195e-05, "kl": 1.2381772547960281, "learning_rate": 7.999929619182034e-06, "loss": 0.0, "step": 472, "step_time": 6.531484164000176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6399192288517952, "epoch": 0.00473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001346557110082358, "kl": 1.022419050335884, "learning_rate": 7.999929295965276e-06, "loss": 0.0, "num_tokens": 11154060.0, "reward": 2.731250047683716, "reward_std": 1.2885193824768066, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.9041831493377686, "sampling/importance_sampling_ratio/mean": 0.9148904085159302, "sampling/importance_sampling_ratio/min": 0.41519424319267273, "sampling/sampling_logp_difference/max": 0.7960150241851807, "sampling/sampling_logp_difference/mean": 0.09308066964149475, "step": 473, "step_time": 10.540310639998097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.646208006888628, "epoch": 0.00474, "grad_norm": 7.723711314611137e-05, "kl": 0.993750736117363, "learning_rate": 7.999928972008055e-06, "loss": 0.0, "step": 474, "step_time": 6.0534506069980125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9178853631019592, "epoch": 0.00475, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007165942224673927, "kl": 0.6427033228101209, "learning_rate": 7.999928647310374e-06, "loss": 0.0, "num_tokens": 11202464.0, "reward": 1.7937500476837158, "reward_std": 1.221035361289978, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 1.221035361289978, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1609777212142944, "sampling/importance_sampling_ratio/mean": 0.8023998737335205, "sampling/importance_sampling_ratio/min": 0.39279136061668396, "sampling/sampling_logp_difference/max": 0.8687930107116699, "sampling/sampling_logp_difference/mean": 0.12288682162761688, "step": 475, "step_time": 10.169633613001679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9118505492806435, "epoch": 0.00476, "grad_norm": 0.000408078107284382, "kl": 0.5350325474282727, "learning_rate": 7.99992832187223e-06, "loss": 0.0, "step": 476, "step_time": 6.567168232002587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.40625, "completions/mean_terminated_length": 2.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.200137197971344, "epoch": 0.00477, "frac_reward_zero_std": 1.0, "grad_norm": 7.701815047767013e-05, "kl": 0.7950491067022085, "learning_rate": 7.999927995693626e-06, "loss": 0.0, "num_tokens": 11251359.0, "reward": 1.3562500476837158, "reward_std": 0.498990923166275, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.40625, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.108764886856079, "sampling/importance_sampling_ratio/mean": 0.8880026340484619, "sampling/importance_sampling_ratio/min": 0.40861380100250244, "sampling/sampling_logp_difference/max": 0.7756710052490234, "sampling/sampling_logp_difference/mean": 0.142902672290802, "step": 477, "step_time": 10.48534435199872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1992550939321518, "epoch": 0.00478, "grad_norm": 7.540653314208612e-05, "kl": 0.7962087821215391, "learning_rate": 7.999927668774559e-06, "loss": 0.0, "step": 478, "step_time": 6.0412125680004465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5685002394020557, "epoch": 0.00479, "frac_reward_zero_std": 1.0, "grad_norm": 1.6812458852655254e-05, "kl": 0.6405358919873834, "learning_rate": 7.99992734111503e-06, "loss": 0.0, "num_tokens": 11296370.0, "reward": 3.231250047683716, "reward_std": 1.2759405374526978, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.0279954671859741, "sampling/importance_sampling_ratio/mean": 0.9103195667266846, "sampling/importance_sampling_ratio/min": 0.6296721696853638, "sampling/sampling_logp_difference/max": 0.38632071018218994, "sampling/sampling_logp_difference/mean": 0.04309441149234772, "step": 479, "step_time": 9.993394022998473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5688362512737513, "epoch": 0.0048, "grad_norm": 1.9768529455177486e-05, "kl": 0.6405658056028187, "learning_rate": 7.99992701271504e-06, "loss": 0.0, "step": 480, "step_time": 6.553862806000325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6740151382982731, "epoch": 0.00481, "frac_reward_zero_std": 0.75, "grad_norm": 0.0008734637522138655, "kl": 2.5446321358904243, "learning_rate": 7.99992668357459e-06, "loss": -0.0, "num_tokens": 11344807.0, "reward": 2.762500047683716, "reward_std": 1.1760376691818237, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.53125, "rewards/probe_shaping_dominance/std": 0.507007360458374, "rewards/probe_terminal_raw/mean": 0.46875, "rewards/probe_terminal_raw/std": 0.507007360458374, "rewards/rollout_reward_func/mean": -0.0625, "rewards/rollout_reward_func/std": 1.014014720916748, "sampling/importance_sampling_ratio/max": 2.2089757919311523, "sampling/importance_sampling_ratio/mean": 0.9335870146751404, "sampling/importance_sampling_ratio/min": 0.3754686415195465, "sampling/sampling_logp_difference/max": 0.992009162902832, "sampling/sampling_logp_difference/mean": 0.08749516308307648, "step": 481, "step_time": 10.437285636000524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6751360222697258, "epoch": 0.00482, "grad_norm": 0.0008649210212752223, "kl": 2.542562405578792, "learning_rate": 7.999926353693675e-06, "loss": -0.0, "step": 482, "step_time": 5.9687393949998295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.018196739256382, "epoch": 0.00483, "frac_reward_zero_std": 0.75, "grad_norm": 0.0012504520127549767, "kl": 1.1826329734176397, "learning_rate": 7.999926023072302e-06, "loss": -0.0, "num_tokens": 11392337.0, "reward": 1.9812500476837158, "reward_std": 1.1495966911315918, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 1.1086639165878296, "sampling/importance_sampling_ratio/mean": 0.8167065382003784, "sampling/importance_sampling_ratio/min": 0.43818601965904236, "sampling/sampling_logp_difference/max": 0.6695658564567566, "sampling/sampling_logp_difference/mean": 0.11686171591281891, "step": 483, "step_time": 10.05440507900039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0212653428316116, "epoch": 0.00484, "grad_norm": 0.0011736839078366756, "kl": 1.1902284994721413, "learning_rate": 7.999925691710467e-06, "loss": -0.0, "step": 484, "step_time": 6.5248691330007205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.71875, "completions/mean_terminated_length": 2.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.051191158592701, "epoch": 0.00485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008444490376859903, "kl": 1.4066885709762573, "learning_rate": 7.99992535960817e-06, "loss": 0.0, "num_tokens": 11444223.0, "reward": 1.6687500476837158, "reward_std": 0.45680341124534607, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.71875, "rewards/probe_completion_length/std": 0.45680341124534607, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.790355920791626, "sampling/importance_sampling_ratio/mean": 0.8630131483078003, "sampling/importance_sampling_ratio/min": 0.07368746399879456, "sampling/sampling_logp_difference/max": 1.5465178489685059, "sampling/sampling_logp_difference/mean": 0.1692320853471756, "step": 485, "step_time": 10.868371823001326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0457961112260818, "epoch": 0.00486, "grad_norm": 0.0008401769446209073, "kl": 1.312499899417162, "learning_rate": 7.999925026765412e-06, "loss": 0.0, "step": 486, "step_time": 6.268463713998244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7576271295547485, "epoch": 0.00487, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004383134946692735, "kl": 1.3531483765691519, "learning_rate": 7.999924693182194e-06, "loss": 0.0, "num_tokens": 11487384.0, "reward": 1.5750000476837158, "reward_std": 0.49186936020851135, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.625, "rewards/probe_completion_length/std": 0.49186936020851135, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2666070461273193, "sampling/importance_sampling_ratio/mean": 0.8539038896560669, "sampling/importance_sampling_ratio/min": 0.26787251234054565, "sampling/sampling_logp_difference/max": 1.2270400524139404, "sampling/sampling_logp_difference/mean": 0.10817140340805054, "step": 487, "step_time": 9.842391171998315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7594375871121883, "epoch": 0.00488, "grad_norm": 0.0006697545177303255, "kl": 1.3161140335723758, "learning_rate": 7.999924358858514e-06, "loss": 0.0, "step": 488, "step_time": 6.4828962940000565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.883793793618679, "epoch": 0.00489, "frac_reward_zero_std": 0.75, "grad_norm": 0.0034089318942278624, "kl": 0.3923273850377882, "learning_rate": 7.999924023794374e-06, "loss": -0.0001, "num_tokens": 11536908.0, "reward": 2.6687498092651367, "reward_std": 1.6310372352600098, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.03125, "rewards/probe_completion_length/std": 1.331610083580017, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.65625, "rewards/probe_shaping_dominance/std": 0.4825586974620819, "rewards/probe_terminal_raw/mean": 0.34375, "rewards/probe_terminal_raw/std": 0.4825586974620819, "rewards/rollout_reward_func/mean": -0.3125, "rewards/rollout_reward_func/std": 0.9651173949241638, "sampling/importance_sampling_ratio/max": 1.9263346195220947, "sampling/importance_sampling_ratio/mean": 0.8542444109916687, "sampling/importance_sampling_ratio/min": 0.4926546812057495, "sampling/sampling_logp_difference/max": 0.520141065120697, "sampling/sampling_logp_difference/mean": 0.09610345959663391, "step": 489, "step_time": 10.60969110699989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8811840042471886, "epoch": 0.0049, "grad_norm": 0.0034968475811183453, "kl": 0.3980398298444925, "learning_rate": 7.999923687989774e-06, "loss": -0.0001, "step": 490, "step_time": 6.079914465999536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2838631570339203, "epoch": 0.00491, "frac_reward_zero_std": 0.75, "grad_norm": 0.0006703120889142156, "kl": 2.198386996984482, "learning_rate": 7.999923351444713e-06, "loss": -0.0, "num_tokens": 11585701.0, "reward": 3.887500047683716, "reward_std": 0.3535533845424652, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.03125, "rewards/probe_shaping_dominance/std": 0.1767766922712326, "rewards/probe_terminal_raw/mean": 0.96875, "rewards/probe_terminal_raw/std": 0.1767766922712326, "rewards/rollout_reward_func/mean": 0.9375, "rewards/rollout_reward_func/std": 0.3535533845424652, "sampling/importance_sampling_ratio/max": 1.039068341255188, "sampling/importance_sampling_ratio/mean": 0.9455829858779907, "sampling/importance_sampling_ratio/min": 0.30147337913513184, "sampling/sampling_logp_difference/max": 1.0723538398742676, "sampling/sampling_logp_difference/mean": 0.033339060842990875, "step": 491, "step_time": 9.407341567999538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2829307820647955, "epoch": 0.00492, "grad_norm": 0.00041976518696174026, "kl": 2.2534179985523224, "learning_rate": 7.99992301415919e-06, "loss": -0.0, "step": 492, "step_time": 6.122673765999025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.59375, "completions/mean_terminated_length": 2.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9248806834220886, "epoch": 0.00493, "frac_reward_zero_std": 1.0, "grad_norm": 5.793804666609503e-05, "kl": 0.2539625926874578, "learning_rate": 7.999922676133208e-06, "loss": 0.0, "num_tokens": 11631151.0, "reward": 2.043750047683716, "reward_std": 1.201058030128479, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.59375, "rewards/probe_completion_length/std": 0.49899089336395264, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.1576757431030273, "sampling/importance_sampling_ratio/mean": 0.8414940237998962, "sampling/importance_sampling_ratio/min": 0.6457847356796265, "sampling/sampling_logp_difference/max": 0.37067556381225586, "sampling/sampling_logp_difference/mean": 0.0808270201086998, "step": 493, "step_time": 10.59047884300162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9139997959136963, "epoch": 0.00494, "grad_norm": 5.656076973536983e-05, "kl": 0.2586546838283539, "learning_rate": 7.999922337366765e-06, "loss": 0.0, "step": 494, "step_time": 6.090953175001232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5, "completions/mean_terminated_length": 2.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7931460365653038, "epoch": 0.00495, "frac_reward_zero_std": 1.0, "grad_norm": 6.027599738445133e-05, "kl": 1.1627589538693428, "learning_rate": 7.99992199785986e-06, "loss": 0.0, "num_tokens": 11676700.0, "reward": 2.450000047683716, "reward_std": 1.5240014791488647, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5, "rewards/probe_completion_length/std": 0.5080004930496216, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.3997493982315063, "sampling/importance_sampling_ratio/mean": 0.8892567753791809, "sampling/importance_sampling_ratio/min": 0.4850221574306488, "sampling/sampling_logp_difference/max": 0.6389734148979187, "sampling/sampling_logp_difference/mean": 0.07556334882974625, "step": 495, "step_time": 10.246895737998784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7806414198130369, "epoch": 0.00496, "grad_norm": 6.212398147908971e-05, "kl": 1.1609262004494667, "learning_rate": 7.999921657612498e-06, "loss": 0.0, "step": 496, "step_time": 6.424024951001229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1086363270878792, "epoch": 0.00497, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006601955974474549, "kl": 0.9634743640199304, "learning_rate": 7.999921316624673e-06, "loss": 0.0, "num_tokens": 11723406.0, "reward": 1.5125000476837158, "reward_std": 0.504016101360321, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6105371713638306, "sampling/importance_sampling_ratio/mean": 0.8632278442382812, "sampling/importance_sampling_ratio/min": 0.45380502939224243, "sampling/sampling_logp_difference/max": 0.7139332294464111, "sampling/sampling_logp_difference/mean": 0.11729858815670013, "step": 497, "step_time": 9.990106498999012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1224614009261131, "epoch": 0.00498, "grad_norm": 0.0005421527894213796, "kl": 0.9261851194314659, "learning_rate": 7.99992097489639e-06, "loss": 0.0, "step": 498, "step_time": 6.056653904000996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.22955272346735, "epoch": 0.00499, "frac_reward_zero_std": 0.75, "grad_norm": 0.0019598952494561672, "kl": 1.2435842752456665, "learning_rate": 7.999920632427647e-06, "loss": -0.0001, "num_tokens": 11774598.0, "reward": 2.043750047683716, "reward_std": 1.4888700246810913, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.65625, "rewards/probe_completion_length/std": 1.0957211256027222, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.78125, "rewards/probe_shaping_dominance/std": 0.420013427734375, "rewards/probe_terminal_raw/mean": 0.21875, "rewards/probe_terminal_raw/std": 0.420013427734375, "rewards/rollout_reward_func/mean": -0.5625, "rewards/rollout_reward_func/std": 0.84002685546875, "sampling/importance_sampling_ratio/max": 1.4074516296386719, "sampling/importance_sampling_ratio/mean": 0.8869490623474121, "sampling/importance_sampling_ratio/min": 0.37190303206443787, "sampling/sampling_logp_difference/max": 0.7668733596801758, "sampling/sampling_logp_difference/mean": 0.14819347858428955, "step": 499, "step_time": 10.845204309999644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.2178536430001259, "epoch": 0.005, "grad_norm": 0.0005243890336714685, "kl": 1.2198392376303673, "learning_rate": 7.999920289218444e-06, "loss": -0.0001, "step": 500, "step_time": 6.757824534000065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6885988488793373, "epoch": 0.00501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009279325022362173, "kl": 2.073035791516304, "learning_rate": 7.999919945268779e-06, "loss": 0.0, "num_tokens": 11821828.0, "reward": 2.793750047683716, "reward_std": 1.221035361289978, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.5924896001815796, "sampling/importance_sampling_ratio/mean": 0.7612407207489014, "sampling/importance_sampling_ratio/min": 0.2523704469203949, "sampling/sampling_logp_difference/max": 0.8371151685714722, "sampling/sampling_logp_difference/mean": 0.1486271321773529, "step": 501, "step_time": 10.10802043700005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6844572648406029, "epoch": 0.00502, "grad_norm": 0.0008749505504965782, "kl": 2.072124183177948, "learning_rate": 7.999919600578657e-06, "loss": 0.0, "step": 502, "step_time": 6.110820461997719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.5625, "completions/mean_terminated_length": 2.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9634811952710152, "epoch": 0.00503, "frac_reward_zero_std": 1.0, "grad_norm": 6.934843258932233e-05, "kl": 0.6088187689892948, "learning_rate": 7.999919255148074e-06, "loss": 0.0, "num_tokens": 11868393.0, "reward": 2.043750047683716, "reward_std": 1.201058030128479, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.5625, "rewards/probe_completion_length/std": 0.504016101360321, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.75, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.25, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": -0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 2.7559914588928223, "sampling/importance_sampling_ratio/mean": 0.9101375937461853, "sampling/importance_sampling_ratio/min": 0.4674367308616638, "sampling/sampling_logp_difference/max": 1.3814847469329834, "sampling/sampling_logp_difference/mean": 0.11712859570980072, "step": 503, "step_time": 10.467418278000878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9756656736135483, "epoch": 0.00504, "grad_norm": 6.629782728850842e-05, "kl": 0.6043889897409827, "learning_rate": 7.999918908977031e-06, "loss": 0.0, "step": 504, "step_time": 6.49657412599845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.75, "completions/mean_terminated_length": 2.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9048156104981899, "epoch": 0.00505, "frac_reward_zero_std": 1.0, "grad_norm": 9.591080743120983e-05, "kl": 0.7056189067661762, "learning_rate": 7.999918562065531e-06, "loss": 0.0, "num_tokens": 11914091.0, "reward": 2.731250047683716, "reward_std": 1.3133158683776855, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.4908435642719269, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.8559684753417969, "sampling/importance_sampling_ratio/mean": 0.8754900097846985, "sampling/importance_sampling_ratio/min": 0.41432133316993713, "sampling/sampling_logp_difference/max": 0.8816030025482178, "sampling/sampling_logp_difference/mean": 0.09118005633354187, "step": 505, "step_time": 10.203277341001012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9119976498186588, "epoch": 0.00506, "grad_norm": 6.370534538291395e-05, "kl": 0.6997845359146595, "learning_rate": 7.999918214413569e-06, "loss": 0.0, "step": 506, "step_time": 6.172720281999318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.3125, "completions/mean_terminated_length": 2.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5444683581590652, "epoch": 0.00507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025572467129677534, "kl": 1.402412122581154, "learning_rate": 7.999917866021148e-06, "loss": 0.0, "num_tokens": 11963464.0, "reward": 1.2625000476837158, "reward_std": 0.4709290862083435, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.3125, "rewards/probe_completion_length/std": 0.4709290862083435, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -1.0, "rewards/rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1553281545639038, "sampling/importance_sampling_ratio/mean": 0.7280164361000061, "sampling/importance_sampling_ratio/min": 0.22703957557678223, "sampling/sampling_logp_difference/max": 0.753678560256958, "sampling/sampling_logp_difference/mean": 0.1670120656490326, "step": 507, "step_time": 10.870288863999122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5414555668830872, "epoch": 0.00508, "grad_norm": 0.001770776347257197, "kl": 1.0420573372393847, "learning_rate": 7.999917516888269e-06, "loss": 0.0, "step": 508, "step_time": 6.611128889000611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5182851850986481, "epoch": 0.00509, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007590521709062159, "kl": 2.343875676393509, "learning_rate": 7.99991716701493e-06, "loss": 0.0, "num_tokens": 12009204.0, "reward": 2.887500047683716, "reward_std": 1.1053389310836792, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 2.407043695449829, "sampling/importance_sampling_ratio/mean": 0.9590823650360107, "sampling/importance_sampling_ratio/min": 0.3875659108161926, "sampling/sampling_logp_difference/max": 0.8325726985931396, "sampling/sampling_logp_difference/mean": 0.10490264743566513, "step": 509, "step_time": 9.980003214000135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5325130149722099, "epoch": 0.0051, "grad_norm": 0.0007388418889604509, "kl": 2.3545358180999756, "learning_rate": 7.999916816401132e-06, "loss": 0.0, "step": 510, "step_time": 6.473959976001424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.665569581091404, "epoch": 0.00511, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003944501222576946, "kl": 1.5593384206295013, "learning_rate": 7.999916465046877e-06, "loss": 0.0, "num_tokens": 12054761.0, "reward": 2.762500047683716, "reward_std": 1.2556325197219849, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.5, "rewards/probe_shaping_dominance/std": 0.5080004930496216, "rewards/probe_terminal_raw/mean": 0.5, "rewards/probe_terminal_raw/std": 0.5080004930496216, "rewards/rollout_reward_func/mean": 0.0, "rewards/rollout_reward_func/std": 1.0160009860992432, "sampling/importance_sampling_ratio/max": 1.1313704252243042, "sampling/importance_sampling_ratio/mean": 0.8322467803955078, "sampling/importance_sampling_ratio/min": 0.3396356999874115, "sampling/sampling_logp_difference/max": 0.9650154113769531, "sampling/sampling_logp_difference/mean": 0.10194703191518784, "step": 511, "step_time": 10.016052311998465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.665514025837183, "epoch": 0.00512, "grad_norm": 0.00044581855763681233, "kl": 1.5684444457292557, "learning_rate": 7.99991611295216e-06, "loss": 0.0, "step": 512, "step_time": 6.577485599000283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.40841441601514816, "epoch": 0.00513, "frac_reward_zero_std": 0.5, "grad_norm": 0.0037170210853219032, "kl": 1.0080515779554844, "learning_rate": 7.999915760116986e-06, "loss": -0.0001, "num_tokens": 12104074.0, "reward": 3.700000047683716, "reward_std": 0.6720215082168579, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.125, "rewards/probe_shaping_dominance/std": 0.33601075410842896, "rewards/probe_terminal_raw/mean": 0.875, "rewards/probe_terminal_raw/std": 0.33601075410842896, "rewards/rollout_reward_func/mean": 0.75, "rewards/rollout_reward_func/std": 0.6720215082168579, "sampling/importance_sampling_ratio/max": 1.135704517364502, "sampling/importance_sampling_ratio/mean": 0.9181493520736694, "sampling/importance_sampling_ratio/min": 0.3490389287471771, "sampling/sampling_logp_difference/max": 0.7234532833099365, "sampling/sampling_logp_difference/mean": 0.05021464079618454, "step": 513, "step_time": 12.078954928000712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.02083333395421505, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.40340831875801086, "epoch": 0.00514, "grad_norm": 0.0032272571697831154, "kl": 1.0150629207491875, "learning_rate": 7.999915406541353e-06, "loss": -0.0001, "step": 514, "step_time": 7.781343302000096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02500000037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02500000037252903, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4827618896961212, "epoch": 0.00515, "frac_reward_zero_std": 0.25, "grad_norm": 0.0024894806556403637, "kl": 1.4780778959393501, "learning_rate": 7.999915052225262e-06, "loss": -0.0, "num_tokens": 12162708.0, "reward": 3.075000047683716, "reward_std": 1.0998533964157104, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.40625, "rewards/probe_shaping_dominance/std": 0.49899089336395264, "rewards/probe_terminal_raw/mean": 0.59375, "rewards/probe_terminal_raw/std": 0.49899089336395264, "rewards/rollout_reward_func/mean": 0.1875, "rewards/rollout_reward_func/std": 0.9979817867279053, "sampling/importance_sampling_ratio/max": 1.5890978574752808, "sampling/importance_sampling_ratio/mean": 0.925313413143158, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.23642897605896, "sampling/sampling_logp_difference/mean": 0.07910196483135223, "step": 515, "step_time": 13.655495725999572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02500000037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02500000037252903, "entropy": 0.46802522242069244, "epoch": 0.00516, "grad_norm": 0.002520441310480237, "kl": 1.4883594401180744, "learning_rate": 7.999914697168712e-06, "loss": -0.0, "step": 516, "step_time": 8.033845836000182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.40684185177087784, "epoch": 0.00517, "frac_reward_zero_std": 0.5, "grad_norm": 0.0035400697961449623, "kl": 1.5064535737037659, "learning_rate": 7.999914341371702e-06, "loss": -0.0001, "num_tokens": 12215507.0, "reward": 3.606250047683716, "reward_std": 0.8273305296897888, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.15625, "rewards/probe_shaping_dominance/std": 0.3689020276069641, "rewards/probe_terminal_raw/mean": 0.84375, "rewards/probe_terminal_raw/std": 0.3689020276069641, "rewards/rollout_reward_func/mean": 0.6875, "rewards/rollout_reward_func/std": 0.7378040552139282, "sampling/importance_sampling_ratio/max": 1.0795238018035889, "sampling/importance_sampling_ratio/mean": 0.8931475877761841, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2370891571044922, "sampling/sampling_logp_difference/mean": 0.054866012185811996, "step": 517, "step_time": 14.256954984998629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4029547944664955, "epoch": 0.00518, "grad_norm": 0.0032185025047510862, "kl": 1.5003392770886421, "learning_rate": 7.999913984834237e-06, "loss": -0.0001, "step": 518, "step_time": 8.012493073000769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5593776553869247, "epoch": 0.00519, "frac_reward_zero_std": 0.0, "grad_norm": 0.008156136609613895, "kl": 1.942481990903616, "learning_rate": 7.99991362755631e-06, "loss": -0.0002, "num_tokens": 12276740.0, "reward": 3.106250047683716, "reward_std": 1.050633430480957, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.40625, "rewards/probe_shaping_dominance/std": 0.49899089336395264, "rewards/probe_terminal_raw/mean": 0.59375, "rewards/probe_terminal_raw/std": 0.49899089336395264, "rewards/rollout_reward_func/mean": 0.1875, "rewards/rollout_reward_func/std": 0.9979817867279053, "sampling/importance_sampling_ratio/max": 2.931607961654663, "sampling/importance_sampling_ratio/mean": 0.8578198552131653, "sampling/importance_sampling_ratio/min": 0.10247347503900528, "sampling/sampling_logp_difference/max": 2.1812801361083984, "sampling/sampling_logp_difference/mean": 0.15371261537075043, "step": 519, "step_time": 13.328113587000189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03645833395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833395421505, "entropy": 0.5542311668395996, "epoch": 0.0052, "grad_norm": 0.008781365118920803, "kl": 3.49315713532269, "learning_rate": 7.999913269537927e-06, "loss": -0.0002, "step": 520, "step_time": 7.888086324001051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3.15625, "completions/mean_terminated_length": 3.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3672683946788311, "epoch": 0.00521, "frac_reward_zero_std": 0.25, "grad_norm": 0.006466452963650227, "kl": 3.0226640701293945, "learning_rate": 7.999912910779086e-06, "loss": 0.0, "num_tokens": 12334694.0, "reward": 3.6999998092651367, "reward_std": 1.2443419694900513, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.21875, "rewards/probe_completion_length/std": 1.2374368906021118, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.25, "rewards/probe_shaping_dominance/std": 0.4399413466453552, "rewards/probe_terminal_raw/mean": 0.75, "rewards/probe_terminal_raw/std": 0.4399413466453552, "rewards/rollout_reward_func/mean": 0.5, "rewards/rollout_reward_func/std": 0.8798826932907104, "sampling/importance_sampling_ratio/max": 1.2187602519989014, "sampling/importance_sampling_ratio/mean": 0.9314761161804199, "sampling/importance_sampling_ratio/min": 0.20403584837913513, "sampling/sampling_logp_difference/max": 1.4795587062835693, "sampling/sampling_logp_difference/mean": 0.05452505126595497, "step": 521, "step_time": 13.404775265998978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36731596663594246, "epoch": 0.00522, "grad_norm": 0.003731419797986746, "kl": 2.32449933886528, "learning_rate": 7.999912551279787e-06, "loss": 0.0, "step": 522, "step_time": 7.9222454560003825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2391245774924755, "epoch": 0.00523, "frac_reward_zero_std": 0.5, "grad_norm": 0.0008970812777988613, "kl": 2.0755921453237534, "learning_rate": 7.99991219104003e-06, "loss": 0.0001, "num_tokens": 12387223.0, "reward": 3.575000047683716, "reward_std": 0.7931155562400818, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.1875, "rewards/probe_shaping_dominance/std": 0.3965577781200409, "rewards/probe_terminal_raw/mean": 0.8125, "rewards/probe_terminal_raw/std": 0.3965577781200409, "rewards/rollout_reward_func/mean": 0.625, "rewards/rollout_reward_func/std": 0.7931155562400818, "sampling/importance_sampling_ratio/max": 1.8916287422180176, "sampling/importance_sampling_ratio/mean": 1.0045948028564453, "sampling/importance_sampling_ratio/min": 0.9068307280540466, "sampling/sampling_logp_difference/max": 0.6775636672973633, "sampling/sampling_logp_difference/mean": 0.026775330305099487, "step": 523, "step_time": 12.776815157999408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23557228222489357, "epoch": 0.00524, "grad_norm": 0.0009381213458254933, "kl": 2.076133668422699, "learning_rate": 7.999911830059816e-06, "loss": 0.0001, "step": 524, "step_time": 8.225136326999746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3.09375, "completions/mean_terminated_length": 3.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4644627757370472, "epoch": 0.00525, "frac_reward_zero_std": 0.25, "grad_norm": 0.005579262040555477, "kl": 2.1972502768039703, "learning_rate": 7.999911468339143e-06, "loss": 0.0001, "num_tokens": 12440743.0, "reward": 3.4812498092651367, "reward_std": 1.43649160861969, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.15625, "rewards/probe_completion_length/std": 1.2727763652801514, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.3125, "rewards/probe_shaping_dominance/std": 0.4709290862083435, "rewards/probe_terminal_raw/mean": 0.6875, "rewards/probe_terminal_raw/std": 0.4709290862083435, "rewards/rollout_reward_func/mean": 0.375, "rewards/rollout_reward_func/std": 0.941858172416687, "sampling/importance_sampling_ratio/max": 2.116163969039917, "sampling/importance_sampling_ratio/mean": 0.9502134323120117, "sampling/importance_sampling_ratio/min": 0.1773698925971985, "sampling/sampling_logp_difference/max": 1.583463191986084, "sampling/sampling_logp_difference/mean": 0.08440504968166351, "step": 525, "step_time": 12.90298909700141 } ], "logging_steps": 1.0, "max_steps": 200000, "num_input_tokens_seen": 12440743, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }